about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- search/.gitignore 4
-rw-r--r-- search/index.go 18
2 files changed, 17 insertions, 5 deletions
diff --git a/search/.gitignore b/search/.gitignore
new file mode 100644
index 0000000..81a686a
--- /dev/null
+++ b/search/.gitignore
@@ -0,0 +1,4 @@
+*test*
+pages
+index
+search
diff --git a/search/index.go b/search/index.go
index 0d60b8a..9b1bc0c 100644
--- a/search/index.go
+++ b/search/index.go
@@ -9,6 +9,7 @@ import "github.com/kennygrant/sanitize"
import "strings"
import "flag"
import "errors"
+import "regexp"
type index struct {
doc string;
@@ -21,6 +22,8 @@ type document struct {
text []string;
}
+var r *regexp.Regexp; // matches a literal "><"; assigned in init() and used by parseDoc
+
func newDocument() *document {
return &document{nil, nil};
}
@@ -57,6 +60,11 @@ func RemoveTag(doc *goquery.Selection, tag string) {
});
}
+func logReg(h []byte) []byte { // logs h with a "RegExp: " prefix and returns it unchanged; no caller is visible in this diff
+ log.Printf("RegExp: %s", h);
+ return h;
+}
+
func parseDoc(fd *os.File) (*document, error) {
var err error;
var text, t_text string;
@@ -70,7 +78,6 @@ func parseDoc(fd *os.File) (*document, error) {
return nil, errors.New("Can't create goquery documnt");
}
- //TODO test kennygrant/sanatize instead of goquery
body = doc.Find("body");
RemoveTag(body, "script");
RemoveTag(body, "noscript");
@@ -81,20 +88,21 @@ func parseDoc(fd *os.File) (*document, error) {
text, err = body.Html();
t_text, err = title.Html();
+
+ text = r.ReplaceAllString(text, "> <"); // put a space between adjacent tags so the later strings.Fields split sees a gap
+ t_text = r.ReplaceAllString(t_text, "> <"); // operate on the title markup; the original passed text here, replacing the title with the body
+
r_doc = newDocument();
r_doc.text = strings.Fields(sanitize.HTML(text));
r_doc.title = strings.Fields(sanitize.HTML(t_text));
- if len(r_doc.text) == 1 {
- log.Printf("not splittin!!!!!!!!!!!\n");
- os.Exit(1);
- }
return r_doc, nil;
}
func init() {
log.SetOutput(os.Stderr);
+ r = regexp.MustCompile("><"); // constant pattern: MustCompile replaces Compile with a silently discarded error
}
func main() {