From 204113558a3d2764fbdc4f8643d63ad5e6c330e1 Mon Sep 17 00:00:00 2001
From: Tucker Evans
Date: Sat, 9 Dec 2017 16:17:41 -0500
Subject: CSC2621/assignments/search: Fixed parseDoc func; Added .gitignore

---
Review notes (not part of the original commit):
  * parseDoc: the title is now normalized from t_text; the original added
    line ran the regexp over the body text and overwrote the title with it.
  * init: regexp.MustCompile replaces a regexp.Compile call whose error
    was discarded.
  * Line breaks were restored from a whitespace-collapsed copy, so blank
    context lines and indentation are a best-effort reconstruction, and the
    post-image blob id on the index line predates these corrections.

 search/.gitignore |  4 ++++
 search/index.go   | 18 +++++++++++++-----
 2 files changed, 17 insertions(+), 5 deletions(-)
 create mode 100644 search/.gitignore

diff --git a/search/.gitignore b/search/.gitignore
new file mode 100644
index 0000000..81a686a
--- /dev/null
+++ b/search/.gitignore
@@ -0,0 +1,4 @@
+*test*
+pages
+index
+search
diff --git a/search/index.go b/search/index.go
index 0d60b8a..9b1bc0c 100644
--- a/search/index.go
+++ b/search/index.go
@@ -9,6 +9,7 @@ import "github.com/kennygrant/sanitize"
 import "strings"
 import "flag"
 import "errors"
+import "regexp"
 
 type index struct {
 	doc string;
@@ -21,6 +22,8 @@ type document struct {
 	text []string;
 }
 
+var r *regexp.Regexp;
+
 func newDocument() *document {
 	return &document{nil, nil};
 }
@@ -57,6 +60,11 @@ func RemoveTag(doc *goquery.Selection, tag string) {
 	});
 }
 
+func logReg(h []byte) []byte {
+	log.Printf("RegExp: %s", h);
+	return h;
+}
+
 func parseDoc(fd *os.File) (*document, error) {
 	var err error;
 	var text, t_text string;
@@ -70,7 +78,6 @@ func parseDoc(fd *os.File) (*document, error) {
 		return nil, errors.New("Can't create goquery documnt");
 	}
 
-	//TODO test kennygrant/sanatize instead of goquery
 	body = doc.Find("body");
 	RemoveTag(body, "script");
 	RemoveTag(body, "noscript");
@@ -81,20 +88,21 @@ func parseDoc(fd *os.File) (*document, error) {
 
 	text, err = body.Html();
 	t_text, err = title.Html();
+
+	text = r.ReplaceAllString(text, "> <");
+	t_text = r.ReplaceAllString(t_text, "> <");
+
 	r_doc = newDocument();
 	r_doc.text = strings.Fields(sanitize.HTML(text));
 	r_doc.title = strings.Fields(sanitize.HTML(t_text));
 
-	if len(r_doc.text) == 1 {
-		log.Printf("not splittin!!!!!!!!!!!\n");
-		os.Exit(1);
-	}
 
 	return r_doc, nil;
 }
 
 func init() {
 	log.SetOutput(os.Stderr);
+	r = regexp.MustCompile("><");
 }
 
 func main() {
-- 
cgit v1.1