From a0eccf37c6538ac4ef64be22e4510a6255203525 Mon Sep 17 00:00:00 2001 From: Tucker Evans Date: Thu, 7 Dec 2017 15:51:37 -0500 Subject: CSC2621/assignments/search: Added function to remove certain nodes from html -Now script and noscript tags are removed from parseDoc() output. -Still an issue with missing spaces in output --- search/index.go | 86 ++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 61 insertions(+), 25 deletions(-) diff --git a/search/index.go b/search/index.go index 7f202fb..0d60b8a 100644 --- a/search/index.go +++ b/search/index.go @@ -1,10 +1,11 @@ package main - import "os" +import "golang.org/x/net/html" import "log" import "fmt" import "github.com/PuerkitoBio/goquery" +import "github.com/kennygrant/sanitize" import "strings" import "flag" import "errors" @@ -15,41 +16,76 @@ type index struct { freq int; } -type document struct{ +type document struct { title []string; text []string; } - -func newDocument() (*document) { +func newDocument() *document { return &document{nil, nil}; } +func RemoveNode(r, rn *html.Node) { + var found bool; + var n, item *html.Node; + var nodes map[int]*html.Node; + var i, j int; + + found = false; + nodes = make(map[int]*html.Node); + + for n = r.FirstChild; n != nil; n = n.NextSibling { + if n == rn { + found = true; + n.Parent.RemoveChild(n); + } + + nodes[i] = n; + i++; + } + + if !found { + for j = 0; j < i; j++ { + item = nodes[j]; + RemoveNode(item, rn); + } + } +} +func RemoveTag(doc *goquery.Selection, tag string) { + doc.Find(tag).Each(func(i int, s *goquery.Selection) { + RemoveNode(doc.Get(0), s.Get(0)); + }); +} + func parseDoc(fd *os.File) (*document, error) { var err error; var text, t_text string; - var doc *goquery.Document; + var doc *goquery.Document; var body, title *goquery.Selection; var r_doc *document; - + doc, err = goquery.NewDocumentFromReader(fd); - if (err != nil) { + if err != nil { log.Printf("goquery error: %s\n", err); return nil, errors.New("Can't create 
goquery documnt"); } - + //TODO test kennygrant/sanatize instead of goquery - body = doc.Find("body").Not("style").Not("script"); + body = doc.Find("body"); + RemoveTag(body, "script"); + RemoveTag(body, "noscript"); + title = doc.Find("title"); - - text = body.Text(); - t_text = title.Text(); + + //TODO add error detection + text, err = body.Html(); + t_text, err = title.Html(); r_doc = newDocument(); - r_doc.text = strings.Fields(text); - r_doc.title = strings.Fields(t_text); - if (len(r_doc.text) == 1) { + r_doc.text = strings.Fields(sanitize.HTML(text)); + r_doc.title = strings.Fields(sanitize.HTML(t_text)); + if len(r_doc.text) == 1 { log.Printf("not splittin!!!!!!!!!!!\n"); os.Exit(1); } @@ -62,8 +98,8 @@ func init() { } func main() { -// var words map[string]index - var p_dir string//, fname string; + // var words map[string]index + var p_dir string //, fname string; var err error; var i int; @@ -77,34 +113,34 @@ func main() { flag.StringVar(&p_dir, "d", "./pages", "pages directory"); flag.Parse(); - + dir, err = os.Open(p_dir); - if (err != nil) { + if err != nil { log.Printf("Error accessing \"%s\":\t%s\n", p_dir, err); os.Exit(1); } dir_info, err = dir.Stat(); dir_mode = dir_info.Mode(); - - if (!dir_mode.IsDir()) { + + if !dir_mode.IsDir() { log.Printf("\"%s\" is not a directory\n", p_dir); os.Exit(1); } files, err = dir.Readdir(0); - if (err != nil) { + if err != nil { log.Printf("Error reading %s\n", p_dir); os.Exit(1); } - for i=0; i < len(files) && i < 1; i++ { + for i = 0; i < len(files) && i < 1; i++ { fd, err = os.Open(fmt.Sprintf("%s/%s", dir_info.Name(), files[i].Name())); - if (err != nil) { + if err != nil { log.Printf("Error reading %s/%s\n", dir_info.Name(), files[i].Name()); } else { doc, err = parseDoc(fd); - if (err != nil) { + if err != nil { log.Printf("Error parsing %s/%s\n", dir_info.Name(), files[i].Name()); } else { fmt.Println(doc.text); -- cgit v1.1