From a0eccf37c6538ac4ef64be22e4510a6255203525 Mon Sep 17 00:00:00 2001
From: Tucker Evans <tuckerevans24@gmail.com>
Date: Thu, 7 Dec 2017 15:51:37 -0500
Subject: CSC2621/assignments/search: Added function to remove certain nodes
 from html

-Now script and noscript tags are removed from parseDoc() output.
-Still an issue with missing spaces in output
---
 search/index.go | 86 ++++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 61 insertions(+), 25 deletions(-)

(limited to 'search')

diff --git a/search/index.go b/search/index.go
index 7f202fb..0d60b8a 100644
--- a/search/index.go
+++ b/search/index.go
@@ -1,10 +1,11 @@
 package main
 
-
 import "os"
+import "golang.org/x/net/html"
 import "log"
 import "fmt"
 import "github.com/PuerkitoBio/goquery"
+import "github.com/kennygrant/sanitize"
 import "strings"
 import "flag"
 import "errors"
@@ -15,41 +16,76 @@ type index struct {
 	freq int;
 }
 
-type document struct{
+type document struct {
 	title []string;
 	text []string;
 }
 
-
-func newDocument() (*document) {
+func newDocument() *document {
 	return &document{nil, nil};
 }
 
+func RemoveNode(r, rn *html.Node) { // RemoveNode detaches rn from the tree under r: it checks r's direct children first, then recurses into each child's subtree.
+	var found bool; // set once rn is seen among r's direct children
+	var n, item *html.Node;
+	var nodes map[int]*html.Node; // r's children in visit order, keyed by position
+	var i, j int; // i counts recorded children; both start at the zero value
+
+	found = false;
+	nodes = make(map[int]*html.Node);
+
+	for n = r.FirstChild; n != nil; n = n.NextSibling { // walks r's direct children only
+		if n == rn {
+			found = true;
+			n.Parent.RemoveChild(n); // NOTE(review): the loop next reads n.NextSibling of the removed node; confirm RemoveChild's effect on sibling links
+		}
+
+		nodes[i] = n; // recorded even when n is rn; the map is only read when rn was not found
+		i++;
+	}
+
+	if !found { // rn is not a direct child of r, so search below each recorded child
+		for j = 0; j < i; j++ { // no early exit: every child is visited even if a deeper call already removed rn
+			item = nodes[j];
+			RemoveNode(item, rn);
+		}
+	}
+}
+func RemoveTag(doc *goquery.Selection, tag string) { // RemoveTag calls RemoveNode once for each selection that doc.Find(tag) yields.
+	doc.Find(tag).Each(func(i int, s *goquery.Selection) {
+		RemoveNode(doc.Get(0), s.Get(0)); // search root is doc's node at index 0; target is the match's node at index 0
+	});
+}
+
 func parseDoc(fd *os.File) (*document, error) {
 	var err error;
 	var text, t_text string;
-	var doc *goquery.Document; 
+	var doc *goquery.Document;
 	var body, title *goquery.Selection;
 	var r_doc *document;
-	
+
 	doc, err = goquery.NewDocumentFromReader(fd);
-	if (err != nil) {
+	if err != nil {
 		log.Printf("goquery error: %s\n", err);
 		return nil, errors.New("Can't create goquery documnt");
 	}
-	
+
 	//TODO test kennygrant/sanatize instead of goquery
-	body = doc.Find("body").Not("style").Not("script");
+	body = doc.Find("body");
+	RemoveTag(body, "script");
+	RemoveTag(body, "noscript");
+
 	title = doc.Find("title");
-	
-	text = body.Text();
-	t_text = title.Text();
+
+	//TODO add error detection
+	text, err = body.Html();
+	t_text, err = title.Html();
 
 	r_doc = newDocument();
 
-	r_doc.text = strings.Fields(text);
-	r_doc.title = strings.Fields(t_text);
-	if (len(r_doc.text) == 1) {
+	r_doc.text = strings.Fields(sanitize.HTML(text));
+	r_doc.title = strings.Fields(sanitize.HTML(t_text));
+	if len(r_doc.text) == 1 {
 		log.Printf("not splittin!!!!!!!!!!!\n");
 		os.Exit(1);
 	}
@@ -62,8 +98,8 @@ func init() {
 }
 
 func main() {
-//	var words map[string]index
-	var p_dir string//, fname string;
+	//	var words map[string]index
+	var p_dir string //, fname string;
 	var err error;
 	var i int;
 
@@ -77,34 +113,34 @@ func main() {
 	flag.StringVar(&p_dir, "d", "./pages", "pages directory");
 
 	flag.Parse();
-	
+
 	dir, err = os.Open(p_dir);
-	if (err != nil) {
+	if err != nil {
 		log.Printf("Error accessing \"%s\":\t%s\n", p_dir, err);
 		os.Exit(1);
 	}
 
 	dir_info, err = dir.Stat();
 	dir_mode = dir_info.Mode();
-	
-	if (!dir_mode.IsDir()) {
+
+	if !dir_mode.IsDir() {
 		log.Printf("\"%s\" is not a directory\n", p_dir);
 		os.Exit(1);
 	}
 
 	files, err = dir.Readdir(0);
-	if (err != nil) {
+	if err != nil {
 		log.Printf("Error reading %s\n", p_dir);
 		os.Exit(1);
 	}
 
-	for i=0; i < len(files) && i < 1; i++ {
+	for i = 0; i < len(files) && i < 1; i++ {
 		fd, err = os.Open(fmt.Sprintf("%s/%s", dir_info.Name(), files[i].Name()));
-		if (err != nil) {
+		if err != nil {
 			log.Printf("Error reading %s/%s\n", dir_info.Name(), files[i].Name());
 		} else {
 			doc, err = parseDoc(fd);
-			if (err != nil) {
+			if err != nil {
 				log.Printf("Error parsing %s/%s\n", dir_info.Name(), files[i].Name());
 			} else {
 				fmt.Println(doc.text);
-- 
cgit v1.1