aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTucker Evans <tuckerevans24@gmail.com>2017-12-07 15:51:37 -0500
committerTucker Evans <tuckerevans24@gmail.com>2017-12-17 13:20:31 -0500
commita0eccf37c6538ac4ef64be22e4510a6255203525 (patch)
tree62d99eef8be880313426989d802cfaf655e693c7
parent1a2153af95331cd77803d40633f581e1f2b6507e (diff)
CSC2621/assignments/search: Added function to remove certain nodes from HTML
- script and noscript tags are now removed from parseDoc() output. - There is still an issue with missing spaces in the output.
-rw-r--r--search/index.go86
1 files changed, 61 insertions, 25 deletions
diff --git a/search/index.go b/search/index.go
index 7f202fb..0d60b8a 100644
--- a/search/index.go
+++ b/search/index.go
@@ -1,10 +1,11 @@
package main
-
import "os"
+import "golang.org/x/net/html"
import "log"
import "fmt"
import "github.com/PuerkitoBio/goquery"
+import "github.com/kennygrant/sanitize"
import "strings"
import "flag"
import "errors"
@@ -15,41 +16,76 @@ type index struct {
freq int;
}
-type document struct{
+type document struct {
title []string;
text []string;
}
-
-func newDocument() (*document) {
+func newDocument() *document {
return &document{nil, nil};
}
+func RemoveNode(r, rn *html.Node) {
+ var found bool;
+ var n, item *html.Node;
+ var nodes map[int]*html.Node;
+ var i, j int;
+
+ found = false;
+ nodes = make(map[int]*html.Node);
+
+ for n = r.FirstChild; n != nil; n = n.NextSibling {
+ if n == rn {
+ found = true;
+ n.Parent.RemoveChild(n);
+ }
+
+ nodes[i] = n;
+ i++;
+ }
+
+ if !found {
+ for j = 0; j < i; j++ {
+ item = nodes[j];
+ RemoveNode(item, rn);
+ }
+ }
+}
+func RemoveTag(doc *goquery.Selection, tag string) {
+ doc.Find(tag).Each(func(i int, s *goquery.Selection) {
+ RemoveNode(doc.Get(0), s.Get(0));
+ });
+}
+
func parseDoc(fd *os.File) (*document, error) {
var err error;
var text, t_text string;
- var doc *goquery.Document;
+ var doc *goquery.Document;
var body, title *goquery.Selection;
var r_doc *document;
-
+
doc, err = goquery.NewDocumentFromReader(fd);
- if (err != nil) {
+ if err != nil {
log.Printf("goquery error: %s\n", err);
return nil, errors.New("Can't create goquery documnt");
}
-
+
//TODO test kennygrant/sanatize instead of goquery
- body = doc.Find("body").Not("style").Not("script");
+ body = doc.Find("body");
+ RemoveTag(body, "script");
+ RemoveTag(body, "noscript");
+
title = doc.Find("title");
-
- text = body.Text();
- t_text = title.Text();
+
+ //TODO add error detection
+ text, err = body.Html();
+ t_text, err = title.Html();
r_doc = newDocument();
- r_doc.text = strings.Fields(text);
- r_doc.title = strings.Fields(t_text);
- if (len(r_doc.text) == 1) {
+ r_doc.text = strings.Fields(sanitize.HTML(text));
+ r_doc.title = strings.Fields(sanitize.HTML(t_text));
+ if len(r_doc.text) == 1 {
log.Printf("not splittin!!!!!!!!!!!\n");
os.Exit(1);
}
@@ -62,8 +98,8 @@ func init() {
}
func main() {
-// var words map[string]index
- var p_dir string//, fname string;
+ // var words map[string]index
+ var p_dir string //, fname string;
var err error;
var i int;
@@ -77,34 +113,34 @@ func main() {
flag.StringVar(&p_dir, "d", "./pages", "pages directory");
flag.Parse();
-
+
dir, err = os.Open(p_dir);
- if (err != nil) {
+ if err != nil {
log.Printf("Error accessing \"%s\":\t%s\n", p_dir, err);
os.Exit(1);
}
dir_info, err = dir.Stat();
dir_mode = dir_info.Mode();
-
- if (!dir_mode.IsDir()) {
+
+ if !dir_mode.IsDir() {
log.Printf("\"%s\" is not a directory\n", p_dir);
os.Exit(1);
}
files, err = dir.Readdir(0);
- if (err != nil) {
+ if err != nil {
log.Printf("Error reading %s\n", p_dir);
os.Exit(1);
}
- for i=0; i < len(files) && i < 1; i++ {
+ for i = 0; i < len(files) && i < 1; i++ {
fd, err = os.Open(fmt.Sprintf("%s/%s", dir_info.Name(), files[i].Name()));
- if (err != nil) {
+ if err != nil {
log.Printf("Error reading %s/%s\n", dir_info.Name(), files[i].Name());
} else {
doc, err = parseDoc(fd);
- if (err != nil) {
+ if err != nil {
log.Printf("Error parsing %s/%s\n", dir_info.Name(), files[i].Name());
} else {
fmt.Println(doc.text);