aboutsummaryrefslogtreecommitdiff
path: root/CSC2636/search/indexer.go
diff options
context:
space:
mode:
Diffstat (limited to 'CSC2636/search/indexer.go')
-rw-r--r--CSC2636/search/indexer.go402
1 files changed, 402 insertions, 0 deletions
diff --git a/CSC2636/search/indexer.go b/CSC2636/search/indexer.go
new file mode 100644
index 0000000..d95f126
--- /dev/null
+++ b/CSC2636/search/indexer.go
@@ -0,0 +1,402 @@
+package main
+
+import "os"
+import "sort"
+import "golang.org/x/net/html"
+import "log"
+import "fmt"
+import "github.com/PuerkitoBio/goquery"
+import "github.com/kennygrant/sanitize"
+import "strings"
+import "flag"
+import "errors"
+import "regexp"
+
+type document struct {
+ fname string;
+ title []string;
+ text []string;
+ length int;
+}
+
+type index struct {
+ doc *document;
+ title bool;
+ freq int;
+}
+
+type wordSort struct {
+ w string;
+ root *wordList;
+}
+
+type wordList struct {
+ this *index
+ next *wordList
+}
+
+var r, nonAN *regexp.Regexp;
+var stopWords []*regexp.Regexp;
+
+
+func newDocument() *document {
+ return &document{"" , nil, nil, 0};
+}
+
+func RemoveNode(r, rn *html.Node) {
+ var found bool;
+ var n, item *html.Node;
+ var nodes map[int]*html.Node;
+ var i, j int;
+
+ found = false;
+ nodes = make(map[int]*html.Node);
+
+ for n = r.FirstChild; n != nil; n = n.NextSibling {
+ if n == rn {
+ found = true;
+ n.Parent.RemoveChild(n);
+ }
+
+ nodes[i] = n;
+ i++;
+ }
+
+ if !found {
+ for j = 0; j < i; j++ {
+ item = nodes[j];
+ RemoveNode(item, rn);
+ }
+ }
+}
+func RemoveTag(doc *goquery.Selection, tag string) {
+ doc.Find(tag).Each(func(i int, s *goquery.Selection) {
+ RemoveNode(doc.Get(0), s.Get(0));
+ });
+}
+
+func logReg(h []byte) []byte {
+ log.Printf("RegExp: %s", h);
+ return h;
+}
+
+func parseDoc(fd *os.File, f_info os.FileInfo) (*document, error) {
+ var err error;
+ var text, t_text string;
+ var doc *goquery.Document;
+ var body, title *goquery.Selection;
+ var r_doc *document;
+ var i int;
+
+ doc, err = goquery.NewDocumentFromReader(fd);
+ if err != nil {
+ log.Printf("goquery error: %s\n", err);
+ return nil, errors.New("Can't create goquery documnt");
+ }
+
+ body = doc.Find("body");
+ RemoveTag(body, "script");
+ RemoveTag(body, "noscript");
+
+ title = doc.Find("title");
+
+ //TODO add error detection
+ text, err = body.Html();
+ t_text, err = title.Html();
+
+
+ text = r.ReplaceAllString(text, "> <");
+ t_text = r.ReplaceAllString(t_text, "> <");
+
+ text = sanitize.HTML(text);
+ t_text = sanitize.HTML(t_text);
+
+ text = strings.ToLower(text);
+ t_text = strings.ToLower(t_text);
+
+ text = nonAN.ReplaceAllString(text, " ");
+ t_text = nonAN.ReplaceAllString(t_text, " ");
+
+
+ for i = 0; i < len(stopWords); i++ {
+ text = stopWords[i].ReplaceAllString(text, " ");
+ t_text = stopWords[i].ReplaceAllString(t_text, " ");
+ }
+ r_doc = newDocument();
+
+ r_doc.fname = f_info.Name();
+ r_doc.text = strings.Fields(text);
+ r_doc.title = strings.Fields(t_text);
+ r_doc.length = len(r_doc.text) + len(r_doc.title);
+
+ return r_doc, nil;
+}
+func boolToInt(t bool) int {
+ if t {
+ return 1;
+ }
+ return 0;
+}
+
+func printIndex(words []wordSort, fd *os.File) {
+ var i int;
+ var cur *wordList;
+ var fname string;
+ var t int;
+ var freq float64;
+
+ for i = 0; i < len(words); i++ {
+ fmt.Fprintf(fd, "%s\n", words[i].w);
+ for cur = words[i].root; cur != nil; cur = cur.next {
+ fname = cur.this.doc.fname;
+ t = boolToInt(cur.this.title);
+ freq = float64(cur.this.freq) / float64(cur.this.doc.length);
+
+ fmt.Fprintf(fd,"\t%s %d %.3f\n", fname, t, freq);
+ }
+ }
+}
+
+func init() {
+ var err error;
+ log.SetOutput(os.Stderr);
+ r, err = regexp.Compile("><");
+ if err != nil {
+ panic(err);
+ }
+ nonAN, err = regexp.Compile("[^a-zA-Z0-9]+");
+ if err != nil {
+ panic(err);
+ }
+ //TODO add func to read in stop words from a file;
+ stopWords = make([]*regexp.Regexp, 26)
+ if err != nil {
+ panic(err);
+ }
+ stopWords[0], err = regexp.Compile("\\W+and\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[1], err = regexp.Compile("\\W+a\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[2], err = regexp.Compile("\\W+an\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[3], err = regexp.Compile("\\W+and\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[4], err = regexp.Compile("\\W+are\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[5], err = regexp.Compile("\\W+as\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[6], err = regexp.Compile("\\W+at\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[7], err = regexp.Compile("\\W+be\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[8], err = regexp.Compile("\\W+by\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[9], err = regexp.Compile("\\W+for\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[10], err = regexp.Compile("\\W+from\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[11], err = regexp.Compile("\\W+has\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[12], err = regexp.Compile("\\W+he\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[13], err = regexp.Compile("\\W+in\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[14], err = regexp.Compile("\\W+is\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[15], err = regexp.Compile("\\W+it\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[16], err = regexp.Compile("\\W+its\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[17], err = regexp.Compile("\\W+of\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[18], err = regexp.Compile("\\W+on\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[19], err = regexp.Compile("\\W+that\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[20], err = regexp.Compile("\\W+the\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[21], err = regexp.Compile("\\W+to\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[22], err = regexp.Compile("\\W+was\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[23], err = regexp.Compile("\\W+were\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[24], err = regexp.Compile("\\W+will\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[25], err = regexp.Compile("\\W+with\\W+");
+ if err != nil {
+ panic(err);
+ }
+}
+
+func main() {
+ // var words map[string]index
+ var p_dir, w, fname string;
+ var err error;
+ var i, j int;
+ var words map[string]*wordList;
+ var cur *wordList;
+ var tmp *index;
+ var sorted []wordSort;
+
+ var files []os.FileInfo;
+ var dir, fd *os.File;
+ var dir_info, fd_info os.FileInfo;
+ var dir_mode os.FileMode;
+
+ var doc *document;
+
+ flag.StringVar(&p_dir, "d", "./pages", "pages directory");
+
+ flag.Parse();
+
+ words = make(map[string]*wordList);
+
+ dir, err = os.Open(p_dir);
+ if err != nil {
+ log.Printf("Error accessing \"%s\":\t%s\n", p_dir, err);
+ os.Exit(1);
+ }
+
+ dir_info, err = dir.Stat();
+ dir_mode = dir_info.Mode();
+
+ if !dir_mode.IsDir() {
+ log.Printf("\"%s\" is not a directory\n", p_dir);
+ os.Exit(1);
+ }
+
+ files, err = dir.Readdir(0);
+ if err != nil {
+ log.Printf("Error reading %s\n", p_dir);
+ os.Exit(1);
+ }
+
+ for i = 0; i < len(files); i++ {
+ fd, err = os.Open(fmt.Sprintf("%s/%s", dir_info.Name(), files[i].Name()));
+ fd_info, err = fd.Stat();
+ if err != nil {
+ log.Printf("Error getting info\n");
+ os.Exit(1);
+ }
+ fname = fd_info.Name();
+
+ if err != nil {
+ log.Printf("Error reading %s/%s\n", dir_info.Name(), files[i].Name());
+ } else {
+ fmt.Printf("Indexing %s...\n", fname);
+ doc, err = parseDoc(fd, fd_info);
+ if err != nil {
+ log.Printf("Error parsing %s/%s\n", dir_info.Name(), files[i].Name());
+ } else {
+ /* Text */
+ for j = 0; j < len(doc.text); j++ {
+ w = strings.ToLower(doc.text[j]);
+
+ if words[w] == nil{
+ tmp = &index{doc: doc, title: false, freq: 0};
+ words[w] = &wordList{this: tmp, next: nil};
+ }
+
+ for cur = words[w];cur.next != nil && cur.this.doc.fname != fname; cur = cur.next{}
+
+ if cur.this.doc.fname == fname {
+ cur.this.freq++
+ } else if cur.next == nil {
+ tmp = &index{doc: doc, title: false, freq: 1};
+ cur.next = &wordList{this: tmp, next: nil};
+ } else {
+ panic(fmt.Sprintf("%v", cur));
+ }
+ }
+ /* Title */
+ for j = 0; j < len(doc.title); j++ {
+ w = strings.ToLower(doc.title[j]);
+
+ if words[w] == nil{
+ tmp = &index{doc: doc, title: true, freq: 0};
+ words[w] = &wordList{this: tmp, next: nil};
+ }
+
+ for cur = words[w];cur.next != nil && cur.this.doc.fname != fname; cur = cur.next{}
+
+ if cur.this.doc.fname == fname {
+ cur.this.title = true;
+ cur.this.freq++;
+ } else if cur.next == nil {
+ tmp = &index{doc: doc, title: true, freq: 1};
+ cur.next = &wordList{this: tmp, next: nil};
+ } else {
+ panic(fmt.Sprintf("%v", cur));
+ }
+ }
+ }
+ }
+ fd.Close();
+ }
+ sorted = make([]wordSort, len(words));
+ i = 0;
+ for k,v := range words {
+ sorted[i].w = k;
+ sorted[i].root = v;
+ i++;
+ }
+
+ sort.Slice(sorted, func(i, j int) bool {
+ return sorted[i].w < sorted[j].w;
+ });
+
+ fd,_ = os.Create("index.dat");
+ printIndex(sorted, fd);
+ fd.Close();
+}