From e8b1808eaf87a49e4c34ebbfb66854baa627418c Mon Sep 17 00:00:00 2001 From: Tucker Evans Date: Mon, 18 Feb 2019 07:35:54 -0500 Subject: Moves assignments to given course folder. --- CSC2636/search/indexer.go | 402 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 402 insertions(+) create mode 100644 CSC2636/search/indexer.go (limited to 'CSC2636/search/indexer.go') diff --git a/CSC2636/search/indexer.go b/CSC2636/search/indexer.go new file mode 100644 index 0000000..d95f126 --- /dev/null +++ b/CSC2636/search/indexer.go @@ -0,0 +1,402 @@ +package main + +import "os" +import "sort" +import "golang.org/x/net/html" +import "log" +import "fmt" +import "github.com/PuerkitoBio/goquery" +import "github.com/kennygrant/sanitize" +import "strings" +import "flag" +import "errors" +import "regexp" + +type document struct { + fname string; + title []string; + text []string; + length int; +} + +type index struct { + doc *document; + title bool; + freq int; +} + +type wordSort struct { + w string; + root *wordList; +} + +type wordList struct { + this *index + next *wordList +} + +var r, nonAN *regexp.Regexp; +var stopWords []*regexp.Regexp; + + +func newDocument() *document { + return &document{"" , nil, nil, 0}; +} + +func RemoveNode(r, rn *html.Node) { + var found bool; + var n, item *html.Node; + var nodes map[int]*html.Node; + var i, j int; + + found = false; + nodes = make(map[int]*html.Node); + + for n = r.FirstChild; n != nil; n = n.NextSibling { + if n == rn { + found = true; + n.Parent.RemoveChild(n); + } + + nodes[i] = n; + i++; + } + + if !found { + for j = 0; j < i; j++ { + item = nodes[j]; + RemoveNode(item, rn); + } + } +} +func RemoveTag(doc *goquery.Selection, tag string) { + doc.Find(tag).Each(func(i int, s *goquery.Selection) { + RemoveNode(doc.Get(0), s.Get(0)); + }); +} + +func logReg(h []byte) []byte { + log.Printf("RegExp: %s", h); + return h; +} + +func parseDoc(fd *os.File, f_info os.FileInfo) (*document, error) { + var err error; + var text, t_text string; + var doc *goquery.Document; + var body, title *goquery.Selection; + var r_doc *document; + var i int; + + doc, err = goquery.NewDocumentFromReader(fd); + if err != nil { + log.Printf("goquery error: %s\n", err); + return nil, errors.New("Can't create goquery documnt"); + } + + body = doc.Find("body"); + RemoveTag(body, "script"); + RemoveTag(body, "noscript"); + + title = doc.Find("title"); + + //TODO add error detection + text, err = body.Html(); + t_text, err = title.Html(); + + + text = r.ReplaceAllString(text, "> <"); + t_text = r.ReplaceAllString(t_text, "> <"); + + text = sanitize.HTML(text); + t_text = sanitize.HTML(t_text); + + text = strings.ToLower(text); + t_text = strings.ToLower(t_text); + + text = nonAN.ReplaceAllString(text, " "); + t_text = nonAN.ReplaceAllString(t_text, " "); + + + for i = 0; i < len(stopWords); i++ { + text = stopWords[i].ReplaceAllString(text, " "); + t_text = stopWords[i].ReplaceAllString(t_text, " "); + } + r_doc = newDocument(); + + r_doc.fname = f_info.Name(); + r_doc.text = strings.Fields(text); + r_doc.title = strings.Fields(t_text); + r_doc.length = len(r_doc.text) + len(r_doc.title); + + return r_doc, nil; +} +func boolToInt(t bool) int { + if t { + return 1; + } + return 0; +} + +func printIndex(words []wordSort, fd *os.File) { + var i int; + var cur *wordList; + var fname string; + var t int; + var freq float64; + + for i = 0; i < len(words); i++ { + fmt.Fprintf(fd, "%s\n", words[i].w); + for cur = words[i].root; cur != nil; cur = cur.next { + fname = cur.this.doc.fname; + t = boolToInt(cur.this.title); + freq = float64(cur.this.freq) / float64(cur.this.doc.length); + + fmt.Fprintf(fd,"\t%s %d %.3f\n", fname, t, freq); + } + } +} + +func init() { + var err error; + log.SetOutput(os.Stderr); + r, err = regexp.Compile("><"); + if err != nil { + panic(err); + } + nonAN, err = regexp.Compile("[^a-zA-Z0-9]+"); + if err != nil { + panic(err); + } + //TODO add func to read in stop words from a file; + stopWords = make([]*regexp.Regexp, 26) + if err != nil { + panic(err); + } + stopWords[0], err = regexp.Compile("\\W+and\\W+"); + if err != nil { + panic(err); + } + stopWords[1], err = regexp.Compile("\\W+a\\W+"); + if err != nil { + panic(err); + } + stopWords[2], err = regexp.Compile("\\W+an\\W+"); + if err != nil { + panic(err); + } + stopWords[3], err = regexp.Compile("\\W+and\\W+"); + if err != nil { + panic(err); + } + stopWords[4], err = regexp.Compile("\\W+are\\W+"); + if err != nil { + panic(err); + } + stopWords[5], err = regexp.Compile("\\W+as\\W+"); + if err != nil { + panic(err); + } + stopWords[6], err = regexp.Compile("\\W+at\\W+"); + if err != nil { + panic(err); + } + stopWords[7], err = regexp.Compile("\\W+be\\W+"); + if err != nil { + panic(err); + } + stopWords[8], err = regexp.Compile("\\W+by\\W+"); + if err != nil { + panic(err); + } + stopWords[9], err = regexp.Compile("\\W+for\\W+"); + if err != nil { + panic(err); + } + stopWords[10], err = regexp.Compile("\\W+from\\W+"); + if err != nil { + panic(err); + } + stopWords[11], err = regexp.Compile("\\W+has\\W+"); + if err != nil { + panic(err); + } + stopWords[12], err = regexp.Compile("\\W+he\\W+"); + if err != nil { + panic(err); + } + stopWords[13], err = regexp.Compile("\\W+in\\W+"); + if err != nil { + panic(err); + } + stopWords[14], err = regexp.Compile("\\W+is\\W+"); + if err != nil { + panic(err); + } + stopWords[15], err = regexp.Compile("\\W+it\\W+"); + if err != nil { + panic(err); + } + stopWords[16], err = regexp.Compile("\\W+its\\W+"); + if err != nil { + panic(err); + } + stopWords[17], err = regexp.Compile("\\W+of\\W+"); + if err != nil { + panic(err); + } + stopWords[18], err = regexp.Compile("\\W+on\\W+"); + if err != nil { + panic(err); + } + stopWords[19], err = regexp.Compile("\\W+that\\W+"); + if err != nil { + panic(err); + } + stopWords[20], err = regexp.Compile("\\W+the\\W+"); + if err != nil { + panic(err); + } + stopWords[21], err = regexp.Compile("\\W+to\\W+"); + if err != nil { + panic(err); + } + stopWords[22], err = regexp.Compile("\\W+was\\W+"); + if err != nil { + panic(err); + } + stopWords[23], err = regexp.Compile("\\W+were\\W+"); + if err != nil { + panic(err); + } + stopWords[24], err = regexp.Compile("\\W+will\\W+"); + if err != nil { + panic(err); + } + stopWords[25], err = regexp.Compile("\\W+with\\W+"); + if err != nil { + panic(err); + } +} + +func main() { + // var words map[string]index + var p_dir, w, fname string; + var err error; + var i, j int; + var words map[string]*wordList; + var cur *wordList; + var tmp *index; + var sorted []wordSort; + + var files []os.FileInfo; + var dir, fd *os.File; + var dir_info, fd_info os.FileInfo; + var dir_mode os.FileMode; + + var doc *document; + + flag.StringVar(&p_dir, "d", "./pages", "pages directory"); + + flag.Parse(); + + words = make(map[string]*wordList); + + dir, err = os.Open(p_dir); + if err != nil { + log.Printf("Error accessing \"%s\":\t%s\n", p_dir, err); + os.Exit(1); + } + + dir_info, err = dir.Stat(); + dir_mode = dir_info.Mode(); + + if !dir_mode.IsDir() { + log.Printf("\"%s\" is not a directory\n", p_dir); + os.Exit(1); + } + + files, err = dir.Readdir(0); + if err != nil { + log.Printf("Error reading %s\n", p_dir); + os.Exit(1); + } + + for i = 0; i < len(files); i++ { + fd, err = os.Open(fmt.Sprintf("%s/%s", dir_info.Name(), files[i].Name())); + fd_info, err = fd.Stat(); + if err != nil { + log.Printf("Error getting info\n"); + os.Exit(1); + } + fname = fd_info.Name(); + + if err != nil { + log.Printf("Error reading %s/%s\n", dir_info.Name(), files[i].Name()); + } else { + fmt.Printf("Indexing %s...\n", fname); + doc, err = parseDoc(fd, fd_info); + if err != nil { + log.Printf("Error parsing %s/%s\n", dir_info.Name(), files[i].Name()); + } else { + /* Text */ + for j = 0; j < len(doc.text); j++ { + w = strings.ToLower(doc.text[j]); + + if words[w] == nil{ + tmp = &index{doc: doc, title: false, freq: 0}; + words[w] = &wordList{this: tmp, next: nil}; + } + + for cur = words[w];cur.next != nil && cur.this.doc.fname != fname; cur = cur.next{} + + if cur.this.doc.fname == fname { + cur.this.freq++ + } else if cur.next == nil { + tmp = &index{doc: doc, title: false, freq: 1}; + cur.next = &wordList{this: tmp, next: nil}; + } else { + panic(fmt.Sprintf("%v", cur)); + } + } + /* Title */ + for j = 0; j < len(doc.title); j++ { + w = strings.ToLower(doc.title[j]); + + if words[w] == nil{ + tmp = &index{doc: doc, title: true, freq: 0}; + words[w] = &wordList{this: tmp, next: nil}; + } + + for cur = words[w];cur.next != nil && cur.this.doc.fname != fname; cur = cur.next{} + + if cur.this.doc.fname == fname { + cur.this.title = true; + cur.this.freq++; + } else if cur.next == nil { + tmp = &index{doc: doc, title: true, freq: 1}; + cur.next = &wordList{this: tmp, next: nil}; + } else { + panic(fmt.Sprintf("%v", cur)); + } + } + } + } + fd.Close(); + } + sorted = make([]wordSort, len(words)); + i = 0; + for k,v := range words { + sorted[i].w = k; + sorted[i].root = v; + i++; + } + + sort.Slice(sorted, func(i, j int) bool { + return sorted[i].w < sorted[j].w; + }); + + fd,_ = os.Create("index.dat"); + printIndex(sorted, fd); + fd.Close(); +} -- cgit v1.1