From 01e350a1d72a9633a9fb3194422a689a5d572aaf Mon Sep 17 00:00:00 2001 From: Tucker Evans Date: Sun, 17 Dec 2017 05:54:06 -0500 Subject: CSC2621/assignments/search: Implemented search; Fixed Indexer frequency vals; --- search/index/index.go | 66 +++++++++++----------- search/indexer.go | 152 +++++++++++++++++++++++++++++++++++++++++++++----- search/search.go | 127 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 299 insertions(+), 46 deletions(-) (limited to 'search') diff --git a/search/index/index.go b/search/index/index.go index 81b50e0..5d8ab65 100644 --- a/search/index/index.go +++ b/search/index/index.go @@ -19,34 +19,34 @@ import "strconv" * Types * *********/ -type f_info struct { - word string; - in_title bool; - freq float64; +type F_info struct { + Word string; + In_title bool; + Freq float64; }; -type i_info struct { - doc string; - in_title bool; - freq float64; +type I_info struct { + Doc string; + In_title bool; + Freq float64; }; -type f_entry struct{ - this *f_info; - next *f_entry; +type F_entry struct{ + This *F_info; + Next *F_entry; }; -type i_entry struct{ - this *i_info; - next *i_entry; +type I_entry struct{ + This *I_info; + Next *I_entry; }; -type F_index map[string]*f_entry; -type I_index map[string]*i_entry; +type F_index map[string]*F_entry; +type I_index map[string]*I_entry; type sortInverted struct{ w string; - root *i_entry; + root *I_entry; }; @@ -54,7 +54,7 @@ type sortInverted struct{ * Forward Index Funcitons * ***************************/ -func NewForwardEntryStrings(text, title []string) (*f_entry, error) { +func NewForwardEntryStrings(text, title []string) (*F_entry, error) { return nil, errors.New("not implemented"); } @@ -62,8 +62,8 @@ func NewForwardEntryStrings(text, title []string) (*f_entry, error) { * Inverted Index Functions * ****************************/ -func new_i_info() *i_info{ - return &i_info{"", false, 0.0}; +func new_I_info() *I_info{ + return &I_info{"", false, 0.0}; } func NewInvertedIndexFromFile(fname string) (I_index, error) { @@ -71,8 +71,8 @@ func NewInvertedIndexFromFile(fname string) (I_index, error) { var br *bufio.Reader; var err error; var buf []byte; - var tmp *i_info; - var cur *i_entry; + var tmp *I_info; + var cur *I_entry; var index I_index; var word string var info []string; @@ -90,7 +90,7 @@ func NewInvertedIndexFromFile(fname string) (I_index, error) { index = make(I_index); for buf, err = br.ReadBytes('\n'); err != io.EOF; buf, err = br.ReadBytes('\n'){ - tmp = new_i_info(); + tmp = new_I_info(); if err != nil { return nil, err; } @@ -98,17 +98,17 @@ func NewInvertedIndexFromFile(fname string) (I_index, error) { word = strings.TrimSpace(string(buf)); } else { info = strings.Fields(string(buf)); - tmp.doc = info[0]; - tmp.in_title = (info[1] == "1"); - tmp.freq, _ = strconv.ParseFloat(info[2], 32); + tmp.Doc = info[0]; + tmp.In_title = (info[1] == "1"); + tmp.Freq, _ = strconv.ParseFloat(info[2], 32); if (index[word] == nil) { - index[word] = &i_entry{this: tmp, next: nil}; + index[word] = &I_entry{This: tmp, Next: nil}; } else { cur = index[word]; - for cur.next != nil { - cur = cur.next; + for cur.Next != nil { + cur = cur.Next; } - cur.next = &i_entry{this: tmp, next: nil}; + cur.Next = &I_entry{This: tmp, Next: nil}; } } } @@ -123,15 +123,15 @@ func NewInvertedFromForward(f F_index) (I_index, error) { func (x I_index) PrintToFile(fd *os.File) error{ var i int; - var cur *i_entry; + var cur *I_entry; var index []sortInverted; index = x.sortIndex(); for i = 0; i < len(index); i++ { fmt.Fprintf(fd, "%s\n", index[i].w); - for cur = index[i].root; cur != nil; cur = cur.next { - fmt.Fprintf(fd, "\t%s %d %.3f\n", cur.this.doc, toInt(cur.this.in_title), cur.this.freq); + for cur = index[i].root; cur != nil; cur = cur.Next { + fmt.Fprintf(fd, "\t%s %d %.3f\n", cur.This.Doc, toInt(cur.This.In_title), cur.This.Freq); } } return nil; diff --git a/search/indexer.go b/search/indexer.go index ee36c9a..9550c86 100644 --- a/search/indexer.go +++ b/search/indexer.go @@ -16,6 +16,7 @@ type document struct { fname string; title []string; text []string; + length int; } type index struct { @@ -34,11 +35,12 @@ type wordList struct { next *wordList } -var r, nonAN, stopWords *regexp.Regexp; +var r, nonAN *regexp.Regexp; +var stopWords []*regexp.Regexp; func newDocument() *document { - return &document{"" , nil, nil}; + return &document{"" , nil, nil, 0}; } func RemoveNode(r, rn *html.Node) { @@ -84,6 +86,7 @@ func parseDoc(fd *os.File, f_info os.FileInfo) (*document, error) { var doc *goquery.Document; var body, title *goquery.Selection; var r_doc *document; + var i int; doc, err = goquery.NewDocumentFromReader(fd); if err != nil { @@ -103,21 +106,29 @@ func parseDoc(fd *os.File, f_info os.FileInfo) (*document, error) { text = r.ReplaceAllString(text, "> <"); - t_text = r.ReplaceAllString(text, "> <"); + t_text = r.ReplaceAllString(t_text, "> <"); text = sanitize.HTML(text); t_text = sanitize.HTML(t_text); + text = strings.ToLower(text); + t_text = strings.ToLower(t_text); + text = nonAN.ReplaceAllString(text, " "); t_text = nonAN.ReplaceAllString(t_text, " "); - text = stopWords.ReplaceAllString(text, ""); - t_text = stopWords.ReplaceAllString(t_text, ""); + for i = 0; i < len(stopWords); i++ { + text = stopWords[i].ReplaceAllString(text, " "); + t_text = stopWords[i].ReplaceAllString(t_text, " "); + } r_doc = newDocument(); + r_doc.fname = f_info.Name(); - r_doc.text = strings.Fields(sanitize.HTML(text)); - r_doc.title = strings.Fields(sanitize.HTML(t_text)); + r_doc.text = strings.Fields(text); + r_doc.title = strings.Fields(t_text); + r_doc.length = len(r_doc.text) + len(r_doc.title); + fmt.Println(r_doc.length) return r_doc, nil; } @@ -140,7 +151,7 @@ func printIndex(words []wordSort, fd *os.File) { for cur = words[i].root; cur != nil; cur = cur.next { fname = cur.this.doc.fname; t = boolToInt(cur.this.title); - freq = float64(cur.this.freq) / float64(len(cur.this.doc.text)); + freq = float64(cur.this.freq) / float64(cur.this.doc.length); fmt.Fprintf(fd,"\t%s %d %.3f\n", fname, t, freq); } @@ -148,10 +159,125 @@ func printIndex(words []wordSort, fd *os.File) { } func init() { + var err error; log.SetOutput(os.Stderr); - r, _ = regexp.Compile("><"); - nonAN, _ = regexp.Compile("[^a-zA-Z0-9]+"); - stopWords, _ = regexp.Compile("( and\\W)|( a\\W)|( an\\W)|( and\\W)|( are\\W)|( as\\W)|( at\\W)|( be\\W)|( by\\W)|( for\\W)|( from\\W)|( has\\W)|( he\\W)|( in\\W)|( is\\W)|( it\\W)|( its\\W)|( of\\W)|( on\\W)|( that\\W)|( the\\W)|( to\\W)|( was\\W)|( were\\W)|( will\\W)|( with\\W)") + r, err = regexp.Compile("><"); + if err != nil { + panic(err); + } + nonAN, err = regexp.Compile("[^a-zA-Z0-9]+"); + if err != nil { + panic(err); + } + //TODO add func to read in stop words from a file; + stopWords = make([]*regexp.Regexp, 26) + if err != nil { + panic(err); + } + stopWords[0], err = regexp.Compile("\\W+and\\W+"); + if err != nil { + panic(err); + } + stopWords[1], err = regexp.Compile("\\W+a\\W+"); + if err != nil { + panic(err); + } + stopWords[2], err = regexp.Compile("\\W+an\\W+"); + if err != nil { + panic(err); + } + stopWords[3], err = regexp.Compile("\\W+and\\W+"); + if err != nil { + panic(err); + } + stopWords[4], err = regexp.Compile("\\W+are\\W+"); + if err != nil { + panic(err); + } + stopWords[5], err = regexp.Compile("\\W+as\\W+"); + if err != nil { + panic(err); + } + stopWords[6], err = regexp.Compile("\\W+at\\W+"); + if err != nil { + panic(err); + } + stopWords[7], err = regexp.Compile("\\W+be\\W+"); + if err != nil { + panic(err); + } + stopWords[8], err = regexp.Compile("\\W+by\\W+"); + if err != nil { + panic(err); + } + stopWords[9], err = regexp.Compile("\\W+for\\W+"); + if err != nil { + panic(err); + } + stopWords[10], err = regexp.Compile("\\W+from\\W+"); + if err != nil { + panic(err); + } + stopWords[11], err = regexp.Compile("\\W+has\\W+"); + if err != nil { + panic(err); + } + stopWords[12], err = regexp.Compile("\\W+he\\W+"); + if err != nil { + panic(err); + } + stopWords[13], err = regexp.Compile("\\W+in\\W+"); + if err != nil { + panic(err); + } + stopWords[14], err = regexp.Compile("\\W+is\\W+"); + if err != nil { + panic(err); + } + stopWords[15], err = regexp.Compile("\\W+it\\W+"); + if err != nil { + panic(err); + } + stopWords[16], err = regexp.Compile("\\W+its\\W+"); + if err != nil { + panic(err); + } + stopWords[17], err = regexp.Compile("\\W+of\\W+"); + if err != nil { + panic(err); + } + stopWords[18], err = regexp.Compile("\\W+on\\W+"); + if err != nil { + panic(err); + } + stopWords[19], err = regexp.Compile("\\W+that\\W+"); + if err != nil { + panic(err); + } + stopWords[20], err = regexp.Compile("\\W+the\\W+"); + if err != nil { + panic(err); + } + stopWords[21], err = regexp.Compile("\\W+to\\W+"); + if err != nil { + panic(err); + } + stopWords[22], err = regexp.Compile("\\W+was\\W+"); + if err != nil { + panic(err); + } + stopWords[23], err = regexp.Compile("\\W+were\\W+"); + if err != nil { + panic(err); + } + stopWords[24], err = regexp.Compile("\\W+will\\W+"); + if err != nil { + panic(err); + } + stopWords[25], err = regexp.Compile("\\W+with\\W+"); + if err != nil { + panic(err); + } } func main() { @@ -219,7 +345,7 @@ func main() { w = strings.ToLower(doc.text[j]); if words[w] == nil{ - tmp = &index{doc: doc, title: false, freq: 1}; + tmp = &index{doc: doc, title: false, freq: 0}; words[w] = &wordList{this: tmp, next: nil}; } @@ -239,7 +365,7 @@ func main() { w = strings.ToLower(doc.title[j]); if words[w] == nil{ - tmp = &index{doc: doc, title: true, freq: 1}; + tmp = &index{doc: doc, title: true, freq: 0}; words[w] = &wordList{this: tmp, next: nil}; } diff --git a/search/search.go b/search/search.go index 7905807..9c9bb38 100644 --- a/search/search.go +++ b/search/search.go @@ -1,5 +1,132 @@ +/************************************************ + * README * + * In order for search/index to be accessible * + * you must link this folder (search) into your * + * GOPATH * + ************************************************/ + + package main +import "search/index" +import "os" +import "fmt" +import "sort" + +type res struct { + doc string; + score float64; +}; + func main() { + var init_index, sIndex index.I_index; + var tmp, results, root *index.I_entry; + var tmp_score float64; + var scores map[string]map[string]float64; // scores[doc][word] == score + var i,j int; + var searchBool, perWord, docAdded map[string]bool; //map[doc]bool + var resultSort []res; + var err error; + + scores = make(map[string]map[string]float64); + searchBool = make(map[string]bool); + perWord = make(map[string]bool); + docAdded = make(map[string]bool); + + + sIndex = make(index.I_index); + + + + init_index, err = index.NewInvertedIndexFromFile("index.dat"); //TODO add flag for filename + if err != nil { + panic(err) + } + + for i = 1; i < len(os.Args); i++ { + sIndex[os.Args[i]] = init_index[os.Args[i]] + } + + for _, v := range sIndex { + for tmp = v; tmp != nil; tmp = tmp.Next { + perWord[tmp.This.Doc] = true; + searchBool[tmp.This.Doc] = true; + scores[tmp.This.Doc] = make(map[string]float64); + } + } + + for _, v := range sIndex { + for tmp = v; tmp != nil; tmp = tmp.Next { + perWord[tmp.This.Doc] = true; + } + + for d := range searchBool { + if _, o := perWord[d]; !o { + searchBool[d] = false; + } + } + perWord = make(map[string]bool); + } + + for k, v := range sIndex { + for tmp = v; tmp != nil; tmp = tmp.Next { + if searchBool[tmp.This.Doc] { + if tmp.This.In_title { + tmp_score = 1.0; + } else { + tmp_score = 0.0; + } + + scores[tmp.This.Doc][k] = (0.9 * tmp.This.Freq) + (0.1 * tmp_score); + } + } + + } + + i = 0; + results = &index.I_entry{nil, nil} + root = &index.I_entry{nil, nil}; + results.Next = root; + + j = 0; + + for _ ,v := range sIndex { + for tmp = v; tmp != nil; tmp = tmp.Next { + if (searchBool[tmp.This.Doc]) { + root.This = tmp.This; + docAdded[root.This.Doc] = false; + root.Next = &index.I_entry{nil, nil}; + root = root.Next; + j++ + } + } + } + + resultSort = make([]res, j); + + i = 0; + for root = results.Next; root.Next != nil; root = root.Next { + if (!docAdded[root.This.Doc]) { + j = 0; + tmp_score = 0; + for _ ,v := range scores[root.This.Doc] { + tmp_score += v; + j++; + } + tmp_score /= float64(j); + resultSort[i] = res{root.This.Doc, tmp_score}; + docAdded[root.This.Doc] = true; + i++; + } + } + resultSort = resultSort[:i]; + sort.Slice(resultSort, func(i, j int) bool { + return resultSort[i].score > resultSort[j].score; + }); + + fmt.Printf("Results: %d\n", len(resultSort)); + for i = 0; i < len(resultSort); i++ { + fmt.Printf("\t%d. Doc: %s, Score: %.3f\n", i, resultSort[i].doc, resultSort[i].score); + } } -- cgit v1.1