diff options
Diffstat (limited to 'search/indexer.go')
-rw-r--r-- | search/indexer.go | 152 |
1 files changed, 139 insertions, 13 deletions
diff --git a/search/indexer.go b/search/indexer.go index ee36c9a..9550c86 100644 --- a/search/indexer.go +++ b/search/indexer.go @@ -16,6 +16,7 @@ type document struct { fname string; title []string; text []string; + length int; } type index struct { @@ -34,11 +35,12 @@ type wordList struct { next *wordList } -var r, nonAN, stopWords *regexp.Regexp; +var r, nonAN *regexp.Regexp; +var stopWords []*regexp.Regexp; func newDocument() *document { - return &document{"" , nil, nil}; + return &document{"" , nil, nil, 0}; } func RemoveNode(r, rn *html.Node) { @@ -84,6 +86,7 @@ func parseDoc(fd *os.File, f_info os.FileInfo) (*document, error) { var doc *goquery.Document; var body, title *goquery.Selection; var r_doc *document; + var i int; doc, err = goquery.NewDocumentFromReader(fd); if err != nil { @@ -103,21 +106,29 @@ func parseDoc(fd *os.File, f_info os.FileInfo) (*document, error) { text = r.ReplaceAllString(text, "> <"); - t_text = r.ReplaceAllString(text, "> <"); + t_text = r.ReplaceAllString(t_text, "> <"); text = sanitize.HTML(text); t_text = sanitize.HTML(t_text); + text = strings.ToLower(text); + t_text = strings.ToLower(t_text); + text = nonAN.ReplaceAllString(text, " "); t_text = nonAN.ReplaceAllString(t_text, " "); - text = stopWords.ReplaceAllString(text, ""); - t_text = stopWords.ReplaceAllString(t_text, ""); + for i = 0; i < len(stopWords); i++ { + text = stopWords[i].ReplaceAllString(text, " "); + t_text = stopWords[i].ReplaceAllString(t_text, " "); + } r_doc = newDocument(); + r_doc.fname = f_info.Name(); - r_doc.text = strings.Fields(sanitize.HTML(text)); - r_doc.title = strings.Fields(sanitize.HTML(t_text)); + r_doc.text = strings.Fields(text); + r_doc.title = strings.Fields(t_text); + r_doc.length = len(r_doc.text) + len(r_doc.title); + fmt.Println(r_doc.length) return r_doc, nil; } @@ -140,7 +151,7 @@ func printIndex(words []wordSort, fd *os.File) { for cur = words[i].root; cur != nil; cur = cur.next { fname = cur.this.doc.fname; t = boolToInt(cur.this.title); - freq = float64(cur.this.freq) / float64(len(cur.this.doc.text)); + freq = float64(cur.this.freq) / float64(cur.this.doc.length); fmt.Fprintf(fd,"\t%s %d %.3f\n", fname, t, freq); } @@ -148,10 +159,125 @@ func printIndex(words []wordSort, fd *os.File) { } func init() { + var err error; log.SetOutput(os.Stderr); - r, _ = regexp.Compile("><"); - nonAN, _ = regexp.Compile("[^a-zA-Z0-9]+"); - stopWords, _ = regexp.Compile("( and\\W)|( a\\W)|( an\\W)|( and\\W)|( are\\W)|( as\\W)|( at\\W)|( be\\W)|( by\\W)|( for\\W)|( from\\W)|( has\\W)|( he\\W)|( in\\W)|( is\\W)|( it\\W)|( its\\W)|( of\\W)|( on\\W)|( that\\W)|( the\\W)|( to\\W)|( was\\W)|( were\\W)|( will\\W)|( with\\W)") + r, err = regexp.Compile("><"); + if err != nil { + panic(err); + } + nonAN, err = regexp.Compile("[^a-zA-Z0-9]+"); + if err != nil { + panic(err); + } + //TODO add func to read in stop words from a file; + stopWords = make([]*regexp.Regexp, 26) + if err != nil { + panic(err); + } + stopWords[0], err = regexp.Compile("\\W+and\\W+"); + if err != nil { + panic(err); + } + stopWords[1], err = regexp.Compile("\\W+a\\W+"); + if err != nil { + panic(err); + } + stopWords[2], err = regexp.Compile("\\W+an\\W+"); + if err != nil { + panic(err); + } + stopWords[3], err = regexp.Compile("\\W+and\\W+"); + if err != nil { + panic(err); + } + stopWords[4], err = regexp.Compile("\\W+are\\W+"); + if err != nil { + panic(err); + } + stopWords[5], err = regexp.Compile("\\W+as\\W+"); + if err != nil { + panic(err); + } + stopWords[6], err = regexp.Compile("\\W+at\\W+"); + if err != nil { + panic(err); + } + stopWords[7], err = regexp.Compile("\\W+be\\W+"); + if err != nil { + panic(err); + } + stopWords[8], err = regexp.Compile("\\W+by\\W+"); + if err != nil { + panic(err); + } + stopWords[9], err = regexp.Compile("\\W+for\\W+"); + if err != nil { + panic(err); + } + stopWords[10], err = regexp.Compile("\\W+from\\W+"); + if err != nil { + panic(err); + } + stopWords[11], err = regexp.Compile("\\W+has\\W+"); + if err != nil { + panic(err); + } + stopWords[12], err = regexp.Compile("\\W+he\\W+"); + if err != nil { + panic(err); + } + stopWords[13], err = regexp.Compile("\\W+in\\W+"); + if err != nil { + panic(err); + } + stopWords[14], err = regexp.Compile("\\W+is\\W+"); + if err != nil { + panic(err); + } + stopWords[15], err = regexp.Compile("\\W+it\\W+"); + if err != nil { + panic(err); + } + stopWords[16], err = regexp.Compile("\\W+its\\W+"); + if err != nil { + panic(err); + } + stopWords[17], err = regexp.Compile("\\W+of\\W+"); + if err != nil { + panic(err); + } + stopWords[18], err = regexp.Compile("\\W+on\\W+"); + if err != nil { + panic(err); + } + stopWords[19], err = regexp.Compile("\\W+that\\W+"); + if err != nil { + panic(err); + } + stopWords[20], err = regexp.Compile("\\W+the\\W+"); + if err != nil { + panic(err); + } + stopWords[21], err = regexp.Compile("\\W+to\\W+"); + if err != nil { + panic(err); + } + stopWords[22], err = regexp.Compile("\\W+was\\W+"); + if err != nil { + panic(err); + } + stopWords[23], err = regexp.Compile("\\W+were\\W+"); + if err != nil { + panic(err); + } + stopWords[24], err = regexp.Compile("\\W+will\\W+"); + if err != nil { + panic(err); + } + stopWords[25], err = regexp.Compile("\\W+with\\W+"); + if err != nil { + panic(err); + } } func main() { @@ -219,7 +345,7 @@ func main() { w = strings.ToLower(doc.text[j]); if words[w] == nil{ - tmp = &index{doc: doc, title: false, freq: 1}; + tmp = &index{doc: doc, title: false, freq: 0}; words[w] = &wordList{this: tmp, next: nil}; } @@ -239,7 +365,7 @@ func main() { w = strings.ToLower(doc.title[j]); if words[w] == nil{ - tmp = &index{doc: doc, title: true, freq: 1}; + tmp = &index{doc: doc, title: true, freq: 0}; words[w] = &wordList{this: tmp, next: nil}; } |