From 7e1216bde15e1ff7bd5f0359701bd9443f47c659 Mon Sep 17 00:00:00 2001 From: Tucker Evans Date: Mon, 11 Dec 2017 05:54:55 -0500 Subject: CSC2621/assignments/search: Started a index package for inxeder & searcher --- search/index.go | 344 +++++++++++++++++------------------------------------- search/indexer.go | 277 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 385 insertions(+), 236 deletions(-) create mode 100644 search/indexer.go (limited to 'search') diff --git a/search/index.go b/search/index.go index ee36c9a..b7dd453 100644 --- a/search/index.go +++ b/search/index.go @@ -1,277 +1,149 @@ -package main +package index import "os" import "sort" -import "golang.org/x/net/html" -import "log" -import "fmt" -import "github.com/PuerkitoBio/goquery" -import "github.com/kennygrant/sanitize" -import "strings" -import "flag" import "errors" -import "regexp" +import "strings" +import "strconv" -type document struct { - fname string; - title []string; - text []string; -} +/* TODO -type index struct { - doc *document; - title bool; - freq int; -} + - Implement Forward Creation + - Implement Inverted from Forward + - Switch Indexer.go over to this package -type wordSort struct { - w string; - root *wordList; -} +/********* + * Types * + *********/ -type wordList struct { - this *index - next *wordList -} +type f_info struct { + word string; + in_title bool; + freq float32; +}; -var r, nonAN, stopWords *regexp.Regexp; +type i_info struct { + doc string; + in_title bool; + freq float32; +}; +type f_entry { + this *f_info; + next *f_info; +}; -func newDocument() *document { - return &document{"" , nil, nil}; -} +type i_entry { + this *i_info; + next *i_info; +}; -func RemoveNode(r, rn *html.Node) { - var found bool; - var n, item *html.Node; - var nodes map[int]*html.Node; - var i, j int; +type f_index map[string]f_entry; +type i_index map[string]i_index; - found = false; - nodes = make(map[int]*html.Node); +type sortInverted { + w string; + root *i_entry; +}; - for n = r.FirstChild; n != nil; n = n.NextSibling { - if n == rn { - found = true; - n.Parent.RemoveChild(n); - } - nodes[i] = n; - i++; - } +/*************************** + * Forward Index Funcitons * + ***************************/ - if !found { - for j = 0; j < i; j++ { - item = nodes[j]; - RemoveNode(item, rn); - } - } -} -func RemoveTag(doc *goquery.Selection, tag string) { - doc.Find(tag).Each(func(i int, s *goquery.Selection) { - RemoveNode(doc.Get(0), s.Get(0)); - }); -} +func NewForwardEntryStrings(text, title []string) *f_entry, error{ -func logReg(h []byte) []byte { - log.Printf("RegExp: %s", h); - return h; } -func parseDoc(fd *os.File, f_info os.FileInfo) (*document, error) { - var err error; - var text, t_text string; - var doc *goquery.Document; - var body, title *goquery.Selection; - var r_doc *document; +/**************************** + * Inverted Index Functions * + ****************************/ - doc, err = goquery.NewDocumentFromReader(fd); +func NewInvertedIndexFromFile(fname string) i_index, error{ + var fd *os.File; + var br *bufio.Reader; + var err error; + var buf []byte; + var tmp i_info; + var cur *i_info; + var index i_index; + var word string + var info []string; + + fd, err = os.Open(fname); if err != nil { - log.Printf("goquery error: %s\n", err); - return nil, errors.New("Can't create goquery documnt"); + return nil, err; } - body = doc.Find("body"); - RemoveTag(body, "script"); - RemoveTag(body, "noscript"); - - title = doc.Find("title"); - - //TODO add error detection - text, err = body.Html(); - t_text, err = title.Html(); - - - text = r.ReplaceAllString(text, "> <"); - t_text = r.ReplaceAllString(text, "> <"); - - text = sanitize.HTML(text); - t_text = sanitize.HTML(t_text); + br, err = bufio.NewReader(fd); + if err != nil { + return nil, err; + } - text = nonAN.ReplaceAllString(text, " "); - t_text = nonAN.ReplaceAllString(t_text, " "); + for err != io.EOF { + buf, err = br.ReadBytes('\n'); + if buf[0] != '\t' { + word = strings.Trim(string(buf)); + } else { + tmp = i_info{nil, false, 0.0}; + info strings.Field(string(buf)); + tmp.String = info[0]; + tmp.in_title = (info[1] == 1); + tmp.freq = strconv.ParseFloat(info[2], 32); + if (index[word] == nil) { + index[word] = &tmp; + } else { + cur = index[word]; + for cur.next != nil { + cur = cur.next; + } + cur.next = &i_entry{this: &tmp, next: nil}; + } + } + } - text = stopWords.ReplaceAllString(text, ""); - t_text = stopWords.ReplaceAllString(t_text, ""); + return index; +} - r_doc = newDocument(); - r_doc.fname = f_info.Name(); - r_doc.text = strings.Fields(sanitize.HTML(text)); - r_doc.title = strings.Fields(sanitize.HTML(t_text)); +func NewInvertedFromForward(f f_index) i_index, error { - return r_doc, nil; -} -func boolToInt(t bool) int { - if t { - return 1; - } - return 0; } -func printIndex(words []wordSort, fd *os.File) { +func (x i_index) PrintToFile(fd *os.File) error{ var i int; - var cur *wordList; - var fname string; - var t int; - var freq float64; - - for i = 0; i < len(words); i++ { - fmt.Fprintf(fd, "%s\n", words[i].w); - for cur = words[i].root; cur != nil; cur = cur.next { - fname = cur.this.doc.fname; - t = boolToInt(cur.this.title); - freq = float64(cur.this.freq) / float64(len(cur.this.doc.text)); - - fmt.Fprintf(fd,"\t%s %d %.3f\n", fname, t, freq); + var cur *i_entry; + var index []sortInverted; + + index = x.sortIndex(); + + for i = 0; i < len(index); i++ { + fmt.Fprintf(fd, "%s\n", index[i].w); + for cur = index[i].root; cur != nil; cur = cur.next { + fmt.Fprintf(fd, "\t%s %d %.3f", cur.this.doc, toInt(cur.this.in_title), cur.this.freq); } } } -func init() { - log.SetOutput(os.Stderr); - r, _ = regexp.Compile("><"); - nonAN, _ = regexp.Compile("[^a-zA-Z0-9]+"); - stopWords, _ = regexp.Compile("( and\\W)|( a\\W)|( an\\W)|( and\\W)|( are\\W)|( as\\W)|( at\\W)|( be\\W)|( by\\W)|( for\\W)|( from\\W)|( has\\W)|( he\\W)|( in\\W)|( is\\W)|( it\\W)|( its\\W)|( of\\W)|( on\\W)|( that\\W)|( the\\W)|( to\\W)|( was\\W)|( were\\W)|( will\\W)|( with\\W)") +func toInt(t bool) int{ + if t + return 1; + return 0; } -func main() { - // var words map[string]index - var p_dir, w, fname string; - var err error; - var i, j int; - var words map[string]*wordList; - var cur *wordList; - var tmp *index; - var sorted []wordSort; - - var files []os.FileInfo; - var dir, fd *os.File; - var dir_info, fd_info os.FileInfo; - var dir_mode os.FileMode; - - var doc *document; - - flag.StringVar(&p_dir, "d", "./pages", "pages directory"); - - flag.Parse(); - - words = make(map[string]*wordList); - - dir, err = os.Open(p_dir); - if err != nil { - log.Printf("Error accessing \"%s\":\t%s\n", p_dir, err); - os.Exit(1); - } - - dir_info, err = dir.Stat(); - dir_mode = dir_info.Mode(); +func (unsort i_index) sortIndex() []sortInverted { + var i int; + var sorted []sortInverted; - if !dir_mode.IsDir() { - log.Printf("\"%s\" is not a directory\n", p_dir); - os.Exit(1); - } + sorted = make([]sortInverted, len(unsort)); - files, err = dir.Readdir(0); - if err != nil { - log.Printf("Error reading %s\n", p_dir); - os.Exit(1); + i = 0; + for k, v := range unsort { + sorted[i].w = k; + sorted[i].root = v; + i++; } - for i = 0; i < len(files); i++ { - fd, err = os.Open(fmt.Sprintf("%s/%s", dir_info.Name(), files[i].Name())); - fd_info, err = fd.Stat(); - if err != nil { - log.Printf("Error getting info\n"); - os.Exit(1); - } - fname = fd_info.Name(); - - if err != nil { - log.Printf("Error reading %s/%s\n", dir_info.Name(), files[i].Name()); - } else { - fmt.Printf("Indexing %s...\n", fname); - doc, err = parseDoc(fd, fd_info); - if err != nil { - log.Printf("Error parsing %s/%s\n", dir_info.Name(), files[i].Name()); - } else { - /* Text */ - for j = 0; j < len(doc.text); j++ { - w = strings.ToLower(doc.text[j]); - - if words[w] == nil{ - tmp = &index{doc: doc, title: false, freq: 1}; - words[w] = &wordList{this: tmp, next: nil}; - } - - for cur = words[w];cur.next != nil && cur.this.doc.fname != fname; cur = cur.next{} - - if cur.this.doc.fname == fname { - cur.this.freq++ - } else if cur.next == nil { - tmp = &index{doc: doc, title: false, freq: 1}; - cur.next = &wordList{this: tmp, next: nil}; - } else { - panic(fmt.Sprintf("%v", cur)); - } - } - /* Title */ - for j = 0; j < len(doc.title); j++ { - w = strings.ToLower(doc.title[j]); - - if words[w] == nil{ - tmp = &index{doc: doc, title: true, freq: 1}; - words[w] = &wordList{this: tmp, next: nil}; - } - - for cur = words[w];cur.next != nil && cur.this.doc.fname != fname; cur = cur.next{} - - if cur.this.doc.fname == fname { - cur.this.title = true; - cur.this.freq++; - } else if cur.next == nil { - tmp = &index{doc: doc, title: true, freq: 1}; - cur.next = &wordList{this: tmp, next: nil}; - } else { - panic(fmt.Sprintf("%v", cur)); - } - } - } - } - fd.Close(); - } - sorted = make([]wordSort, len(words)); - i = 0; - for k,v := range words { - sorted[i].w = k; - sorted[i].root = v; - i++; - } - - sort.Slice(sorted, func(i, j int) bool { - return sorted[i].w < sorted[j].w; - }); - - fd,_ = os.Create("index.dat"); - printIndex(sorted, fd); - fd.Close(); + sort.Slice(sorted, func(i, j int) bool { + return sorted[i].w < sorted[j].w; + }); } diff --git a/search/indexer.go b/search/indexer.go new file mode 100644 index 0000000..ee36c9a --- /dev/null +++ b/search/indexer.go @@ -0,0 +1,277 @@ +package main + +import "os" +import "sort" +import "golang.org/x/net/html" +import "log" +import "fmt" +import "github.com/PuerkitoBio/goquery" +import "github.com/kennygrant/sanitize" +import "strings" +import "flag" +import "errors" +import "regexp" + +type document struct { + fname string; + title []string; + text []string; +} + +type index struct { + doc *document; + title bool; + freq int; +} + +type wordSort struct { + w string; + root *wordList; +} + +type wordList struct { + this *index + next *wordList +} + +var r, nonAN, stopWords *regexp.Regexp; + + +func newDocument() *document { + return &document{"" , nil, nil}; +} + +func RemoveNode(r, rn *html.Node) { + var found bool; + var n, item *html.Node; + var nodes map[int]*html.Node; + var i, j int; + + found = false; + nodes = make(map[int]*html.Node); + + for n = r.FirstChild; n != nil; n = n.NextSibling { + if n == rn { + found = true; + n.Parent.RemoveChild(n); + } + + nodes[i] = n; + i++; + } + + if !found { + for j = 0; j < i; j++ { + item = nodes[j]; + RemoveNode(item, rn); + } + } +} +func RemoveTag(doc *goquery.Selection, tag string) { + doc.Find(tag).Each(func(i int, s *goquery.Selection) { + RemoveNode(doc.Get(0), s.Get(0)); + }); +} + +func logReg(h []byte) []byte { + log.Printf("RegExp: %s", h); + return h; +} + +func parseDoc(fd *os.File, f_info os.FileInfo) (*document, error) { + var err error; + var text, t_text string; + var doc *goquery.Document; + var body, title *goquery.Selection; + var r_doc *document; + + doc, err = goquery.NewDocumentFromReader(fd); + if err != nil { + log.Printf("goquery error: %s\n", err); + return nil, errors.New("Can't create goquery documnt"); + } + + body = doc.Find("body"); + RemoveTag(body, "script"); + RemoveTag(body, "noscript"); + + title = doc.Find("title"); + + //TODO add error detection + text, err = body.Html(); + t_text, err = title.Html(); + + + text = r.ReplaceAllString(text, "> <"); + t_text = r.ReplaceAllString(text, "> <"); + + text = sanitize.HTML(text); + t_text = sanitize.HTML(t_text); + + text = nonAN.ReplaceAllString(text, " "); + t_text = nonAN.ReplaceAllString(t_text, " "); + + text = stopWords.ReplaceAllString(text, ""); + t_text = stopWords.ReplaceAllString(t_text, ""); + + r_doc = newDocument(); + r_doc.fname = f_info.Name(); + r_doc.text = strings.Fields(sanitize.HTML(text)); + r_doc.title = strings.Fields(sanitize.HTML(t_text)); + + return r_doc, nil; +} +func boolToInt(t bool) int { + if t { + return 1; + } + return 0; +} + +func printIndex(words []wordSort, fd *os.File) { + var i int; + var cur *wordList; + var fname string; + var t int; + var freq float64; + + for i = 0; i < len(words); i++ { + fmt.Fprintf(fd, "%s\n", words[i].w); + for cur = words[i].root; cur != nil; cur = cur.next { + fname = cur.this.doc.fname; + t = boolToInt(cur.this.title); + freq = float64(cur.this.freq) / float64(len(cur.this.doc.text)); + + fmt.Fprintf(fd,"\t%s %d %.3f\n", fname, t, freq); + } + } +} + +func init() { + log.SetOutput(os.Stderr); + r, _ = regexp.Compile("><"); + nonAN, _ = regexp.Compile("[^a-zA-Z0-9]+"); + stopWords, _ = regexp.Compile("( and\\W)|( a\\W)|( an\\W)|( and\\W)|( are\\W)|( as\\W)|( at\\W)|( be\\W)|( by\\W)|( for\\W)|( from\\W)|( has\\W)|( he\\W)|( in\\W)|( is\\W)|( it\\W)|( its\\W)|( of\\W)|( on\\W)|( that\\W)|( the\\W)|( to\\W)|( was\\W)|( were\\W)|( will\\W)|( with\\W)") +} + +func main() { + // var words map[string]index + var p_dir, w, fname string; + var err error; + var i, j int; + var words map[string]*wordList; + var cur *wordList; + var tmp *index; + var sorted []wordSort; + + var files []os.FileInfo; + var dir, fd *os.File; + var dir_info, fd_info os.FileInfo; + var dir_mode os.FileMode; + + var doc *document; + + flag.StringVar(&p_dir, "d", "./pages", "pages directory"); + + flag.Parse(); + + words = make(map[string]*wordList); + + dir, err = os.Open(p_dir); + if err != nil { + log.Printf("Error accessing \"%s\":\t%s\n", p_dir, err); + os.Exit(1); + } + + dir_info, err = dir.Stat(); + dir_mode = dir_info.Mode(); + + if !dir_mode.IsDir() { + log.Printf("\"%s\" is not a directory\n", p_dir); + os.Exit(1); + } + + files, err = dir.Readdir(0); + if err != nil { + log.Printf("Error reading %s\n", p_dir); + os.Exit(1); + } + + for i = 0; i < len(files); i++ { + fd, err = os.Open(fmt.Sprintf("%s/%s", dir_info.Name(), files[i].Name())); + fd_info, err = fd.Stat(); + if err != nil { + log.Printf("Error getting info\n"); + os.Exit(1); + } + fname = fd_info.Name(); + + if err != nil { + log.Printf("Error reading %s/%s\n", dir_info.Name(), files[i].Name()); + } else { + fmt.Printf("Indexing %s...\n", fname); + doc, err = parseDoc(fd, fd_info); + if err != nil { + log.Printf("Error parsing %s/%s\n", dir_info.Name(), files[i].Name()); + } else { + /* Text */ + for j = 0; j < len(doc.text); j++ { + w = strings.ToLower(doc.text[j]); + + if words[w] == nil{ + tmp = &index{doc: doc, title: false, freq: 1}; + words[w] = &wordList{this: tmp, next: nil}; + } + + for cur = words[w];cur.next != nil && cur.this.doc.fname != fname; cur = cur.next{} + + if cur.this.doc.fname == fname { + cur.this.freq++ + } else if cur.next == nil { + tmp = &index{doc: doc, title: false, freq: 1}; + cur.next = &wordList{this: tmp, next: nil}; + } else { + panic(fmt.Sprintf("%v", cur)); + } + } + /* Title */ + for j = 0; j < len(doc.title); j++ { + w = strings.ToLower(doc.title[j]); + + if words[w] == nil{ + tmp = &index{doc: doc, title: true, freq: 1}; + words[w] = &wordList{this: tmp, next: nil}; + } + + for cur = words[w];cur.next != nil && cur.this.doc.fname != fname; cur = cur.next{} + + if cur.this.doc.fname == fname { + cur.this.title = true; + cur.this.freq++; + } else if cur.next == nil { + tmp = &index{doc: doc, title: true, freq: 1}; + cur.next = &wordList{this: tmp, next: nil}; + } else { + panic(fmt.Sprintf("%v", cur)); + } + } + } + } + fd.Close(); + } + sorted = make([]wordSort, len(words)); + i = 0; + for k,v := range words { + sorted[i].w = k; + sorted[i].root = v; + i++; + } + + sort.Slice(sorted, func(i, j int) bool { + return sorted[i].w < sorted[j].w; + }); + + fd,_ = os.Create("index.dat"); + printIndex(sorted, fd); + fd.Close(); +} -- cgit v1.1