aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Tucker Evans <tuckerevans24@gmail.com> 2017-12-11 05:54:55 -0500
committer: Tucker Evans <tuckerevans24@gmail.com> 2017-12-17 13:20:31 -0500
commit: 7e1216bde15e1ff7bd5f0359701bd9443f47c659 (patch)
tree: 4c91c207ec0d489aad8f134a4884e376abe06deb
parent: f36e8a4ad0a22672a03c2c6dc58f68215dfeef22 (diff)
CSC2621/assignments/search: Started an index package for indexer & searcher
-rw-r--r-- search/index.go 344
-rw-r--r-- search/indexer.go 277
2 files changed, 385 insertions, 236 deletions
diff --git a/search/index.go b/search/index.go
index ee36c9a..b7dd453 100644
--- a/search/index.go
+++ b/search/index.go
@@ -1,277 +1,149 @@
-package main
+package index
import "os"
import "sort"
-import "golang.org/x/net/html"
-import "log"
-import "fmt"
-import "github.com/PuerkitoBio/goquery"
-import "github.com/kennygrant/sanitize"
-import "strings"
-import "flag"
import "errors"
-import "regexp"
+import "strings"
+import "strconv"
-type document struct {
- fname string;
- title []string;
- text []string;
-}
+/* TODO
-type index struct {
- doc *document;
- title bool;
- freq int;
-}
+ - Implement Forward Creation
+ - Implement Inverted from Forward
+ - Switch Indexer.go over to this package
+ */
-type wordSort struct {
- w string;
- root *wordList;
-}
+/*********
+ * Types *
+ *********/
-type wordList struct {
- this *index
- next *wordList
-}
+type f_info struct { // f_info: per-word record for the forward index; no code in view uses it yet, so field meanings below are read from the names
+ word string; // the word this record describes
+ in_title bool; // presumably true when the word occurs in the document title — confirm once forward creation is implemented
+ freq float32; // frequency of the word; presumably relative to document length like i_info.freq — confirm
+};
-var r, nonAN, stopWords *regexp.Regexp;
+type i_info struct { // i_info: one document's record for a word; PrintToFile writes it as "\t<doc> <in_title as 0/1> <freq>"
+ doc string; // document identifier, written with %s by PrintToFile (presumably the file name — confirm)
+ in_title bool; // written as 1 or 0 through toInt by PrintToFile
+ freq float32; // written with %.3f by PrintToFile
+};
+// f_entry is a node in a singly linked list of f_info records.
+//
+// The original declaration omitted the struct keyword and typed next as
+// *f_info, which would make it impossible to chain nodes; next now points at
+// the following node, mirroring i_entry.
+type f_entry struct {
+	this *f_info  // record held by this node
+	next *f_entry // following node, nil at the end of the list
+}
-func newDocument() *document {
- return &document{"" , nil, nil};
-}
+// i_entry is a node in the singly linked posting list kept for each word of
+// the inverted index.
+//
+// The original declaration omitted the struct keyword and typed next as
+// *i_info; NewInvertedIndexFromFile assigns an *i_entry to next and
+// PrintToFile walks the list with cur = cur.next, so next must be *i_entry.
+type i_entry struct {
+	this *i_info  // record held by this node
+	next *i_entry // following node, nil at the end of the list
+}
-func RemoveNode(r, rn *html.Node) {
- var found bool;
- var n, item *html.Node;
- var nodes map[int]*html.Node;
- var i, j int;
+// f_index is the forward index: a map from a string key to its f_entry.
+// NOTE(review): the key is presumably the document name — confirm once
+// forward creation is implemented.
+type f_index map[string]f_entry
+
+// i_index is the inverted index: a map from a word to the head of that word's
+// posting list. It was declared as map[string]i_index (a map of itself), but
+// sortIndex stores the map values in sortInverted.root, which is *i_entry.
+type i_index map[string]*i_entry
- found = false;
- nodes = make(map[int]*html.Node);
+// sortInverted pairs a word with the head of its posting list so that the
+// entries of an i_index can be placed in a slice and sorted by word.
+// The original declaration omitted the struct keyword.
+type sortInverted struct {
+	w    string   // the word; sortIndex orders the slice on this field
+	root *i_entry // head of the word's posting list
+}
- for n = r.FirstChild; n != nil; n = n.NextSibling {
- if n == rn {
- found = true;
- n.Parent.RemoveChild(n);
- }
- nodes[i] = n;
- i++;
- }
+/***************************
+ * Forward Index Functions *
+ ***************************/
- if !found {
- for j = 0; j < i; j++ {
- item = nodes[j];
- RemoveNode(item, rn);
- }
- }
-}
-func RemoveTag(doc *goquery.Selection, tag string) {
- doc.Find(tag).Each(func(i int, s *goquery.Selection) {
- RemoveNode(doc.Get(0), s.Get(0));
- });
-}
+func NewForwardEntryStrings(text, title []string) *f_entry, error{
-func logReg(h []byte) []byte {
- log.Printf("RegExp: %s", h);
- return h;
}
-func parseDoc(fd *os.File, f_info os.FileInfo) (*document, error) {
- var err error;
- var text, t_text string;
- var doc *goquery.Document;
- var body, title *goquery.Selection;
- var r_doc *document;
+/****************************
+ * Inverted Index Functions *
+ ****************************/
- doc, err = goquery.NewDocumentFromReader(fd);
+func NewInvertedIndexFromFile(fname string) i_index, error{
+ var fd *os.File;
+ var br *bufio.Reader;
+ var err error;
+ var buf []byte;
+ var tmp i_info;
+ var cur *i_info;
+ var index i_index;
+ var word string
+ var info []string;
+
+ fd, err = os.Open(fname);
if err != nil {
- log.Printf("goquery error: %s\n", err);
- return nil, errors.New("Can't create goquery documnt");
+ return nil, err;
}
- body = doc.Find("body");
- RemoveTag(body, "script");
- RemoveTag(body, "noscript");
-
- title = doc.Find("title");
-
- //TODO add error detection
- text, err = body.Html();
- t_text, err = title.Html();
-
-
- text = r.ReplaceAllString(text, "> <");
- t_text = r.ReplaceAllString(text, "> <");
-
- text = sanitize.HTML(text);
- t_text = sanitize.HTML(t_text);
+ br, err = bufio.NewReader(fd);
+ if err != nil {
+ return nil, err;
+ }
- text = nonAN.ReplaceAllString(text, " ");
- t_text = nonAN.ReplaceAllString(t_text, " ");
+ for err != io.EOF {
+ buf, err = br.ReadBytes('\n');
+ if buf[0] != '\t' {
+ word = strings.Trim(string(buf));
+ } else {
+ tmp = i_info{nil, false, 0.0};
+ info strings.Field(string(buf));
+ tmp.String = info[0];
+ tmp.in_title = (info[1] == 1);
+ tmp.freq = strconv.ParseFloat(info[2], 32);
+ if (index[word] == nil) {
+ index[word] = &tmp;
+ } else {
+ cur = index[word];
+ for cur.next != nil {
+ cur = cur.next;
+ }
+ cur.next = &i_entry{this: &tmp, next: nil};
+ }
+ }
+ }
- text = stopWords.ReplaceAllString(text, "");
- t_text = stopWords.ReplaceAllString(t_text, "");
+ return index;
+}
- r_doc = newDocument();
- r_doc.fname = f_info.Name();
- r_doc.text = strings.Fields(sanitize.HTML(text));
- r_doc.title = strings.Fields(sanitize.HTML(t_text));
+func NewInvertedFromForward(f f_index) i_index, error {
- return r_doc, nil;
-}
-func boolToInt(t bool) int {
- if t {
- return 1;
- }
- return 0;
}
-func printIndex(words []wordSort, fd *os.File) {
+func (x i_index) PrintToFile(fd *os.File) error{
var i int;
- var cur *wordList;
- var fname string;
- var t int;
- var freq float64;
-
- for i = 0; i < len(words); i++ {
- fmt.Fprintf(fd, "%s\n", words[i].w);
- for cur = words[i].root; cur != nil; cur = cur.next {
- fname = cur.this.doc.fname;
- t = boolToInt(cur.this.title);
- freq = float64(cur.this.freq) / float64(len(cur.this.doc.text));
-
- fmt.Fprintf(fd,"\t%s %d %.3f\n", fname, t, freq);
+ var cur *i_entry;
+ var index []sortInverted;
+
+ index = x.sortIndex();
+
+ for i = 0; i < len(index); i++ {
+ fmt.Fprintf(fd, "%s\n", index[i].w);
+ for cur = index[i].root; cur != nil; cur = cur.next {
+ fmt.Fprintf(fd, "\t%s %d %.3f", cur.this.doc, toInt(cur.this.in_title), cur.this.freq);
}
}
}
-func init() {
- log.SetOutput(os.Stderr);
- r, _ = regexp.Compile("><");
- nonAN, _ = regexp.Compile("[^a-zA-Z0-9]+");
- stopWords, _ = regexp.Compile("( and\\W)|( a\\W)|( an\\W)|( and\\W)|( are\\W)|( as\\W)|( at\\W)|( be\\W)|( by\\W)|( for\\W)|( from\\W)|( has\\W)|( he\\W)|( in\\W)|( is\\W)|( it\\W)|( its\\W)|( of\\W)|( on\\W)|( that\\W)|( the\\W)|( to\\W)|( was\\W)|( were\\W)|( will\\W)|( with\\W)")
+// toInt converts a bool to 1 (true) or 0 (false). PrintToFile uses it to
+// write the in_title flag as a digit.
+// The original if statement had no braces, which Go does not accept.
+func toInt(t bool) int {
+	if t {
+		return 1
+	}
+	return 0
 }
-func main() {
- // var words map[string]index
- var p_dir, w, fname string;
- var err error;
- var i, j int;
- var words map[string]*wordList;
- var cur *wordList;
- var tmp *index;
- var sorted []wordSort;
-
- var files []os.FileInfo;
- var dir, fd *os.File;
- var dir_info, fd_info os.FileInfo;
- var dir_mode os.FileMode;
-
- var doc *document;
-
- flag.StringVar(&p_dir, "d", "./pages", "pages directory");
-
- flag.Parse();
-
- words = make(map[string]*wordList);
-
- dir, err = os.Open(p_dir);
- if err != nil {
- log.Printf("Error accessing \"%s\":\t%s\n", p_dir, err);
- os.Exit(1);
- }
-
- dir_info, err = dir.Stat();
- dir_mode = dir_info.Mode();
+func (unsort i_index) sortIndex() []sortInverted {
+ var i int;
+ var sorted []sortInverted;
- if !dir_mode.IsDir() {
- log.Printf("\"%s\" is not a directory\n", p_dir);
- os.Exit(1);
- }
+ sorted = make([]sortInverted, len(unsort));
- files, err = dir.Readdir(0);
- if err != nil {
- log.Printf("Error reading %s\n", p_dir);
- os.Exit(1);
+ i = 0;
+ for k, v := range unsort {
+ sorted[i].w = k;
+ sorted[i].root = v;
+ i++;
}
- for i = 0; i < len(files); i++ {
- fd, err = os.Open(fmt.Sprintf("%s/%s", dir_info.Name(), files[i].Name()));
- fd_info, err = fd.Stat();
- if err != nil {
- log.Printf("Error getting info\n");
- os.Exit(1);
- }
- fname = fd_info.Name();
-
- if err != nil {
- log.Printf("Error reading %s/%s\n", dir_info.Name(), files[i].Name());
- } else {
- fmt.Printf("Indexing %s...\n", fname);
- doc, err = parseDoc(fd, fd_info);
- if err != nil {
- log.Printf("Error parsing %s/%s\n", dir_info.Name(), files[i].Name());
- } else {
- /* Text */
- for j = 0; j < len(doc.text); j++ {
- w = strings.ToLower(doc.text[j]);
-
- if words[w] == nil{
- tmp = &index{doc: doc, title: false, freq: 1};
- words[w] = &wordList{this: tmp, next: nil};
- }
-
- for cur = words[w];cur.next != nil && cur.this.doc.fname != fname; cur = cur.next{}
-
- if cur.this.doc.fname == fname {
- cur.this.freq++
- } else if cur.next == nil {
- tmp = &index{doc: doc, title: false, freq: 1};
- cur.next = &wordList{this: tmp, next: nil};
- } else {
- panic(fmt.Sprintf("%v", cur));
- }
- }
- /* Title */
- for j = 0; j < len(doc.title); j++ {
- w = strings.ToLower(doc.title[j]);
-
- if words[w] == nil{
- tmp = &index{doc: doc, title: true, freq: 1};
- words[w] = &wordList{this: tmp, next: nil};
- }
-
- for cur = words[w];cur.next != nil && cur.this.doc.fname != fname; cur = cur.next{}
-
- if cur.this.doc.fname == fname {
- cur.this.title = true;
- cur.this.freq++;
- } else if cur.next == nil {
- tmp = &index{doc: doc, title: true, freq: 1};
- cur.next = &wordList{this: tmp, next: nil};
- } else {
- panic(fmt.Sprintf("%v", cur));
- }
- }
- }
- }
- fd.Close();
- }
- sorted = make([]wordSort, len(words));
- i = 0;
- for k,v := range words {
- sorted[i].w = k;
- sorted[i].root = v;
- i++;
- }
-
- sort.Slice(sorted, func(i, j int) bool {
- return sorted[i].w < sorted[j].w;
- });
-
- fd,_ = os.Create("index.dat");
- printIndex(sorted, fd);
- fd.Close();
+ sort.Slice(sorted, func(i, j int) bool {
+ return sorted[i].w < sorted[j].w;
+ });
}
diff --git a/search/indexer.go b/search/indexer.go
new file mode 100644
index 0000000..ee36c9a
--- /dev/null
+++ b/search/indexer.go
@@ -0,0 +1,277 @@
+package main
+
+import "os"
+import "sort"
+import "golang.org/x/net/html"
+import "log"
+import "fmt"
+import "github.com/PuerkitoBio/goquery"
+import "github.com/kennygrant/sanitize"
+import "strings"
+import "flag"
+import "errors"
+import "regexp"
+
+type document struct { // document is one parsed page, as built by parseDoc
+ fname string; // file name, set from os.FileInfo.Name() in parseDoc
+ title []string; // title words stored by parseDoc (strings.Fields of the cleaned title text)
+ text []string; // body words stored by parseDoc (strings.Fields of the cleaned body text)
+}
+
+type index struct { // index is one posting: the relation between a single word and a single document
+ doc *document; // the document this posting refers to
+ title bool; // set by main when the word is seen while indexing the document's title words
+ freq int; // occurrence counter incremented by main; printIndex divides it by len(doc.text)
+}
+
+type wordSort struct { // wordSort pairs a word with its posting list so main can sort the index by word
+ w string; // the word; main sorts the slice on this field
+ root *wordList; // head of the word's posting list
+}
+
+type wordList struct { // wordList is a singly linked list node holding the postings of one word
+ this *index // posting stored in this node
+ next *wordList // following node, nil at the tail
+}
+
+var r, nonAN, stopWords *regexp.Regexp; // compiled in init: r matches "><", nonAN matches runs of non-alphanumeric characters, stopWords matches a fixed list of common English words
+
+
+func newDocument() *document { // newDocument returns a pointer to an empty document: empty file name, nil title and nil text
+ return &document{"" , nil, nil};
+}
+
+func RemoveNode(r, rn *html.Node) { // RemoveNode searches the tree below r for the node rn and detaches it from its parent
+ var found bool;
+ var n, item *html.Node;
+ var nodes map[int]*html.Node; // children of r in visit order, keyed 0..i-1
+ var i, j int;
+
+ found = false;
+ nodes = make(map[int]*html.Node);
+
+ for n = r.FirstChild; n != nil; n = n.NextSibling { // scan only the direct children of r
+ if n == rn {
+ found = true;
+ n.Parent.RemoveChild(n); // NOTE(review): the loop then advances through n.NextSibling of the detached node — confirm this ends the scan as intended
+ }
+
+ nodes[i] = n;
+ i++;
+ }
+
+ if !found { // rn is not a direct child: recurse into every child collected above
+ for j = 0; j < i; j++ {
+ item = nodes[j];
+ RemoveNode(item, rn);
+ }
+ }
+}
+func RemoveTag(doc *goquery.Selection, tag string) { // RemoveTag calls RemoveNode for every element of doc that matches the selector tag
+ doc.Find(tag).Each(func(i int, s *goquery.Selection) {
+ RemoveNode(doc.Get(0), s.Get(0)); // doc.Get(0) is the search root, s.Get(0) the matched node to detach
+ });
+}
+
+func logReg(h []byte) []byte { // logReg logs h with a "RegExp: " prefix and returns it unchanged; nothing in this file calls it (presumably a debugging aid — confirm before removing)
+ log.Printf("RegExp: %s", h);
+ return h;
+}
+
+func parseDoc(fd *os.File, f_info os.FileInfo) (*document, error) {
+ var err error;
+ var text, t_text string;
+ var doc *goquery.Document;
+ var body, title *goquery.Selection;
+ var r_doc *document;
+
+ doc, err = goquery.NewDocumentFromReader(fd);
+ if err != nil {
+ log.Printf("goquery error: %s\n", err);
+ return nil, errors.New("Can't create goquery documnt");
+ }
+
+ body = doc.Find("body");
+ RemoveTag(body, "script");
+ RemoveTag(body, "noscript");
+
+ title = doc.Find("title");
+
+ //TODO add error detection
+ text, err = body.Html();
+ t_text, err = title.Html();
+
+
+ text = r.ReplaceAllString(text, "> <");
+ t_text = r.ReplaceAllString(text, "> <");
+
+ text = sanitize.HTML(text);
+ t_text = sanitize.HTML(t_text);
+
+ text = nonAN.ReplaceAllString(text, " ");
+ t_text = nonAN.ReplaceAllString(t_text, " ");
+
+ text = stopWords.ReplaceAllString(text, "");
+ t_text = stopWords.ReplaceAllString(t_text, "");
+
+ r_doc = newDocument();
+ r_doc.fname = f_info.Name();
+ r_doc.text = strings.Fields(sanitize.HTML(text));
+ r_doc.title = strings.Fields(sanitize.HTML(t_text));
+
+ return r_doc, nil;
+}
+func boolToInt(t bool) int { // boolToInt converts a bool to 1 (true) or 0 (false); printIndex uses it to write the title flag
+ if t {
+ return 1;
+ }
+ return 0;
+}
+
+func printIndex(words []wordSort, fd *os.File) { // printIndex writes the index to fd: one line per word followed by one tab-indented line per posting
+ var i int;
+ var cur *wordList;
+ var fname string;
+ var t int;
+ var freq float64;
+
+ for i = 0; i < len(words); i++ { // words are written in slice order; main sorts the slice before calling
+ fmt.Fprintf(fd, "%s\n", words[i].w);
+ for cur = words[i].root; cur != nil; cur = cur.next { // walk this word's posting list
+ fname = cur.this.doc.fname;
+ t = boolToInt(cur.this.title);
+ freq = float64(cur.this.freq) / float64(len(cur.this.doc.text)); // count relative to the number of body words; NOTE(review): a document with no body words makes the divisor 0
+
+ fmt.Fprintf(fd,"\t%s %d %.3f\n", fname, t, freq); // posting line: file name, title flag (0/1), frequency to three decimals
+ }
+ }
+}
+
+// init sends log output to stderr and compiles the package-level patterns
+// used by parseDoc:
+//
+//   - r matches "><", the boundary between two adjacent tags;
+//   - nonAN matches a run of characters other than ASCII letters and digits;
+//   - stopWords matches a listed common word preceded by a space and followed
+//     by a non-word character.
+//
+// MustCompile replaces Compile with a discarded error: a malformed pattern
+// now panics at start-up instead of leaving a nil *regexp.Regexp behind to
+// fail later inside parseDoc.
+func init() {
+	log.SetOutput(os.Stderr)
+	r = regexp.MustCompile("><")
+	nonAN = regexp.MustCompile("[^a-zA-Z0-9]+")
+	stopWords = regexp.MustCompile("( and\\W)|( a\\W)|( an\\W)|( and\\W)|( are\\W)|( as\\W)|( at\\W)|( be\\W)|( by\\W)|( for\\W)|( from\\W)|( has\\W)|( he\\W)|( in\\W)|( is\\W)|( it\\W)|( its\\W)|( of\\W)|( on\\W)|( that\\W)|( the\\W)|( to\\W)|( was\\W)|( were\\W)|( will\\W)|( with\\W)")
+}
+
+// addWord records one occurrence of w in doc within the inverted index words.
+//
+// If doc already has a posting for w its counter is incremented (and its
+// title flag is set when inTitle is true); otherwise a new posting with a
+// count of 1 is appended to the word's list. Documents are told apart by
+// their file name.
+func addWord(words map[string]*wordList, w string, doc *document, inTitle bool) {
+	cur := words[w]
+	if cur == nil {
+		// First time the word is seen anywhere: start its posting list.
+		// Returning here is the fix for the old double count, where the
+		// fresh posting (freq 1) was immediately incremented again.
+		words[w] = &wordList{this: &index{doc: doc, title: inTitle, freq: 1}}
+		return
+	}
+
+	// Stop on this document's posting, or on the tail when it has none.
+	for cur.next != nil && cur.this.doc.fname != doc.fname {
+		cur = cur.next
+	}
+
+	if cur.this.doc.fname == doc.fname {
+		cur.this.freq++
+		if inTitle {
+			cur.this.title = true
+		}
+		return
+	}
+	cur.next = &wordList{this: &index{doc: doc, title: inTitle, freq: 1}}
+}
+
+// main indexes every file in the pages directory (flag -d, default
+// "./pages") and writes the resulting inverted index, sorted by word, to
+// index.dat in the current directory.
+//
+// A file that cannot be opened or parsed is logged and skipped. Failing to
+// read the directory, to stat an opened file, or to create index.dat ends the
+// program with exit status 1.
+func main() {
+	var p_dir string
+
+	flag.StringVar(&p_dir, "d", "./pages", "pages directory")
+	flag.Parse()
+
+	words := make(map[string]*wordList)
+
+	dir, err := os.Open(p_dir)
+	if err != nil {
+		log.Printf("Error accessing \"%s\":\t%s\n", p_dir, err)
+		os.Exit(1)
+	}
+
+	// The Stat error used to be ignored, which risked a nil dereference below.
+	dir_info, err := dir.Stat()
+	if err != nil {
+		log.Printf("Error accessing \"%s\":\t%s\n", p_dir, err)
+		os.Exit(1)
+	}
+	if !dir_info.Mode().IsDir() {
+		log.Printf("\"%s\" is not a directory\n", p_dir)
+		os.Exit(1)
+	}
+
+	files, err := dir.Readdir(0)
+	dir.Close()
+	if err != nil {
+		log.Printf("Error reading %s\n", p_dir)
+		os.Exit(1)
+	}
+
+	for i := 0; i < len(files); i++ {
+		// Build the path from the directory the user passed, not from
+		// dir_info.Name(): the base name only resolves when the pages
+		// directory sits directly inside the working directory.
+		path := fmt.Sprintf("%s/%s", p_dir, files[i].Name())
+
+		// Check the Open error before it can be overwritten by Stat.
+		fd, err := os.Open(path)
+		if err != nil {
+			log.Printf("Error reading %s\n", path)
+			continue
+		}
+
+		fd_info, err := fd.Stat()
+		if err != nil {
+			log.Printf("Error getting info\n")
+			os.Exit(1)
+		}
+
+		fmt.Printf("Indexing %s...\n", fd_info.Name())
+		doc, err := parseDoc(fd, fd_info)
+		fd.Close()
+		if err != nil {
+			log.Printf("Error parsing %s\n", path)
+			continue
+		}
+
+		/* Text */
+		for j := 0; j < len(doc.text); j++ {
+			addWord(words, strings.ToLower(doc.text[j]), doc, false)
+		}
+		/* Title */
+		for j := 0; j < len(doc.title); j++ {
+			addWord(words, strings.ToLower(doc.title[j]), doc, true)
+		}
+	}
+
+	// Map iteration order is random, so collect the words and sort them to
+	// make the output file deterministic.
+	sorted := make([]wordSort, 0, len(words))
+	for k, v := range words {
+		sorted = append(sorted, wordSort{w: k, root: v})
+	}
+	sort.Slice(sorted, func(i, j int) bool {
+		return sorted[i].w < sorted[j].w
+	})
+
+	out, err := os.Create("index.dat")
+	if err != nil {
+		log.Printf("Error creating index.dat:\t%s\n", err)
+		os.Exit(1)
+	}
+	printIndex(sorted, out)
+	if err = out.Close(); err != nil {
+		log.Printf("Error writing index.dat:\t%s\n", err)
+		os.Exit(1)
+	}
+}