aboutsummaryrefslogtreecommitdiff
path: root/search
diff options
context:
space:
mode:
Diffstat (limited to 'search')
-rw-r--r--search/index/index.go66
-rw-r--r--search/indexer.go152
-rw-r--r--search/search.go127
3 files changed, 299 insertions, 46 deletions
diff --git a/search/index/index.go b/search/index/index.go
index 81b50e0..5d8ab65 100644
--- a/search/index/index.go
+++ b/search/index/index.go
@@ -19,34 +19,34 @@ import "strconv"
* Types *
*********/
-type f_info struct {
- word string;
- in_title bool;
- freq float64;
+type F_info struct { // payload referenced by F_entry.This (one forward-index record)
+ Word string; // presumably the indexed word; no use of this field is visible here — TODO confirm
+ In_title bool; // presumably whether Word occurs in the title — TODO confirm
+ Freq float64; // presumably the frequency of Word in the document — TODO confirm
};
-type i_info struct {
- doc string;
- in_title bool;
- freq float64;
+type I_info struct { // one posting of the inverted index: a document listed under a word
+ Doc string; // document name; NewInvertedIndexFromFile takes it from the first field of a posting line
+ In_title bool; // set when the second field of the posting line is "1"
+ Freq float64; // parsed from the third field of the posting line
};
-type f_entry struct{
- this *f_info;
- next *f_entry;
+type F_entry struct{ // node of a singly linked list of F_info records
+ This *F_info; // record stored in this node
+ Next *F_entry; // following node
};
-type i_entry struct{
- this *i_info;
- next *i_entry;
+type I_entry struct{ // node of a singly linked postings list
+ This *I_info; // posting stored in this node
+ Next *I_entry; // following node; nil at the tail (the file loader appends after the node whose Next is nil)
};
-type F_index map[string]*f_entry;
-type I_index map[string]*i_entry;
+type F_index map[string]*F_entry; // forward index: string key -> head of an F_entry list (presumably keyed by document — TODO confirm)
+type I_index map[string]*I_entry; // inverted index: word -> head of that word's postings list
type sortInverted struct{
w string;
- root *i_entry;
+ root *I_entry; // head of the postings list that PrintToFile writes beneath the word w
};
@@ -54,7 +54,7 @@ type sortInverted struct{
* Forward Index Functions *
***************************/
-func NewForwardEntryStrings(text, title []string) (*f_entry, error) {
+func NewForwardEntryStrings(text, title []string) (*F_entry, error) { // stub: ignores text and title and always returns a nil entry with a "not implemented" error
return nil, errors.New("not implemented");
}
@@ -62,8 +62,8 @@ func NewForwardEntryStrings(text, title []string) (*f_entry, error) {
* Inverted Index Functions *
****************************/
-func new_i_info() *i_info{
- return &i_info{"", false, 0.0};
+func new_I_info() *I_info{ // returns a pointer to a freshly allocated I_info holding only zero values
+ return &I_info{"", false, 0.0}; // positional literal: Doc, In_title, Freq
}
func NewInvertedIndexFromFile(fname string) (I_index, error) {
@@ -71,8 +71,8 @@ func NewInvertedIndexFromFile(fname string) (I_index, error) {
var br *bufio.Reader;
var err error;
var buf []byte;
- var tmp *i_info;
- var cur *i_entry;
+ var tmp *I_info;
+ var cur *I_entry;
var index I_index;
var word string
var info []string;
@@ -90,7 +90,7 @@ func NewInvertedIndexFromFile(fname string) (I_index, error) {
index = make(I_index);
for buf, err = br.ReadBytes('\n'); err != io.EOF; buf, err = br.ReadBytes('\n'){
- tmp = new_i_info();
+ tmp = new_I_info();
if err != nil {
return nil, err;
}
@@ -98,17 +98,17 @@ func NewInvertedIndexFromFile(fname string) (I_index, error) {
word = strings.TrimSpace(string(buf));
} else {
info = strings.Fields(string(buf));
- tmp.doc = info[0];
- tmp.in_title = (info[1] == "1");
- tmp.freq, _ = strconv.ParseFloat(info[2], 32);
+ tmp.Doc = info[0];
+ tmp.In_title = (info[1] == "1");
+ tmp.Freq, _ = strconv.ParseFloat(info[2], 32);
if (index[word] == nil) {
- index[word] = &i_entry{this: tmp, next: nil};
+ index[word] = &I_entry{This: tmp, Next: nil};
} else {
cur = index[word];
- for cur.next != nil {
- cur = cur.next;
+ for cur.Next != nil {
+ cur = cur.Next;
}
- cur.next = &i_entry{this: tmp, next: nil};
+ cur.Next = &I_entry{This: tmp, Next: nil};
}
}
}
@@ -123,15 +123,15 @@ func NewInvertedFromForward(f F_index) (I_index, error) {
func (x I_index) PrintToFile(fd *os.File) error{
var i int;
- var cur *i_entry;
+ var cur *I_entry;
var index []sortInverted;
index = x.sortIndex();
for i = 0; i < len(index); i++ {
fmt.Fprintf(fd, "%s\n", index[i].w);
- for cur = index[i].root; cur != nil; cur = cur.next {
- fmt.Fprintf(fd, "\t%s %d %.3f\n", cur.this.doc, toInt(cur.this.in_title), cur.this.freq);
+ for cur = index[i].root; cur != nil; cur = cur.Next {
+ fmt.Fprintf(fd, "\t%s %d %.3f\n", cur.This.Doc, toInt(cur.This.In_title), cur.This.Freq);
}
}
return nil;
diff --git a/search/indexer.go b/search/indexer.go
index ee36c9a..9550c86 100644
--- a/search/indexer.go
+++ b/search/indexer.go
@@ -16,6 +16,7 @@ type document struct {
fname string;
title []string;
text []string;
+ length int;
}
type index struct {
@@ -34,11 +35,12 @@ type wordList struct {
next *wordList
}
-var r, nonAN, stopWords *regexp.Regexp;
+var r, nonAN *regexp.Regexp;
+var stopWords []*regexp.Regexp;
func newDocument() *document {
- return &document{"" , nil, nil};
+ return &document{"" , nil, nil, 0}; // positional literal: fname, title, text, length — every field starts at its zero value
}
func RemoveNode(r, rn *html.Node) {
@@ -84,6 +86,7 @@ func parseDoc(fd *os.File, f_info os.FileInfo) (*document, error) {
var doc *goquery.Document;
var body, title *goquery.Selection;
var r_doc *document;
+ var i int;
doc, err = goquery.NewDocumentFromReader(fd);
if err != nil {
@@ -103,21 +106,29 @@ func parseDoc(fd *os.File, f_info os.FileInfo) (*document, error) {
text = r.ReplaceAllString(text, "> <");
- t_text = r.ReplaceAllString(text, "> <");
+ t_text = r.ReplaceAllString(t_text, "> <");
text = sanitize.HTML(text);
t_text = sanitize.HTML(t_text);
+ text = strings.ToLower(text);
+ t_text = strings.ToLower(t_text);
+
text = nonAN.ReplaceAllString(text, " ");
t_text = nonAN.ReplaceAllString(t_text, " ");
- text = stopWords.ReplaceAllString(text, "");
- t_text = stopWords.ReplaceAllString(t_text, "");
+ for i = 0; i < len(stopWords); i++ {
+ text = stopWords[i].ReplaceAllString(text, " ");
+ t_text = stopWords[i].ReplaceAllString(t_text, " ");
+ }
r_doc = newDocument();
+
r_doc.fname = f_info.Name();
- r_doc.text = strings.Fields(sanitize.HTML(text));
- r_doc.title = strings.Fields(sanitize.HTML(t_text));
+ r_doc.text = strings.Fields(text);
+ r_doc.title = strings.Fields(t_text);
+ r_doc.length = len(r_doc.text) + len(r_doc.title);
+ fmt.Println(r_doc.length)
return r_doc, nil;
}
@@ -140,7 +151,7 @@ func printIndex(words []wordSort, fd *os.File) {
for cur = words[i].root; cur != nil; cur = cur.next {
fname = cur.this.doc.fname;
t = boolToInt(cur.this.title);
- freq = float64(cur.this.freq) / float64(len(cur.this.doc.text));
+ freq = float64(cur.this.freq) / float64(cur.this.doc.length);
fmt.Fprintf(fd,"\t%s %d %.3f\n", fname, t, freq);
}
@@ -148,10 +159,125 @@ func printIndex(words []wordSort, fd *os.File) {
}
func init() {
+ var err error; // shared by every Compile call below; any compile failure panics at start-up
log.SetOutput(os.Stderr);
- r, _ = regexp.Compile("><");
- nonAN, _ = regexp.Compile("[^a-zA-Z0-9]+");
- stopWords, _ = regexp.Compile("( and\\W)|( a\\W)|( an\\W)|( and\\W)|( are\\W)|( as\\W)|( at\\W)|( be\\W)|( by\\W)|( for\\W)|( from\\W)|( has\\W)|( he\\W)|( in\\W)|( is\\W)|( it\\W)|( its\\W)|( of\\W)|( on\\W)|( that\\W)|( the\\W)|( to\\W)|( was\\W)|( were\\W)|( will\\W)|( with\\W)")
+ r, err = regexp.Compile("><"); // r matches "><"; parseDoc rewrites each match to "> <"
+ if err != nil {
+ panic(err);
+ }
+ nonAN, err = regexp.Compile("[^a-zA-Z0-9]+"); // nonAN matches runs of characters that are not ASCII letters or digits
+ if err != nil {
+ panic(err);
+ }
+ //TODO add func to read in stop words from a file;
+ stopWords = make([]*regexp.Regexp, 26) // one slot per pattern compiled below (indices 0 through 25)
+ if err != nil { // NOTE(review): make does not assign err, which was already checked above, so this branch looks unreachable
+ panic(err);
+ }
+ stopWords[0], err = regexp.Compile("\\W+and\\W+"); // every pattern is the word with one or more non-word characters on both sides
+ if err != nil {
+ panic(err);
+ }
+ stopWords[1], err = regexp.Compile("\\W+a\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[2], err = regexp.Compile("\\W+an\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[3], err = regexp.Compile("\\W+and\\W+"); // NOTE(review): identical to the pattern in stopWords[0]
+ if err != nil {
+ panic(err);
+ }
+ stopWords[4], err = regexp.Compile("\\W+are\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[5], err = regexp.Compile("\\W+as\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[6], err = regexp.Compile("\\W+at\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[7], err = regexp.Compile("\\W+be\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[8], err = regexp.Compile("\\W+by\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[9], err = regexp.Compile("\\W+for\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[10], err = regexp.Compile("\\W+from\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[11], err = regexp.Compile("\\W+has\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[12], err = regexp.Compile("\\W+he\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[13], err = regexp.Compile("\\W+in\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[14], err = regexp.Compile("\\W+is\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[15], err = regexp.Compile("\\W+it\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[16], err = regexp.Compile("\\W+its\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[17], err = regexp.Compile("\\W+of\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[18], err = regexp.Compile("\\W+on\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[19], err = regexp.Compile("\\W+that\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[20], err = regexp.Compile("\\W+the\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[21], err = regexp.Compile("\\W+to\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[22], err = regexp.Compile("\\W+was\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[23], err = regexp.Compile("\\W+were\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[24], err = regexp.Compile("\\W+will\\W+");
+ if err != nil {
+ panic(err);
+ }
+ stopWords[25], err = regexp.Compile("\\W+with\\W+");
+ if err != nil {
+ panic(err);
+ }
}
func main() {
@@ -219,7 +345,7 @@ func main() {
w = strings.ToLower(doc.text[j]);
if words[w] == nil{
- tmp = &index{doc: doc, title: false, freq: 1};
+ tmp = &index{doc: doc, title: false, freq: 0};
words[w] = &wordList{this: tmp, next: nil};
}
@@ -239,7 +365,7 @@ func main() {
w = strings.ToLower(doc.title[j]);
if words[w] == nil{
- tmp = &index{doc: doc, title: true, freq: 1};
+ tmp = &index{doc: doc, title: true, freq: 0};
words[w] = &wordList{this: tmp, next: nil};
}
diff --git a/search/search.go b/search/search.go
index 7905807..9c9bb38 100644
--- a/search/search.go
+++ b/search/search.go
@@ -1,5 +1,132 @@
+/************************************************
+ * README *
+ * In order for search/index to be accessible *
+ * you must link this folder (search) into your *
+ * GOPATH *
+ ************************************************/
+
+
package main
+import "search/index"
+import "os"
+import "fmt"
+import "sort"
+
+type res struct { // one ranked search result
+ doc string; // document name, copied from I_info.Doc
+ score float64; // mean of the document's per-word scores; results are sorted on it, highest first
+};
+
func main() { // loads index.dat, looks up every command-line argument as a word, and prints the matching documents ordered by score
+ var init_index, sIndex index.I_index; // init_index: index loaded from file; sIndex: only the words given as arguments
+ var tmp, results, root *index.I_entry; // tmp: list cursor; results: head node of the match list; root: node currently being filled or visited
+ var tmp_score float64;
+ var scores map[string]map[string]float64; // scores[doc][word] == score
+ var i,j int;
+ var searchBool, perWord, docAdded map[string]bool; //map[doc]bool
+ var resultSort []res;
+ var err error;
+
+ scores = make(map[string]map[string]float64);
+ searchBool = make(map[string]bool);
+ perWord = make(map[string]bool);
+ docAdded = make(map[string]bool);
+
+
+ sIndex = make(index.I_index);
+
+
+
+ init_index, err = index.NewInvertedIndexFromFile("index.dat"); //TODO add flag for filename
+ if err != nil {
+ panic(err)
+ }
+
+ for i = 1; i < len(os.Args); i++ { // every argument after the program name is treated as one query word
+ sIndex[os.Args[i]] = init_index[os.Args[i]] // a word missing from init_index is stored with a nil postings list
+ }
+ // Pass 1: every document listed under any query word becomes a candidate and gets an empty score map.
+ for _, v := range sIndex {
+ for tmp = v; tmp != nil; tmp = tmp.Next {
+ perWord[tmp.This.Doc] = true; // NOTE(review): these marks are still set when the filtering loop below starts (perWord is only reset at the end of each of its iterations), so the first word visited there cannot rule out any document; which word is first depends on map iteration order — looks unintended, confirm
+ searchBool[tmp.This.Doc] = true;
+ scores[tmp.This.Doc] = make(map[string]float64);
+ }
+ }
+ // Pass 2: for each query word, clear the candidate flag of documents that are not marked in perWord.
+ for _, v := range sIndex {
+ for tmp = v; tmp != nil; tmp = tmp.Next {
+ perWord[tmp.This.Doc] = true; // mark the documents listed under the current word
+ }
+
+ for d := range searchBool {
+ if _, o := perWord[d]; !o { // d has no mark for this word
+ searchBool[d] = false;
+ }
+ }
+ perWord = make(map[string]bool); // start with an empty set for the next word
+ }
+ // Pass 3: per-word score for each remaining candidate: 0.9 * Freq, plus 0.1 when the word is in the title.
+ for k, v := range sIndex {
+ for tmp = v; tmp != nil; tmp = tmp.Next {
+ if searchBool[tmp.This.Doc] {
+ if tmp.This.In_title {
+ tmp_score = 1.0;
+ } else {
+ tmp_score = 0.0;
+ }
+
+ scores[tmp.This.Doc][k] = (0.9 * tmp.This.Freq) + (0.1 * tmp_score);
+ }
+ }
+
+ }
+
+ i = 0;
+ results = &index.I_entry{nil, nil} // head node; its This stays nil and only its Next is used
+ root = &index.I_entry{nil, nil}; // first node to fill; the list always ends in one empty node
+ results.Next = root;
+
+ j = 0; // counts collected postings; a document listed under several words is counted once per word
+ // Pass 4: chain every posting whose document is still a candidate.
+ for _ ,v := range sIndex {
+ for tmp = v; tmp != nil; tmp = tmp.Next {
+ if (searchBool[tmp.This.Doc]) {
+ root.This = tmp.This;
+ docAdded[root.This.Doc] = false;
+ root.Next = &index.I_entry{nil, nil};
+ root = root.Next;
+ j++
+ }
+ }
+ }
+
+ resultSort = make([]res, j); // j is an upper bound on the number of distinct documents; trimmed to i below
+
+ i = 0;
+ for root = results.Next; root.Next != nil; root = root.Next { // stops at the trailing empty node, whose Next is nil
+ if (!docAdded[root.This.Doc]) { // emit each document once even if it was chained several times
+ j = 0;
+ tmp_score = 0;
+ for _ ,v := range scores[root.This.Doc] {
+ tmp_score += v;
+ j++;
+ }
+ tmp_score /= float64(j); // mean of the per-word scores recorded for this document
+ resultSort[i] = res{root.This.Doc, tmp_score};
+ docAdded[root.This.Doc] = true;
+ i++;
+ }
+ }
+ resultSort = resultSort[:i]; // keep only the i distinct documents that were filled in
+ sort.Slice(resultSort, func(i, j int) bool {
+ return resultSort[i].score > resultSort[j].score; // descending by score
+ });
+
+ fmt.Printf("Results: %d\n", len(resultSort));
+ for i = 0; i < len(resultSort); i++ {
+ fmt.Printf("\t%d. Doc: %s, Score: %.3f\n", i, resultSort[i].doc, resultSort[i].score); // the printed number is the zero-based position in the sorted slice
+ }
}