about | summary | refs | log | tree | commit | diff
path: root/search/indexer.go
diff options
context:
space:
mode:
Diffstat (limited to 'search/indexer.go')
-rw-r--r-- search/indexer.go 402
1 files changed, 0 insertions, 402 deletions
diff --git a/search/indexer.go b/search/indexer.go
deleted file mode 100644
index d95f126..0000000
--- a/search/indexer.go
+++ /dev/null
@@ -1,402 +0,0 @@
-package main
-
-import "os"
-import "sort"
-import "golang.org/x/net/html"
-import "log"
-import "fmt"
-import "github.com/PuerkitoBio/goquery"
-import "github.com/kennygrant/sanitize"
-import "strings"
-import "flag"
-import "errors"
-import "regexp"
-
-// Data model of the inverted index built by this program.
-type (
-	// document holds the words extracted from one parsed HTML page.
-	document struct {
-		fname  string   // name of the source file, as reported by its FileInfo
-		title  []string // words taken from the <title> element
-		text   []string // words taken from the <body> element
-		length int      // len(text) + len(title)
-	}
-
-	// index is a single posting: the occurrences of one word in one document.
-	index struct {
-		doc   *document // document the word occurs in
-		title bool      // true when the word occurs in the document's title
-		freq  int       // occurrences counted across body text and title
-	}
-
-	// wordSort pairs an indexed word with the head of its posting list so
-	// that the index can be sorted by word before it is written out.
-	wordSort struct {
-		w    string    // the indexed word
-		root *wordList // first node of the word's posting list
-	}
-
-	// wordList is one node of the singly linked posting list kept per word.
-	wordList struct {
-		this *index    // posting stored in this node
-		next *wordList // following node, nil at the end of the list
-	}
-)
-
-// Regular expressions shared by the parsing code; init compiles all of them.
-var (
-	// r matches the "><" found between two directly adjacent HTML tags.
-	r *regexp.Regexp
-	// nonAN matches runs of characters other than ASCII letters and digits.
-	nonAN *regexp.Regexp
-	// stopWords holds one pattern per stop word removed from parsed text.
-	stopWords []*regexp.Regexp
-)
-
-
-// newDocument returns a pointer to an empty document: no file name, nil
-// title and text slices, and a length of zero.
-func newDocument() *document {
-	return new(document)
-}
-
-// RemoveNode detaches rn from the tree below r.
-//
-// The direct children of r are examined first. Only when rn is not one of
-// them is each child searched recursively in turn. If rn is not a
-// descendant of r the tree is left untouched.
-func RemoveNode(r, rn *html.Node) {
-	var children []*html.Node
-
-	for child := r.FirstChild; child != nil; child = child.NextSibling {
-		if child == rn {
-			// Found as a direct child: detach it and stop searching.
-			child.Parent.RemoveChild(child)
-			return
-		}
-		children = append(children, child)
-	}
-
-	// Not a direct child; look one level deeper under every child.
-	for _, child := range children {
-		RemoveNode(child, rn)
-	}
-}
-// RemoveTag detaches every node matched by the selector tag from the tree
-// rooted at the first node of doc. It does nothing when tag matches nothing.
-func RemoveTag(doc *goquery.Selection, tag string) {
-	matches := doc.Find(tag)
-	matches.Each(func(_ int, match *goquery.Selection) {
-		root := doc.Get(0)
-		RemoveNode(root, match.Get(0))
-	})
-}
-
-// logReg writes h to the log with a "RegExp: " prefix and hands the same
-// slice back to the caller unchanged.
-func logReg(h []byte) []byte {
-	log.Printf("RegExp: %s", h)
-	return h
-}
-
-func parseDoc(fd *os.File, f_info os.FileInfo) (*document, error) {
- var err error;
- var text, t_text string;
- var doc *goquery.Document;
- var body, title *goquery.Selection;
- var r_doc *document;
- var i int;
-
- doc, err = goquery.NewDocumentFromReader(fd);
- if err != nil {
- log.Printf("goquery error: %s\n", err);
- return nil, errors.New("Can't create goquery documnt");
- }
-
- body = doc.Find("body");
- RemoveTag(body, "script");
- RemoveTag(body, "noscript");
-
- title = doc.Find("title");
-
- //TODO add error detection
- text, err = body.Html();
- t_text, err = title.Html();
-
-
- text = r.ReplaceAllString(text, "> <");
- t_text = r.ReplaceAllString(t_text, "> <");
-
- text = sanitize.HTML(text);
- t_text = sanitize.HTML(t_text);
-
- text = strings.ToLower(text);
- t_text = strings.ToLower(t_text);
-
- text = nonAN.ReplaceAllString(text, " ");
- t_text = nonAN.ReplaceAllString(t_text, " ");
-
-
- for i = 0; i < len(stopWords); i++ {
- text = stopWords[i].ReplaceAllString(text, " ");
- t_text = stopWords[i].ReplaceAllString(t_text, " ");
- }
- r_doc = newDocument();
-
- r_doc.fname = f_info.Name();
- r_doc.text = strings.Fields(text);
- r_doc.title = strings.Fields(t_text);
- r_doc.length = len(r_doc.text) + len(r_doc.title);
-
- return r_doc, nil;
-}
-// boolToInt maps true to 1 and false to 0.
-func boolToInt(t bool) int {
-	n := 0
-	if t {
-		n = 1
-	}
-	return n
-}
-
-func printIndex(words []wordSort, fd *os.File) {
- var i int;
- var cur *wordList;
- var fname string;
- var t int;
- var freq float64;
-
- for i = 0; i < len(words); i++ {
- fmt.Fprintf(fd, "%s\n", words[i].w);
- for cur = words[i].root; cur != nil; cur = cur.next {
- fname = cur.this.doc.fname;
- t = boolToInt(cur.this.title);
- freq = float64(cur.this.freq) / float64(cur.this.doc.length);
-
- fmt.Fprintf(fd,"\t%s %d %.3f\n", fname, t, freq);
- }
- }
-}
-
-func init() {
- var err error;
- log.SetOutput(os.Stderr);
- r, err = regexp.Compile("><");
- if err != nil {
- panic(err);
- }
- nonAN, err = regexp.Compile("[^a-zA-Z0-9]+");
- if err != nil {
- panic(err);
- }
- //TODO add func to read in stop words from a file;
- stopWords = make([]*regexp.Regexp, 26)
- if err != nil {
- panic(err);
- }
- stopWords[0], err = regexp.Compile("\\W+and\\W+");
- if err != nil {
- panic(err);
- }
- stopWords[1], err = regexp.Compile("\\W+a\\W+");
- if err != nil {
- panic(err);
- }
- stopWords[2], err = regexp.Compile("\\W+an\\W+");
- if err != nil {
- panic(err);
- }
- stopWords[3], err = regexp.Compile("\\W+and\\W+");
- if err != nil {
- panic(err);
- }
- stopWords[4], err = regexp.Compile("\\W+are\\W+");
- if err != nil {
- panic(err);
- }
- stopWords[5], err = regexp.Compile("\\W+as\\W+");
- if err != nil {
- panic(err);
- }
- stopWords[6], err = regexp.Compile("\\W+at\\W+");
- if err != nil {
- panic(err);
- }
- stopWords[7], err = regexp.Compile("\\W+be\\W+");
- if err != nil {
- panic(err);
- }
- stopWords[8], err = regexp.Compile("\\W+by\\W+");
- if err != nil {
- panic(err);
- }
- stopWords[9], err = regexp.Compile("\\W+for\\W+");
- if err != nil {
- panic(err);
- }
- stopWords[10], err = regexp.Compile("\\W+from\\W+");
- if err != nil {
- panic(err);
- }
- stopWords[11], err = regexp.Compile("\\W+has\\W+");
- if err != nil {
- panic(err);
- }
- stopWords[12], err = regexp.Compile("\\W+he\\W+");
- if err != nil {
- panic(err);
- }
- stopWords[13], err = regexp.Compile("\\W+in\\W+");
- if err != nil {
- panic(err);
- }
- stopWords[14], err = regexp.Compile("\\W+is\\W+");
- if err != nil {
- panic(err);
- }
- stopWords[15], err = regexp.Compile("\\W+it\\W+");
- if err != nil {
- panic(err);
- }
- stopWords[16], err = regexp.Compile("\\W+its\\W+");
- if err != nil {
- panic(err);
- }
- stopWords[17], err = regexp.Compile("\\W+of\\W+");
- if err != nil {
- panic(err);
- }
- stopWords[18], err = regexp.Compile("\\W+on\\W+");
- if err != nil {
- panic(err);
- }
- stopWords[19], err = regexp.Compile("\\W+that\\W+");
- if err != nil {
- panic(err);
- }
- stopWords[20], err = regexp.Compile("\\W+the\\W+");
- if err != nil {
- panic(err);
- }
- stopWords[21], err = regexp.Compile("\\W+to\\W+");
- if err != nil {
- panic(err);
- }
- stopWords[22], err = regexp.Compile("\\W+was\\W+");
- if err != nil {
- panic(err);
- }
- stopWords[23], err = regexp.Compile("\\W+were\\W+");
- if err != nil {
- panic(err);
- }
- stopWords[24], err = regexp.Compile("\\W+will\\W+");
- if err != nil {
- panic(err);
- }
- stopWords[25], err = regexp.Compile("\\W+with\\W+");
- if err != nil {
- panic(err);
- }
-}
-
-// main indexes every file in the pages directory (flag -d, default
-// "./pages") and writes the index, sorted by word, to index.dat in the
-// current working directory.
-//
-// A file that cannot be opened or parsed is logged and skipped. Failure to
-// read the pages directory or to write index.dat ends the program with
-// exit status 1.
-func main() {
-	var pagesDir string
-
-	flag.StringVar(&pagesDir, "d", "./pages", "pages directory")
-	flag.Parse()
-
-	dir, err := os.Open(pagesDir)
-	if err != nil {
-		log.Printf("Error accessing \"%s\":\t%s\n", pagesDir, err)
-		os.Exit(1)
-	}
-
-	dirInfo, err := dir.Stat()
-	if err != nil {
-		log.Printf("Error accessing \"%s\":\t%s\n", pagesDir, err)
-		os.Exit(1)
-	}
-	if !dirInfo.Mode().IsDir() {
-		log.Printf("\"%s\" is not a directory\n", pagesDir)
-		os.Exit(1)
-	}
-
-	files, err := dir.Readdir(0)
-	dir.Close()
-	if err != nil {
-		log.Printf("Error reading %s\n", pagesDir)
-		os.Exit(1)
-	}
-
-	words := make(map[string]*wordList)
-	for _, info := range files {
-		// Build the path from the directory as given on the command line;
-		// the directory's base name alone only resolves when the pages
-		// directory sits directly under the working directory.
-		indexFile(words, fmt.Sprintf("%s/%s", pagesDir, info.Name()), info)
-	}
-
-	sorted := make([]wordSort, 0, len(words))
-	for w, root := range words {
-		sorted = append(sorted, wordSort{w: w, root: root})
-	}
-	sort.Slice(sorted, func(i, j int) bool {
-		return sorted[i].w < sorted[j].w
-	})
-
-	out, err := os.Create("index.dat")
-	if err != nil {
-		log.Printf("Error creating index.dat:\t%s\n", err)
-		os.Exit(1)
-	}
-	printIndex(sorted, out)
-	if err = out.Close(); err != nil {
-		log.Printf("Error writing index.dat:\t%s\n", err)
-		os.Exit(1)
-	}
-}
-
-// indexFile parses the page at path and adds each of its body and title
-// words to words. info is passed on to parseDoc, which takes the document's
-// file name from it. Open and parse failures are logged and the file is
-// skipped.
-func indexFile(words map[string]*wordList, path string, info os.FileInfo) {
-	fd, err := os.Open(path)
-	if err != nil {
-		log.Printf("Error reading %s\n", path)
-		return
-	}
-	defer fd.Close()
-
-	fmt.Printf("Indexing %s...\n", info.Name())
-
-	doc, err := parseDoc(fd, info)
-	if err != nil {
-		log.Printf("Error parsing %s\n", path)
-		return
-	}
-
-	for _, w := range doc.text {
-		addWord(words, doc, strings.ToLower(w), false)
-	}
-	for _, w := range doc.title {
-		addWord(words, doc, strings.ToLower(w), true)
-	}
-}
-
-// addWord records one occurrence of w in doc. The posting for doc in w's
-// list is created with a frequency of 1 if it does not exist yet, otherwise
-// its frequency is incremented. inTitle marks the posting as a title hit;
-// once set, the mark is never cleared.
-func addWord(words map[string]*wordList, doc *document, w string, inTitle bool) {
-	cur := words[w]
-	if cur == nil {
-		words[w] = &wordList{this: &index{doc: doc, title: inTitle, freq: 1}}
-		return
-	}
-
-	// Stop on the posting for this document, or on the last node.
-	for cur.this.doc.fname != doc.fname && cur.next != nil {
-		cur = cur.next
-	}
-
-	if cur.this.doc.fname == doc.fname {
-		cur.this.freq++
-		if inTitle {
-			cur.this.title = true
-		}
-		return
-	}
-
-	cur.next = &wordList{this: &index{doc: doc, title: inTitle, freq: 1}}
-}