Diffstat (limited to 'CSC2636')
-rw-r--r--  CSC2636/search/.gitignore        6
-rw-r--r--  CSC2636/search/README.rst       19
-rw-r--r--  CSC2636/search/assign.rst      213
-rw-r--r--  CSC2636/search/index/index.go  165
-rw-r--r--  CSC2636/search/indexer.go      402
-rw-r--r--  CSC2636/search/search.go       144
-rw-r--r--  CSC2636/webCrawler2/README.rst  13
-rw-r--r--  CSC2636/webCrawler2/crawler.go 164
8 files changed, 1126 insertions, 0 deletions
diff --git a/CSC2636/search/.gitignore b/CSC2636/search/.gitignore
new file mode 100644
index 0000000..7523492
--- /dev/null
+++ b/CSC2636/search/.gitignore
@@ -0,0 +1,6 @@
+*test*
+pages
+index.dat
+indexer
+search
+*.swp
diff --git a/CSC2636/search/README.rst b/CSC2636/search/README.rst
new file mode 100644
index 0000000..e1d14fb
--- /dev/null
+++ b/CSC2636/search/README.rst
@@ -0,0 +1,19 @@
+=============
+Search Engine
+=============
+
+Setup
+=====
+In order for search.go to use the index package, the directory "index"
+must be copied (or linked) into a directory "search" that is in your
+GOPATH; a sketch of one way to do this follows.
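+
+For example, assuming GOPATH is already set and this directory
+(search) is the current directory, a symlink such as the following
+should work; the paths are illustrative:
+
+.. code:: bash
+
+   mkdir -p "$GOPATH/src/search"
+   ln -s "$PWD/index" "$GOPATH/src/search/index"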
+
+About
+=====
+Search engine for the Web Science class.
+
+See assign.rst for assignment details.
+
+Authors
+=======
+- Tucker Evans
diff --git a/CSC2636/search/assign.rst b/CSC2636/search/assign.rst
new file mode 100644
index 0000000..66e537e
--- /dev/null
+++ b/CSC2636/search/assign.rst
@@ -0,0 +1,213 @@
+========================
+Project 2: Search Engine
+========================
+
+**CS2621 – Web Science**
+
+*100 points*
+
+You are to create a web search engine that works at the command line.
+To do this, you will write two Python scripts, indexer.py and
+search.py.
+
+Indexer
+=======
+
+Indexer.py should do the following:
+
+1. After performing a crawl (using your other Python script), read all
+ the HTML files that were stored in the “pages” directory. For each
+ document, extract the title and the text from the body of the page
+ (read the Beautiful Soup documentation to find out how). Beautiful
+ Soup will include the text of the page in the content of the page,
+ and that is OK. Beautiful Soup may also break on some pages and
+ include HTML as text, but we will not worry about these
+ exceptions or bugs.
+
+2. All text should be converted to lowercase and non-alphanumeric
+ characters should be ignored. So “123-456” would become “123” and
+ “456”, and “joe@yahoo.com” would become “joe”, “yahoo”, “com”.
+ Ignore the following stop words: a, an, and, are, as, at, be, by,
+ for, from, has, he, in, is, it, its, of, on, that, the, to, was,
+   were, will, with. Do not perform stemming. (A tokenization sketch
+   in Go follows this list.)
+
+3. A single inverted index should be created for the document corpus
+ which maintains the document ID (numbered 1…n in order of the pages
+ found in the “pages” directory), a 1 or 0 if the text is found in
+ the title, and the term frequency from the body (normalized by the
+ total number of tokens in the document after removing stop words).
+
+4. After indexer.py has finished indexing all the web pages, it should
+   output the index to index.dat which looks like this:
+
+::
+
+ arkansas
+ 6 0 0.022
+ model
+ 1 0 0.309
+ 3 0 0.015
+ 5 1 0.001
+ tuesday
+ 2 0 0.082
+ white
+ 2 1 0.018
+ etc…
+
+.. note::
+ The indexed words are alphabetized, and there are 3 spaces before
+ sets of three numbers (each separated by a single space) which are:
+ doc ID, title (0 or 1), and normalized body TF (rounded to 3 decimal
+ places). For example, the term white was found only in document 2;
+ it was somewhere in the title and made up 1.8% of all the words in
+ the document.
+
+5. It may take some time for your program to run, so you should output
+ information about the program’s status as it indexes the crawled
+ pages. Outputting what file is being worked on would be helpful to
+ the user who is waiting for the program to finish its work.
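+
+The following is a minimal sketch of step 2 in Go (the language used
+elsewhere in this repository) rather than the Python the assignment
+names; the identifiers are illustrative:
+
+.. code:: go
+
+   package main
+
+   import (
+       "fmt"
+       "regexp"
+       "strings"
+   )
+
+   // nonAlnum matches runs of characters other than lowercase
+   // letters and digits; splitting on it discards punctuation.
+   var nonAlnum = regexp.MustCompile("[^a-z0-9]+")
+
+   var stopWords = map[string]bool{}
+
+   func init() {
+       list := "a an and are as at be by for from has he in is it" +
+           " its of on that the to was were will with"
+       for _, w := range strings.Fields(list) {
+           stopWords[w] = true
+       }
+   }
+
+   // tokenize lowercases text, splits it on non-alphanumeric runs,
+   // and drops stop words and empty tokens.
+   func tokenize(text string) []string {
+       var out []string
+       for _, tok := range nonAlnum.Split(strings.ToLower(text), -1) {
+           if tok != "" && !stopWords[tok] {
+               out = append(out, tok)
+           }
+       }
+       return out
+   }
+
+   func main() {
+       fmt.Println(tokenize("This 123-456, joe@yahoo.com!"))
+       // Output: [this 123 456 joe yahoo com]
+   }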
+
+Search
+======
+
+After the index is written to index.dat, the search.py script will
+allow the user to search the corpus for specific words. Here is how
+it should operate:
+
+1. First, read the search phrase at the command line. Examples:
+
+   .. code:: bash
+
+ $ search.py bisons
+ $ search.py "landmark college"
+
+If no command line argument is supplied, the program should tell the
+user a search term is required and terminate. Ignore any command-line
+arguments after the first.
+
+2. Next, the program should read the index from index.dat into memory.
+ Note that you may want to use similar data structures used in
+ indexer.py, so you should write your programs in a way where you
+ share code without having redundant code in each script. (It’s OK
+ to introduce new .py files to your project.)
+
+3. For simplicity, all queries will be assumed to use boolean ANDs,
+ and we will not implement phrase search. For example, the query
+ landmark college should generate a boolean search for landmark AND
+ college, so only documents containing both terms should be
+   considered a match.
+
+4. Remove any stop words from the query as was done when indexing the
+ documents.
+
+5. After determining which documents match the search terms, calculate
+   the relevancy score for each document and term as: relevancy score
+   = 0.9 * body TF + 0.1 * title score. Do this for each term, then
+   compute the average relevancy score over all terms. So if the
+   search was for landmark college, you would compute the score for
+   landmark and the score for college and average the two for the
+   overall relevancy score. (A matching-and-scoring sketch in Go
+   follows the example below.)
+
+6. The total number of results should first be displayed. Then display
+ every document ID and score (out to 3 decimal places) ordered by
+ score, and number the results. Example:
+
+.. code:: bash
+
+ Results: 4
+ 1. docID, 3, score, 0.830
+ 2. docID, 1, score, 0.814
+ 3. docID, 5, score, 0.350
+ 4. docID, 8, score, 0.108
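+
+Below is a minimal sketch of the matching and scoring steps in Go,
+with the inverted index simplified to nested maps; posting and score
+are illustrative names, and the data comes from the test corpus at
+the end of this assignment:
+
+.. code:: go
+
+   package main
+
+   import "fmt"
+
+   // posting holds one document's entry for a term.
+   type posting struct {
+       inTitle bool
+       tf      float64 // normalized body term frequency
+   }
+
+   // score returns the average relevancy over all query terms for
+   // every document that contains every term (boolean AND).
+   func score(idx map[string]map[string]posting, terms []string) map[string]float64 {
+       out := make(map[string]float64)
+       for doc := range idx[terms[0]] {
+           total, ok := 0.0, true
+           for _, t := range terms {
+               p, found := idx[t][doc]
+               if !found {
+                   ok = false
+                   break
+               }
+               title := 0.0
+               if p.inTitle {
+                   title = 1.0
+               }
+               total += 0.9*p.tf + 0.1*title
+           }
+           if ok {
+               out[doc] = total / float64(len(terms))
+           }
+       }
+       return out
+   }
+
+   func main() {
+       idx := map[string]map[string]posting{
+           "test": {"a": {true, 0.2}, "b": {false, 0.4}, "c": {false, 0.5}},
+           "this": {"a": {false, 0.2}, "c": {false, 0.5}},
+       }
+       fmt.Println(score(idx, []string{"test", "this"}))
+       // map[a:0.23 c:0.45]
+   }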
+
+**Bonus:** You can receive 5 bonus points by implementing phrase search.
+So when the user searches for “landmark college”, assume they want
+only documents with that exact phrase. To accomplish this, you will
+need to store the positions of the terms that are stored in the
+inverted index. Then use those positions to ensure the phrase matches
+successive positions.
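+
+A sketch of that position check, assuming each term's posting also
+carries a slice of token positions (a field the base index in this
+assignment does not have):
+
+.. code:: go
+
+   package main
+
+   import "fmt"
+
+   // contains reports whether xs contains x.
+   func contains(xs []int, x int) bool {
+       for _, v := range xs {
+           if v == x {
+               return true
+           }
+       }
+       return false
+   }
+
+   // phraseMatch reports whether the query terms occur at
+   // consecutive positions; positions[i] lists the token positions
+   // of term i within one document.
+   func phraseMatch(positions [][]int) bool {
+       for _, p := range positions[0] {
+           ok := true
+           for i := 1; i < len(positions); i++ {
+               if !contains(positions[i], p+i) {
+                   ok = false
+                   break
+               }
+           }
+           if ok {
+               return true
+           }
+       }
+       return false
+   }
+
+   func main() {
+       // "landmark" at positions 4 and 9, "college" at position 5:
+       // positions 4 and 5 are consecutive, so the phrase matches.
+       fmt.Println(phraseMatch([][]int{{4, 9}, {5}})) // true
+   }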
+
+
+Zip your entire project directory and submit it
+to Canvas before it is due. Make sure your output matches the
+specifications precisely to avoid losing any points. If you use any
+code you find on the Web, you must document the source in your
+program.
+
+Test Data
+=========
+
+*a.html*
+
+.. code:: html
+
+ <title>cool!!! test!!!</title>
+ <body>
+ this 123-456.
+ </body>
+
+*b.html*
+
+.. code:: html
+
+ <html>
+ <head>
+ <title>Go Patriots!</title>
+ </head>
+ <body>
+ And another test and test!
+ </body>
+ </html>
+
+*c.html*
+
+.. code:: html
+
+ <body>
+ This is a test.
+ </body>
+
+*Inverted index:*
+
+.. code::
+
+ 123
+ a 0 0.200
+ 456
+ a 0 0.200
+ another
+ b 0 0.200
+ cool
+ a 1 0.200
+   go
+ b 1 0.200
+   patriots
+ b 1 0.200
+ test
+ a 1 0.200
+ c 0 0.500
+ b 0 0.400
+ this
+ a 0 0.200
+ c 0 0.500
+
+Search for "test this" results in the following:
+
+::
+
+ Results: 2
+ 1. docID c, score 0.450
+ 2. docID a, score 0.230
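+
+(Check: doc a has test in its title with TF 0.200 and this with TF
+0.200, so its score is ((0.9 * 0.2 + 0.1 * 1) + (0.9 * 0.2 + 0.1 * 0))
+/ 2 = (0.28 + 0.18) / 2 = 0.230; doc c scores (0.45 + 0.45) / 2 =
+0.450.)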
+
+Search for "test patriots go" results in the following:
+
+::
+
+ Results: 1
+ 1. docID b, score 0.310
+
+Search for "cool patriots" results in the following:
+
+::
+
+ Results: 0
diff --git a/CSC2636/search/index/index.go b/CSC2636/search/index/index.go
new file mode 100644
index 0000000..5d8ab65
--- /dev/null
+++ b/CSC2636/search/index/index.go
@@ -0,0 +1,165 @@
+package index
+
+import "fmt"
+import "os"
+import "io"
+import "bufio"
+import "sort"
+import "errors"
+import "strings"
+import "strconv"
+
+/* TODO
+
+ - Implement Forward Creation
+ - Implement Inverted from Forward
+ - Switch Indexer.go over to this package
+*/
+
+/*********
+ * Types *
+ *********/
+
+type F_info struct {
+ Word string;
+ In_title bool;
+ Freq float64;
+};
+
+type I_info struct {
+ Doc string;
+ In_title bool;
+ Freq float64;
+};
+
+type F_entry struct{
+ This *F_info;
+ Next *F_entry;
+};
+
+type I_entry struct{
+ This *I_info;
+ Next *I_entry;
+};
+
+type F_index map[string]*F_entry;
+type I_index map[string]*I_entry;
+
+type sortInverted struct{
+ w string;
+ root *I_entry;
+};
+
+
+/***************************
+ * Forward Index Functions *
+ ***************************/
+
+func NewForwardEntryStrings(text, title []string) (*F_entry, error) {
+ return nil, errors.New("not implemented");
+}
+
+/****************************
+ * Inverted Index Functions *
+ ****************************/
+
+func new_I_info() *I_info{
+ return &I_info{"", false, 0.0};
+}
+
+func NewInvertedIndexFromFile(fname string) (I_index, error) {
+ var fd *os.File;
+ var br *bufio.Reader;
+ var err error;
+ var buf []byte;
+ var tmp *I_info;
+ var cur *I_entry;
+ var index I_index;
+	var word string;
+ var info []string;
+
+ fd, err = os.Open(fname);
+ if err != nil {
+ return nil, err;
+ }
+
+ br = bufio.NewReader(fd);
+ if br == nil {
+ return nil, errors.New("Could not initialize reader");
+ }
+
+ index = make(I_index);
+
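+	/* index.dat layout: an unindented word line, followed by one
+	   tab-indented "doc in_title freq" line per document. */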
+ for buf, err = br.ReadBytes('\n'); err != io.EOF; buf, err = br.ReadBytes('\n'){
+ tmp = new_I_info();
+ if err != nil {
+ return nil, err;
+ }
+ if buf[0] != '\t' {
+ word = strings.TrimSpace(string(buf));
+ } else {
+ info = strings.Fields(string(buf));
+ tmp.Doc = info[0];
+ tmp.In_title = (info[1] == "1");
+			tmp.Freq, _ = strconv.ParseFloat(info[2], 64);
+ if (index[word] == nil) {
+ index[word] = &I_entry{This: tmp, Next: nil};
+ } else {
+ cur = index[word];
+ for cur.Next != nil {
+ cur = cur.Next;
+ }
+ cur.Next = &I_entry{This: tmp, Next: nil};
+ }
+ }
+ }
+
+ return index, nil;
+}
+
+func NewInvertedFromForward(f F_index) (I_index, error) {
+ return nil, errors.New("not implemented");
+
+}
+
+func (x I_index) PrintToFile(fd *os.File) error{
+ var i int;
+ var cur *I_entry;
+ var index []sortInverted;
+
+ index = x.sortIndex();
+
+ for i = 0; i < len(index); i++ {
+ fmt.Fprintf(fd, "%s\n", index[i].w);
+ for cur = index[i].root; cur != nil; cur = cur.Next {
+ fmt.Fprintf(fd, "\t%s %d %.3f\n", cur.This.Doc, toInt(cur.This.In_title), cur.This.Freq);
+ }
+ }
+ return nil;
+}
+
+func toInt(t bool) int {
+	if t {
+		return 1;
+	}
+	return 0;
+}
+
+func (unsort I_index) sortIndex() []sortInverted {
+ var i int;
+ var sorted []sortInverted;
+
+ sorted = make([]sortInverted, len(unsort));
+
+ i = 0;
+ for k, v := range unsort {
+ sorted[i].w = k;
+ sorted[i].root = v;
+ i++;
+ }
+
+ sort.Slice(sorted, func(i, j int) bool {
+ return sorted[i].w < sorted[j].w;
+ });
+
+ return sorted
+}
diff --git a/CSC2636/search/indexer.go b/CSC2636/search/indexer.go
new file mode 100644
index 0000000..d95f126
--- /dev/null
+++ b/CSC2636/search/indexer.go
@@ -0,0 +1,402 @@
+package main
+
+import "os"
+import "sort"
+import "golang.org/x/net/html"
+import "log"
+import "fmt"
+import "github.com/PuerkitoBio/goquery"
+import "github.com/kennygrant/sanitize"
+import "strings"
+import "flag"
+import "errors"
+import "regexp"
+
+type document struct {
+ fname string;
+ title []string;
+ text []string;
+ length int;
+}
+
+type index struct {
+ doc *document;
+ title bool;
+ freq int;
+}
+
+type wordSort struct {
+ w string;
+ root *wordList;
+}
+
+type wordList struct {
+ this *index
+ next *wordList
+}
+
+var r, nonAN *regexp.Regexp;
+var stopWords []*regexp.Regexp;
+
+
+func newDocument() *document {
+ return &document{"" , nil, nil, 0};
+}
+
+func RemoveNode(r, rn *html.Node) {
+	var found bool;
+	var n, next, item *html.Node;
+	var nodes map[int]*html.Node;
+	var i, j int;
+
+	found = false;
+	nodes = make(map[int]*html.Node);
+
+	/* Save NextSibling up front: RemoveChild clears the sibling
+	   pointers, which would otherwise end this loop early. */
+	for n = r.FirstChild; n != nil; n = next {
+		next = n.NextSibling;
+		if n == rn {
+			found = true;
+			n.Parent.RemoveChild(n);
+		}
+
+		nodes[i] = n;
+		i++;
+	}
+
+	if !found {
+		for j = 0; j < i; j++ {
+			item = nodes[j];
+			RemoveNode(item, rn);
+		}
+	}
+}
+
+func RemoveTag(doc *goquery.Selection, tag string) {
+ doc.Find(tag).Each(func(i int, s *goquery.Selection) {
+ RemoveNode(doc.Get(0), s.Get(0));
+ });
+}
+
+func logReg(h []byte) []byte {
+ log.Printf("RegExp: %s", h);
+ return h;
+}
+
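+/* parseDoc extracts the title and body text from an HTML file,
+ * removes script/noscript content, lowercases everything, strips
+ * non-alphanumeric characters and stop words, and returns the
+ * tokenized document. */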
+func parseDoc(fd *os.File, f_info os.FileInfo) (*document, error) {
+ var err error;
+ var text, t_text string;
+ var doc *goquery.Document;
+ var body, title *goquery.Selection;
+ var r_doc *document;
+ var i int;
+
+ doc, err = goquery.NewDocumentFromReader(fd);
+ if err != nil {
+ log.Printf("goquery error: %s\n", err);
+		return nil, errors.New("Can't create goquery document");
+ }
+
+ body = doc.Find("body");
+ RemoveTag(body, "script");
+ RemoveTag(body, "noscript");
+
+ title = doc.Find("title");
+
+	text, err = body.Html();
+	if err != nil {
+		return nil, err;
+	}
+	t_text, err = title.Html();
+	if err != nil {
+		return nil, err;
+	}
+
+
+ text = r.ReplaceAllString(text, "> <");
+ t_text = r.ReplaceAllString(t_text, "> <");
+
+ text = sanitize.HTML(text);
+ t_text = sanitize.HTML(t_text);
+
+ text = strings.ToLower(text);
+ t_text = strings.ToLower(t_text);
+
+ text = nonAN.ReplaceAllString(text, " ");
+ t_text = nonAN.ReplaceAllString(t_text, " ");
+
+
+ for i = 0; i < len(stopWords); i++ {
+ text = stopWords[i].ReplaceAllString(text, " ");
+ t_text = stopWords[i].ReplaceAllString(t_text, " ");
+ }
+ r_doc = newDocument();
+
+ r_doc.fname = f_info.Name();
+ r_doc.text = strings.Fields(text);
+ r_doc.title = strings.Fields(t_text);
+ r_doc.length = len(r_doc.text) + len(r_doc.title);
+
+ return r_doc, nil;
+}
+
+func boolToInt(t bool) int {
+ if t {
+ return 1;
+ }
+ return 0;
+}
+
+func printIndex(words []wordSort, fd *os.File) {
+ var i int;
+ var cur *wordList;
+ var fname string;
+ var t int;
+ var freq float64;
+
+ for i = 0; i < len(words); i++ {
+ fmt.Fprintf(fd, "%s\n", words[i].w);
+ for cur = words[i].root; cur != nil; cur = cur.next {
+ fname = cur.this.doc.fname;
+ t = boolToInt(cur.this.title);
+ freq = float64(cur.this.freq) / float64(cur.this.doc.length);
+
+ fmt.Fprintf(fd,"\t%s %d %.3f\n", fname, t, freq);
+ }
+ }
+}
+
+func init() {
+ var err error;
+ log.SetOutput(os.Stderr);
+ r, err = regexp.Compile("><");
+ if err != nil {
+ panic(err);
+ }
+ nonAN, err = regexp.Compile("[^a-zA-Z0-9]+");
+ if err != nil {
+ panic(err);
+ }
+	//TODO add func to read in stop words from a file;
+	stopList := []string{
+		"a", "an", "and", "are", "as", "at", "be", "by", "for",
+		"from", "has", "he", "in", "is", "it", "its", "of", "on",
+		"that", "the", "to", "was", "were", "will", "with",
+	};
+	stopWords = make([]*regexp.Regexp, len(stopList));
+	for i, w := range stopList {
+		stopWords[i], err = regexp.Compile("\\W+" + w + "\\W+");
+		if err != nil {
+			panic(err);
+		}
+	}
+}
+
+func main() {
+ // var words map[string]index
+ var p_dir, w, fname string;
+ var err error;
+ var i, j int;
+ var words map[string]*wordList;
+ var cur *wordList;
+ var tmp *index;
+ var sorted []wordSort;
+
+ var files []os.FileInfo;
+ var dir, fd *os.File;
+ var dir_info, fd_info os.FileInfo;
+ var dir_mode os.FileMode;
+
+ var doc *document;
+
+ flag.StringVar(&p_dir, "d", "./pages", "pages directory");
+
+ flag.Parse();
+
+ words = make(map[string]*wordList);
+
+ dir, err = os.Open(p_dir);
+ if err != nil {
+ log.Printf("Error accessing \"%s\":\t%s\n", p_dir, err);
+ os.Exit(1);
+ }
+
+ dir_info, err = dir.Stat();
+ dir_mode = dir_info.Mode();
+
+ if !dir_mode.IsDir() {
+ log.Printf("\"%s\" is not a directory\n", p_dir);
+ os.Exit(1);
+ }
+
+ files, err = dir.Readdir(0);
+ if err != nil {
+ log.Printf("Error reading %s\n", p_dir);
+ os.Exit(1);
+ }
+
+	for i = 0; i < len(files); i++ {
+		fd, err = os.Open(fmt.Sprintf("%s/%s", p_dir, files[i].Name()));
+
+		if err != nil {
+			log.Printf("Error reading %s/%s\n", p_dir, files[i].Name());
+		} else {
+			fd_info, err = fd.Stat();
+			if err != nil {
+				log.Printf("Error getting info\n");
+				os.Exit(1);
+			}
+			fname = fd_info.Name();
+
+			fmt.Printf("Indexing %s...\n", fname);
+			doc, err = parseDoc(fd, fd_info);
+			if err != nil {
+				log.Printf("Error parsing %s/%s\n", p_dir, files[i].Name());
+			} else {
+ /* Text */
+ for j = 0; j < len(doc.text); j++ {
+ w = strings.ToLower(doc.text[j]);
+
+ if words[w] == nil{
+ tmp = &index{doc: doc, title: false, freq: 0};
+ words[w] = &wordList{this: tmp, next: nil};
+ }
+
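+					/* Walk this word's posting list to find the
+					   current document's entry, stopping at the tail
+					   if it is not there yet. */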
+					for cur = words[w]; cur.next != nil && cur.this.doc.fname != fname; cur = cur.next {}
+
+ if cur.this.doc.fname == fname {
+ cur.this.freq++
+ } else if cur.next == nil {
+ tmp = &index{doc: doc, title: false, freq: 1};
+ cur.next = &wordList{this: tmp, next: nil};
+ } else {
+ panic(fmt.Sprintf("%v", cur));
+ }
+ }
+ /* Title */
+ for j = 0; j < len(doc.title); j++ {
+ w = strings.ToLower(doc.title[j]);
+
+ if words[w] == nil{
+ tmp = &index{doc: doc, title: true, freq: 0};
+ words[w] = &wordList{this: tmp, next: nil};
+ }
+
+					for cur = words[w]; cur.next != nil && cur.this.doc.fname != fname; cur = cur.next {}
+
+ if cur.this.doc.fname == fname {
+ cur.this.title = true;
+ cur.this.freq++;
+ } else if cur.next == nil {
+ tmp = &index{doc: doc, title: true, freq: 1};
+ cur.next = &wordList{this: tmp, next: nil};
+ } else {
+ panic(fmt.Sprintf("%v", cur));
+ }
+ }
+ }
+ }
+ fd.Close();
+ }
+ sorted = make([]wordSort, len(words));
+ i = 0;
+ for k,v := range words {
+ sorted[i].w = k;
+ sorted[i].root = v;
+ i++;
+ }
+
+ sort.Slice(sorted, func(i, j int) bool {
+ return sorted[i].w < sorted[j].w;
+ });
+
+	fd, err = os.Create("index.dat");
+	if err != nil {
+		panic(err);
+	}
+	printIndex(sorted, fd);
+ fd.Close();
+}
diff --git a/CSC2636/search/search.go b/CSC2636/search/search.go
new file mode 100644
index 0000000..c144055
--- /dev/null
+++ b/CSC2636/search/search.go
@@ -0,0 +1,144 @@
+/************************************************
+ * README *
+ * In order for search/index to be accessible *
+ * you must link this folder (search) into your *
+ * GOPATH *
+ ************************************************/
+
+
+package main
+
+import "search/index"
+import "os"
+import "fmt"
+import "sort"
+import "flag"
+import "strings"
+
+type res struct {
+ doc string;
+ score float64;
+};
+
+func main() {
+ var init_index, sIndex index.I_index;
+ var tmp, results, root *index.I_entry;
+ var tmp_score float64;
+ var scores map[string]map[string]float64; // scores[doc][word] == score
+ var i,j int;
+ var searchBool, perWord, docAdded map[string]bool; //map[doc]bool
+ var resultSort []res;
+ var err error;
+ var fname, s string;
+ var search []string;
+
+ flag.StringVar(&fname, "f", "./index.dat", "Index file");
+ flag.StringVar(&s, "s", "" , "Search phrase");
+
+ flag.Parse();
+ if len(s) == 0 {
+		fmt.Printf("Usage: search -s \"search phrase\" [-f index_file]\n");
+ os.Exit(1);
+ } else {
+ search = strings.Fields(s);
+ }
+
+ scores = make(map[string]map[string]float64);
+ searchBool = make(map[string]bool);
+ perWord = make(map[string]bool);
+ docAdded = make(map[string]bool);
+
+
+ sIndex = make(index.I_index);
+
+
+
+ init_index, err = index.NewInvertedIndexFromFile(fname);
+ if err != nil {
+ panic(err)
+ }
+	for i = 0; i < len(search); i++ {
+		search[i] = strings.ToLower(search[i]);
+		sIndex[search[i]] = init_index[search[i]];
+	}
+
+ for _, v := range sIndex {
+ for tmp = v; tmp != nil; tmp = tmp.Next {
+ searchBool[tmp.This.Doc] = true;
+ scores[tmp.This.Doc] = make(map[string]float64);
+ }
+ }
+
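+	/* Boolean AND: after this pass a document remains true in
+	   searchBool only if every search term's posting list contains
+	   it. */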
+ for _, v := range sIndex {
+ for tmp = v; tmp != nil; tmp = tmp.Next {
+ perWord[tmp.This.Doc] = true;
+ }
+ for d := range searchBool {
+ if !perWord[d] {
+ searchBool[d] = false;
+ }
+ }
+ perWord = make(map[string]bool);
+ }
+
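+	/* Per-term relevancy for each matching document, per the
+	   assignment formula: 0.9 * body TF + 0.1 * title score. */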
+ for k, v := range sIndex {
+ for tmp = v; tmp != nil; tmp = tmp.Next {
+ if searchBool[tmp.This.Doc] {
+ if tmp.This.In_title {
+ tmp_score = 1.0;
+ } else {
+ tmp_score = 0.0;
+ }
+
+ scores[tmp.This.Doc][k] = (0.9 * tmp.This.Freq) + (0.1 * tmp_score);
+ }
+ }
+
+ }
+
+ i = 0;
+ results = &index.I_entry{nil, nil}
+ root = &index.I_entry{nil, nil};
+ results.Next = root;
+
+ j = 0;
+
+ for _ ,v := range sIndex {
+ for tmp = v; tmp != nil; tmp = tmp.Next {
+ if (searchBool[tmp.This.Doc]) {
+ root.This = tmp.This;
+ docAdded[root.This.Doc] = false;
+ root.Next = &index.I_entry{nil, nil};
+ root = root.Next;
+ j++
+ }
+ }
+ }
+
+ resultSort = make([]res, j);
+
+ i = 0;
+ for root = results.Next; root.Next != nil; root = root.Next {
+ if (!docAdded[root.This.Doc]) {
+ j = 0;
+ tmp_score = 0;
+ for _ ,v := range scores[root.This.Doc] {
+ tmp_score += v;
+ j++;
+ }
+ tmp_score /= float64(j);
+ resultSort[i] = res{root.This.Doc, tmp_score};
+ docAdded[root.This.Doc] = true;
+ i++;
+ }
+ }
+ resultSort = resultSort[:i];
+
+ sort.Slice(resultSort, func(i, j int) bool {
+ return resultSort[i].score > resultSort[j].score;
+ });
+
+ fmt.Printf("Results: %d\n", len(resultSort));
+ for i = 0; i < len(resultSort); i++ {
+		fmt.Printf("\t%d. Doc: %s, Score: %.3f\n", i+1, resultSort[i].doc, resultSort[i].score);
+ }
+}
diff --git a/CSC2636/webCrawler2/README.rst b/CSC2636/webCrawler2/README.rst
new file mode 100644
index 0000000..1168fb9
--- /dev/null
+++ b/CSC2636/webCrawler2/README.rst
@@ -0,0 +1,13 @@
+===========
+Web Crawler
+===========
+
+Web crawler for the Web Science class.
+
+Dependencies
+============
+- `GoQuery <http://www.github.com/PuerkitoBio/goquery>`_.
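+
+Usage
+=====
+Arguments are positional, matching the usage string in crawler.go:
+``crawler url depth max_links workers``. The start URL must begin
+with ``http://www.`` or ``https://www.``; the URL and numbers below
+are only an example:
+
+.. code:: bash
+
+   go build -o crawler crawler.go
+   ./crawler http://www.example.com/ 2 100 4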
+
+Authors
+=======
+- Tucker Evans
diff --git a/CSC2636/webCrawler2/crawler.go b/CSC2636/webCrawler2/crawler.go
new file mode 100644
index 0000000..5c4dba6
--- /dev/null
+++ b/CSC2636/webCrawler2/crawler.go
@@ -0,0 +1,164 @@
+package main
+
+import "crypto/md5"
+import "fmt"
+import "github.com/PuerkitoBio/goquery"
+import "log"
+import "net/url"
+import "os"
+import "strconv"
+import "strings"
+import "sync"
+import "sync/atomic"
+import "time"
+
+type link struct {
+ u *url.URL
+ depth int
+}
+
+var mutex *sync.Mutex
+var Prev map[string]bool
+var base string
+var links_visited uint64 = 0
+
+func validLink(s string) bool {
+ return true
+ //return (strings.HasSuffix(s, ".html") || strings.HasSuffix(s, "/") || strings.HasSuffix(s, "\\"))
+}
+
+func addLinks(doc *goquery.Document, jobs chan link, current link, depth int, worker_id int) {
+ doc.Find("body a").Each(func(index int, item *goquery.Selection) {
+ link_s, _ := item.Attr("href")
+
+ d := depth + 1
+
+ u, err := url.Parse(link_s)
+ if err != nil {
+ panic(err)
+ }
+
+ if !u.IsAbs() {
+ u = current.u.ResolveReference(u)
+ }
+ if strings.Contains(u.String(), base) && validLink(u.String()) {
+ mutex.Lock()
+ if !Prev[u.String()] {
+ jobs <- link{u, d}
+ Prev[u.String()] = true
+ }
+ mutex.Unlock()
+ }
+ })
+}
+
+func consume(doc *goquery.Document, url link, worker_id int) {
+ f, _ := os.Create(fmt.Sprintf("./pages/%x", md5.Sum([]byte(url.u.String()))))
+ s, _ := doc.Html()
+ f.Write([]byte(s))
+}
+
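+// worker processes jobs until the shared links_visited counter
+// reaches the requested total, or until no job arrives for 10
+// seconds.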
+func worker(done chan bool, jobs chan link, depth int, id int, total uint64) {
+ for {
+ x := atomic.LoadUint64(&links_visited)
+ if x >= total {
+ done <- true
+ return
+ }
+
+ atomic.AddUint64(&links_visited, 1)
+ select {
+ case j := <-jobs:
+ if j.depth < depth {
+ doc, err := goquery.NewDocument(j.u.String())
+ if err != nil {
+ log.Print("Error Reading Document: " + j.u.String() + err.Error())
+ break
+ }
+
+ fmt.Printf("worker %d Working on %s...\n", id, j.u.String())
+
+ consume(doc, j, id)
+ addLinks(doc, jobs, j, j.depth, id)
+ }
+ case <-time.After(time.Second * 10):
+ fmt.Printf("Worker %d done\n", id)
+ done <- true
+ return
+ }
+ }
+}
+
+func init() {
+ mutex = &sync.Mutex{}
+ Prev = make(map[string]bool)
+ var err error
+
+	fi, err := os.Lstat("./pages")
+	if err != nil {
+		fmt.Printf("INIT ERROR: %s\n", err)
+	}
+
+	if fi == nil {
+		os.Mkdir("./pages", 0755)
+	} else if fi.Mode().IsRegular() {
+		panic("pages is not a valid directory\n")
+	}
+
+}
+
+func main() {
+ var d, w, b int
+ var t uint64
+
+	if len(os.Args) < 5 {
+		fmt.Printf("usage: crawler url depth max_links workers\n")
+		os.Exit(1)
+	}
+
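+	// Keep the crawl on the starting site: strip the scheme and
+	// "www." prefix so the substring check in addLinks accepts only
+	// same-site links. This expects a URL beginning with
+	// http(s)://www.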
+ base = strings.TrimPrefix(os.Args[1], "http://www.")
+ base = strings.TrimPrefix(base, "https://www.")
+ if base == os.Args[1] {
+ panic(base)
+ }
+
+	d, _ = strconv.Atoi(os.Args[2])
+	b, _ = (strconv.Atoi(os.Args[3]))
+	t = uint64(b)
+	w, _ = strconv.Atoi(os.Args[4])
+
+ jobs := make(chan link, 1024*1024)
+ done := make(chan bool)
+
+ u, err := url.Parse(os.Args[1])
+ if err != nil {
+ panic(err)
+ }
+
+ if !u.IsAbs() {
+ panic("Cannot start with relative url")
+ }
+	// send the first job
+	jobs <- link{u, 0}
+
+ for i := 0; i < w; i++ {
+ go worker(done, jobs, d, i, t)
+ }
+
+ for i := 0; i < w; {
+ select {
+ case <-done:
+ i++
+ case <-time.After(1 * time.Second):
+ if len(jobs) == (1024 * 1024) {
+ i = w
+ }
+ }
+ }
+
+ close(done)
+ close(jobs)
+}