From e8b1808eaf87a49e4c34ebbfb66854baa627418c Mon Sep 17 00:00:00 2001
From: Tucker Evans
Date: Mon, 18 Feb 2019 07:35:54 -0500
Subject: Moves assignments to given course folder.

---
 CSC2636/search/.gitignore      |   6 +
 CSC2636/search/README.rst      |  19 ++
 CSC2636/search/assign.rst      | 213 ++++++++++++++++++++++
 CSC2636/search/index/index.go  | 165 +++++++++++++++++
 CSC2636/search/indexer.go      | 402 +++++++++++++++++++++++++++++++++++++++++
 CSC2636/search/search.go       | 144 +++++++++++++++
 CSC2636/webCrawler2/README.rst |  13 ++
 CSC2636/webCrawler2/crawler.go | 164 +++++++++++++++++
 8 files changed, 1126 insertions(+)
 create mode 100644 CSC2636/search/.gitignore
 create mode 100644 CSC2636/search/README.rst
 create mode 100644 CSC2636/search/assign.rst
 create mode 100644 CSC2636/search/index/index.go
 create mode 100644 CSC2636/search/indexer.go
 create mode 100644 CSC2636/search/search.go
 create mode 100644 CSC2636/webCrawler2/README.rst
 create mode 100644 CSC2636/webCrawler2/crawler.go

diff --git a/CSC2636/search/.gitignore b/CSC2636/search/.gitignore
new file mode 100644
index 0000000..7523492
--- /dev/null
+++ b/CSC2636/search/.gitignore

*test*
pages
index.dat
indexer
search
*.swp

diff --git a/CSC2636/search/README.rst b/CSC2636/search/README.rst
new file mode 100644
index 0000000..e1d14fb
--- /dev/null
+++ b/CSC2636/search/README.rst

=============
Search Engine
=============

Setup
=====
For search.go to use the index package, the "index" directory must be
copied (or linked) into a "search" directory on your GOPATH.

About
=====
Search engine for the web science class.

See assign.rst for assignment details.

Authors
=======
- Tucker Evans

diff --git a/CSC2636/search/assign.rst b/CSC2636/search/assign.rst
new file mode 100644
index 0000000..66e537e
--- /dev/null
+++ b/CSC2636/search/assign.rst

========================
Project 2: Search Engine
========================

**CS2621 – Web Science**

*100 points*

You are to create a web search engine that works at the command line.
To do this, you will write two Python scripts, indexer.py and
search.py.

Indexer
=======

Indexer.py should do the following:

1. After performing a crawl (using your other Python script), read all
   the HTML files that were stored in the "pages" directory. For each
   document, extract the title and the text from the body of the page
   (read the Beautiful Soup documentation to find out how). Beautiful
   Soup will include the text of the page in the content of the page,
   and that is OK. Beautiful Soup may also break on some pages and
   include HTML as text, but we will not worry about these exceptions
   or bugs.

2. All text should be converted to lowercase, and non-alphanumeric
   characters should be ignored. So "123-456" would become "123" and
   "456", and "joe@yahoo.com" would become "joe", "yahoo", "com".
   Ignore the following stop words: a, an, and, are, as, at, be, by,
   for, from, has, he, in, is, it, its, of, on, that, the, to, was,
   were, will, with. Do not perform stemming.

3. A single inverted index should be created for the document corpus
   which maintains the document ID (numbered 1…n in order of the pages
   found in the "pages" directory), a 1 or 0 if the text is found in
   the title, and the term frequency from the body (normalized by the
   total number of tokens in the document after removing stop words).

4. After indexer.py has finished indexing all the web pages, it should
   output the index to index.dat, which looks like this:

::

   arkansas
      6 0 0.022
   model
      1 0 0.309
      3 0 0.015
      5 1 0.001
   tuesday
      2 0 0.082
   white
      2 1 0.018
   etc…

.. note::
   The indexed words are alphabetized, and there are 3 spaces before
   sets of three numbers (each separated by a single space) which are:
   doc ID, title (0 or 1), and normalized body TF (rounded to 3 decimal
   places). For example, the term white was found only in document 2;
   it was somewhere in the title and made up 1.8% of all the words in
   the document.

5. It may take some time for your program to run, so you should output
   information about the program's status as it indexes the crawled
   pages. Outputting what file is being worked on would be helpful to
   the user who is waiting for the program to finish its work.
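The tokenization rules above can be sketched as follows. This is a
minimal illustration in Go (the language of the implementation in this
repository, rather than the Python the hand-out names); ``tokenize``
and ``stopWords`` are illustrative names only.

.. code:: go

   package main

   import (
       "fmt"
       "regexp"
       "strings"
   )

   var stopWords = map[string]bool{
       "a": true, "an": true, "and": true, "are": true, "as": true,
       "at": true, "be": true, "by": true, "for": true, "from": true,
       "has": true, "he": true, "in": true, "is": true, "it": true,
       "its": true, "of": true, "on": true, "that": true, "the": true,
       "to": true, "was": true, "were": true, "will": true, "with": true,
   }

   var nonAlnum = regexp.MustCompile("[^a-z0-9]+")

   // tokenize lowercases text, splits on runs of non-alphanumeric
   // characters, and drops stop words, per the rules above.
   func tokenize(text string) []string {
       var out []string
       for _, t := range nonAlnum.Split(strings.ToLower(text), -1) {
           if t != "" && !stopWords[t] {
               out = append(out, t)
           }
       }
       return out
   }

   func main() {
       fmt.Println(tokenize("joe@yahoo.com 123-456"))
       // Output: [joe yahoo com 123 456]
       // Normalized TF for a term = count(term) / len(all tokens).
   }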
Search
======

After the index is written to index.dat, the search.py script will
allow the user to search the corpus for specific words. Here is how
it should operate:

1. First, read the search phrase at the command line. Examples:

   .. code:: bash

      $ search.py bisons
      $ search.py "landmark college"

   If no command line argument is supplied, the program should tell
   the user a search term is required and terminate. Ignore any
   command-line arguments after the first.

2. Next, the program should read the index from index.dat into memory.
   Note that you may want to use similar data structures used in
   indexer.py, so you should write your programs in a way where you
   share code without having redundant code in each script. (It's OK
   to introduce new .py files to your project.)

3. For simplicity, all queries will be assumed to use boolean ANDs,
   and we will not implement phrase search. For example, the query
   landmark college should generate a boolean search for landmark AND
   college, so only documents containing both terms should be
   considered a match.

4. Remove any stop words from the query as was done when indexing the
   documents.

5. After determining which documents match the search terms, calculate
   the relevancy score for each document::

      relevancy score = 0.9 * body TF + 0.1 * title score

   Do this for each term, and compute the average relevancy score for
   all terms. So if the search was for landmark college, you would
   compute the score for landmark and the score for college and
   compute the average to determine the overall relevancy score.

6. The total number of results should first be displayed. Then display
   every document ID and score (out to 3 decimal places) ordered by
   score, and number the results. Example:

   .. code:: bash

      Results: 4
      1. docID, 3, score, 0.830
      2. docID, 1, score, 0.814
      3. docID, 5, score, 0.350
      4. docID, 8, score, 0.108
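The scoring in step 5, sketched in Go with the numbers from the test
data below (``posting`` is a hypothetical stand-in for one index
entry; the values are the doc a postings for the query "test this"):

.. code:: go

   package main

   import "fmt"

   // posting is one (docID, inTitle, bodyTF) triple from the index.
   type posting struct {
       doc     string
       inTitle bool
       tf      float64
   }

   // score implements: 0.9 * body TF + 0.1 * title score.
   func score(p posting) float64 {
       title := 0.0
       if p.inTitle {
           title = 1.0
       }
       return 0.9*p.tf + 0.1*title
   }

   func main() {
       terms := []posting{
           {"a", true, 0.200},  // test: appears in the title of a.html
           {"a", false, 0.200}, // this: body only
       }
       sum := 0.0
       for _, p := range terms {
           sum += score(p)
       }
       // Average over the query terms: prints 0.230, matching the
       // sample result for doc a below.
       fmt.Printf("%.3f\n", sum/float64(len(terms)))
   }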
**Bonus:** You can receive 5 bonus points by implementing phrase
search. So when the user searches for "landmark college", assume they
want only documents with that exact phrase. To accomplish this, you
will need to store the positions of the terms that are stored in the
inverted index. Then use those positions to ensure the phrase matches
successive positions (a sketch follows the test data below).

Zip your entire project directory and submit it to Canvas before it
is due. Make sure your output matches the specifications precisely to
avoid losing any points. If you use any code you find in the Web, you
must document the source in your program.

Test Data
=========

*a.html*

.. code:: html

   <html>
   <head><title>cool!!! test!!!</title></head>
   <body>this 123-456.</body>
   </html>

*b.html*

.. code:: html

   <html>
   <head>
   <title>Go Patriots!</title>
   </head>
   <body>
   And another test and test!
   </body>
   </html>

*c.html*

.. code:: html

   <html>
   <body>This is a test.</body>
   </html>

*Inverted index:*

::

   123
      a 0 0.200
   456
      a 0 0.200
   another
      b 0 0.200
   cool
      a 1 0.200
   go
      b 1 0.200
   patriots
      b 1 0.200
   test
      a 1 0.200
      c 0 0.500
      b 0 0.400
   this
      a 0 0.200
      c 0 0.500

Search for "test this" results in the following:

::

   Results: 2
   1. docID c, score 0.450
   2. docID a, score 0.230

Search for "test patriots go" results in the following:

::

   Results: 1
   1. docID b, score 0.307

Search for "cool patriots" results in the following:

::

   Results: 0
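A minimal sketch of the position-based matching the bonus calls for
(hypothetical structures; a real solution also records the positions
at indexing time):

.. code:: go

   package main

   import "fmt"

   // posting records where a term occurs in a document: the token
   // offsets of each occurrence.
   type posting struct {
       doc       string
       positions []int
   }

   // phrase reports whether some occurrence of first is immediately
   // followed by an occurrence of second in the same document.
   func phrase(first, second posting) bool {
       next := make(map[int]bool, len(second.positions))
       for _, p := range second.positions {
           next[p] = true
       }
       for _, p := range first.positions {
           if next[p+1] {
               return true
           }
       }
       return false
   }

   func main() {
       landmark := posting{doc: "1", positions: []int{4, 9}}
       college := posting{doc: "1", positions: []int{10}}
       fmt.Println(phrase(landmark, college)) // true: positions 9, 10
   }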
diff --git a/CSC2636/search/index/index.go b/CSC2636/search/index/index.go
new file mode 100644
index 0000000..5d8ab65
--- /dev/null
+++ b/CSC2636/search/index/index.go

package index

import (
	"bufio"
	"errors"
	"fmt"
	"io"
	"os"
	"sort"
	"strconv"
	"strings"
)

/* TODO
 *
 * - Implement forward creation
 * - Implement inverted from forward
 * - Switch indexer.go over to this package
 */

/*********
 * Types *
 *********/

type F_info struct {
	Word     string
	In_title bool
	Freq     float64
}

type I_info struct {
	Doc      string
	In_title bool
	Freq     float64
}

type F_entry struct {
	This *F_info
	Next *F_entry
}

type I_entry struct {
	This *I_info
	Next *I_entry
}

type F_index map[string]*F_entry
type I_index map[string]*I_entry

type sortInverted struct {
	w    string
	root *I_entry
}

/***************************
 * Forward Index Functions *
 ***************************/

func NewForwardEntryStrings(text, title []string) (*F_entry, error) {
	return nil, errors.New("not implemented")
}

/****************************
 * Inverted Index Functions *
 ****************************/

func new_I_info() *I_info {
	return &I_info{"", false, 0.0}
}

// NewInvertedIndexFromFile parses an index.dat file: an unindented
// line names a term, and each tab-indented line under it holds one
// "doc in_title freq" posting.
func NewInvertedIndexFromFile(fname string) (I_index, error) {
	var fd *os.File
	var br *bufio.Reader
	var err error
	var buf []byte
	var tmp *I_info
	var cur *I_entry
	var index I_index
	var word string
	var info []string

	fd, err = os.Open(fname)
	if err != nil {
		return nil, err
	}

	br = bufio.NewReader(fd)
	if br == nil {
		return nil, errors.New("could not initialize reader")
	}

	index = make(I_index)

	for buf, err = br.ReadBytes('\n'); err != io.EOF; buf, err = br.ReadBytes('\n') {
		tmp = new_I_info()
		if err != nil {
			return nil, err
		}
		if buf[0] != '\t' {
			word = strings.TrimSpace(string(buf))
		} else {
			info = strings.Fields(string(buf))
			tmp.Doc = info[0]
			tmp.In_title = (info[1] == "1")
			tmp.Freq, _ = strconv.ParseFloat(info[2], 64)
			if index[word] == nil {
				index[word] = &I_entry{This: tmp, Next: nil}
			} else {
				cur = index[word]
				for cur.Next != nil {
					cur = cur.Next
				}
				cur.Next = &I_entry{This: tmp, Next: nil}
			}
		}
	}

	return index, nil
}
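// A sketch of typical use (illustrative only, not called anywhere):
// load the file written by indexer.go and walk one term's postings.
//
//	idx, err := NewInvertedIndexFromFile("index.dat")
//	if err != nil {
//		// handle the error
//	}
//	for cur := idx["test"]; cur != nil; cur = cur.Next {
//		fmt.Println(cur.This.Doc, cur.This.In_title, cur.This.Freq)
//	}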
func NewInvertedFromForward(f F_index) (I_index, error) {
	return nil, errors.New("not implemented")
}

func (x I_index) PrintToFile(fd *os.File) error {
	var i int
	var cur *I_entry
	var index []sortInverted

	index = x.sortIndex()

	for i = 0; i < len(index); i++ {
		fmt.Fprintf(fd, "%s\n", index[i].w)
		for cur = index[i].root; cur != nil; cur = cur.Next {
			fmt.Fprintf(fd, "\t%s %d %.3f\n", cur.This.Doc, toInt(cur.This.In_title), cur.This.Freq)
		}
	}
	return nil
}

func toInt(t bool) int {
	if t {
		return 1
	}
	return 0
}

func (unsort I_index) sortIndex() []sortInverted {
	var i int
	var sorted []sortInverted

	sorted = make([]sortInverted, len(unsort))

	i = 0
	for k, v := range unsort {
		sorted[i].w = k
		sorted[i].root = v
		i++
	}

	sort.Slice(sorted, func(i, j int) bool {
		return sorted[i].w < sorted[j].w
	})

	return sorted
}

diff --git a/CSC2636/search/indexer.go b/CSC2636/search/indexer.go
new file mode 100644
index 0000000..d95f126
--- /dev/null
+++ b/CSC2636/search/indexer.go

package main

import (
	"errors"
	"flag"
	"fmt"
	"log"
	"os"
	"regexp"
	"sort"
	"strings"

	"github.com/PuerkitoBio/goquery"
	"github.com/kennygrant/sanitize"
	"golang.org/x/net/html"
)

type document struct {
	fname  string
	title  []string
	text   []string
	length int
}

type index struct {
	doc   *document
	title bool
	freq  int
}

type wordSort struct {
	w    string
	root *wordList
}

type wordList struct {
	this *index
	next *wordList
}

var r, nonAN *regexp.Regexp
var stopWords []*regexp.Regexp

func newDocument() *document {
	return &document{"", nil, nil, 0}
}

// RemoveNode removes rn from the tree rooted at r, searching level by
// level until it is found.
func RemoveNode(r, rn *html.Node) {
	var found bool
	var n, item *html.Node
	var nodes map[int]*html.Node
	var i, j int

	found = false
	nodes = make(map[int]*html.Node)

	for n = r.FirstChild; n != nil; n = n.NextSibling {
		if n == rn {
			found = true
			n.Parent.RemoveChild(n)
		}

		nodes[i] = n
		i++
	}

	if !found {
		for j = 0; j < i; j++ {
			item = nodes[j]
			RemoveNode(item, rn)
		}
	}
}

func RemoveTag(doc *goquery.Selection, tag string) {
	doc.Find(tag).Each(func(i int, s *goquery.Selection) {
		RemoveNode(doc.Get(0), s.Get(0))
	})
}

func logReg(h []byte) []byte {
	log.Printf("RegExp: %s", h)
	return h
}

func parseDoc(fd *os.File, f_info os.FileInfo) (*document, error) {
	var err error
	var text, t_text string
	var doc *goquery.Document
	var body, title *goquery.Selection
	var r_doc *document
	var i int

	doc, err = goquery.NewDocumentFromReader(fd)
	if err != nil {
		log.Printf("goquery error: %s\n", err)
		return nil, errors.New("can't create goquery document")
	}

	body = doc.Find("body")
	RemoveTag(body, "script")
	RemoveTag(body, "noscript")

	title = doc.Find("title")

	//TODO add error detection
	text, err = body.Html()
	t_text, err = title.Html()

	text = r.ReplaceAllString(text, "> <")
	t_text = r.ReplaceAllString(t_text, "> <")

	text = sanitize.HTML(text)
	t_text = sanitize.HTML(t_text)

	text = strings.ToLower(text)
	t_text = strings.ToLower(t_text)

	text = nonAN.ReplaceAllString(text, " ")
	t_text = nonAN.ReplaceAllString(t_text, " ")

	for i = 0; i < len(stopWords); i++ {
		text = stopWords[i].ReplaceAllString(text, " ")
		t_text = stopWords[i].ReplaceAllString(t_text, " ")
	}
	r_doc = newDocument()

	r_doc.fname = f_info.Name()
	r_doc.text = strings.Fields(text)
	r_doc.title = strings.Fields(t_text)
	r_doc.length = len(r_doc.text) + len(r_doc.title)

	return r_doc, nil
}
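// Worked example (using a.html from assign.rst): parseDoc lowercases,
// strips markup and non-alphanumerics, and drops stop words, so for
//
//	<head><title>cool!!! test!!!</title></head>
//	<body>this 123-456.</body>
//
// it would return roughly title=[cool test], text=[this 123 456],
// length=5, giving each term a normalized frequency of 1/5 = 0.200.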
func boolToInt(t bool) int {
	if t {
		return 1
	}
	return 0
}

func printIndex(words []wordSort, fd *os.File) {
	var i int
	var cur *wordList
	var fname string
	var t int
	var freq float64

	for i = 0; i < len(words); i++ {
		fmt.Fprintf(fd, "%s\n", words[i].w)
		for cur = words[i].root; cur != nil; cur = cur.next {
			fname = cur.this.doc.fname
			t = boolToInt(cur.this.title)
			freq = float64(cur.this.freq) / float64(cur.this.doc.length)

			fmt.Fprintf(fd, "\t%s %d %.3f\n", fname, t, freq)
		}
	}
}

func init() {
	var err error
	log.SetOutput(os.Stderr)
	r, err = regexp.Compile("><")
	if err != nil {
		panic(err)
	}
	nonAN, err = regexp.Compile("[^a-zA-Z0-9]+")
	if err != nil {
		panic(err)
	}
	//TODO add func to read in stop words from a file
	list := []string{
		"a", "an", "and", "are", "as", "at", "be", "by", "for",
		"from", "has", "he", "in", "is", "it", "its", "of", "on",
		"that", "the", "to", "was", "were", "will", "with",
	}
	stopWords = make([]*regexp.Regexp, len(list))
	for i, w := range list {
		stopWords[i], err = regexp.Compile("\\W+" + w + "\\W+")
		if err != nil {
			panic(err)
		}
	}
}

func main() {
	var p_dir, w, fname string
	var err error
	var i, j int
	var words map[string]*wordList
	var cur *wordList
	var tmp *index
	var sorted []wordSort

	var files []os.FileInfo
	var dir, fd *os.File
	var dir_info, fd_info os.FileInfo
	var dir_mode os.FileMode

	var doc *document

	flag.StringVar(&p_dir, "d", "./pages", "pages directory")

	flag.Parse()

	words = make(map[string]*wordList)

	dir, err = os.Open(p_dir)
	if err != nil {
		log.Printf("Error accessing \"%s\":\t%s\n", p_dir, err)
		os.Exit(1)
	}

	dir_info, err = dir.Stat()
	if err != nil {
		log.Printf("Error getting info for \"%s\":\t%s\n", p_dir, err)
		os.Exit(1)
	}
	dir_mode = dir_info.Mode()

	if !dir_mode.IsDir() {
		log.Printf("\"%s\" is not a directory\n", p_dir)
		os.Exit(1)
	}

	files, err = dir.Readdir(0)
	if err != nil {
		log.Printf("Error reading %s\n", p_dir)
		os.Exit(1)
	}

	for i = 0; i < len(files); i++ {
		fd, err = os.Open(fmt.Sprintf("%s/%s", p_dir, files[i].Name()))
		if err != nil {
			log.Printf("Error reading %s/%s\n", p_dir, files[i].Name())
			continue
		}
		fd_info, err = fd.Stat()
		if err != nil {
			log.Printf("Error getting info\n")
			os.Exit(1)
		}
		fname = fd_info.Name()

		fmt.Printf("Indexing %s...\n", fname)
		doc, err = parseDoc(fd, fd_info)
		if err != nil {
			log.Printf("Error parsing %s/%s\n", p_dir, files[i].Name())
		} else {
			/* Text */
			for j = 0; j < len(doc.text); j++ {
				w = strings.ToLower(doc.text[j])

				if words[w] == nil {
					tmp = &index{doc: doc, title: false, freq: 0}
					words[w] = &wordList{this: tmp, next: nil}
				}

				for cur = words[w]; cur.next != nil && cur.this.doc.fname != fname; cur = cur.next {
				}

				if cur.this.doc.fname == fname {
					cur.this.freq++
				} else if cur.next == nil {
					tmp = &index{doc: doc, title: false, freq: 1}
					cur.next = &wordList{this: tmp, next: nil}
				} else {
					panic(fmt.Sprintf("%v", cur))
				}
			}
			/* Title */
			for j = 0; j < len(doc.title); j++ {
				w = strings.ToLower(doc.title[j])

				if words[w] == nil {
					tmp = &index{doc: doc, title: true, freq: 0}
					words[w] = &wordList{this: tmp, next: nil}
				}

				for cur = words[w]; cur.next != nil && cur.this.doc.fname != fname; cur = cur.next {
				}

				if cur.this.doc.fname == fname {
					cur.this.title = true
					cur.this.freq++
				} else if cur.next == nil {
					tmp = &index{doc: doc, title: true, freq: 1}
					cur.next = &wordList{this: tmp, next: nil}
				} else {
					panic(fmt.Sprintf("%v", cur))
				}
			}
		}
		fd.Close()
	}
	sorted = make([]wordSort, len(words))
	i = 0
	for k, v := range words {
		sorted[i].w = k
		sorted[i].root = v
		i++
	}

	sort.Slice(sorted, func(i, j int) bool {
		return sorted[i].w < sorted[j].w
	})

	fd, _ = os.Create("index.dat")
	printIndex(sorted, fd)
	fd.Close()
}

diff --git a/CSC2636/search/search.go b/CSC2636/search/search.go
new file mode 100644
index 0000000..c144055
--- /dev/null
+++ b/CSC2636/search/search.go

/************************************************
 * README                                       *
 * In order for search/index to be accessible   *
 * you must link this folder (search) into your *
 * GOPATH                                       *
 ************************************************/

package main

import (
	"flag"
	"fmt"
	"os"
	"sort"
	"strings"

	"search/index"
)

type res struct {
	doc   string
	score float64
}
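// The main below ANDs the query terms: a document survives only if it
// appears in the posting list of every term. A sketch of the same idea
// by counting (hypothetical, not wired in):
//
//	counts := make(map[string]int)
//	for _, term := range search {
//		for cur := sIndex[term]; cur != nil; cur = cur.Next {
//			counts[cur.This.Doc]++
//		}
//	}
//	// a doc matches iff counts[doc] == len(search)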
func main() {
	var init_index, sIndex index.I_index
	var tmp, results, root *index.I_entry
	var tmp_score float64
	var scores map[string]map[string]float64 // scores[doc][word] == score
	var i, j int
	var searchBool, perWord, docAdded map[string]bool // map[doc]bool
	var resultSort []res
	var err error
	var fname, s string
	var search []string

	flag.StringVar(&fname, "f", "./index.dat", "Index file")
	flag.StringVar(&s, "s", "", "Search phrase")

	flag.Parse()
	if len(s) == 0 {
		fmt.Printf("Usage: search -s \"search phrase\" [-f index_file]\n")
		os.Exit(1)
	} else {
		search = strings.Fields(s)
	}

	scores = make(map[string]map[string]float64)
	searchBool = make(map[string]bool)
	perWord = make(map[string]bool)
	docAdded = make(map[string]bool)

	sIndex = make(index.I_index)

	init_index, err = index.NewInvertedIndexFromFile(fname)
	if err != nil {
		panic(err)
	}

	// Keep only the postings for the query terms.
	for i = 0; i < len(search); i++ {
		sIndex[search[i]] = init_index[search[i]]
	}

	// Mark every document that appears for any term...
	for _, v := range sIndex {
		for tmp = v; tmp != nil; tmp = tmp.Next {
			searchBool[tmp.This.Doc] = true
			scores[tmp.This.Doc] = make(map[string]float64)
		}
	}

	// ...then clear the mark on documents missing some term (boolean AND).
	for _, v := range sIndex {
		for tmp = v; tmp != nil; tmp = tmp.Next {
			perWord[tmp.This.Doc] = true
		}
		for d := range searchBool {
			if !perWord[d] {
				searchBool[d] = false
			}
		}
		perWord = make(map[string]bool)
	}

	// Per-term relevancy: 0.9 * body TF + 0.1 * title score.
	for k, v := range sIndex {
		for tmp = v; tmp != nil; tmp = tmp.Next {
			if searchBool[tmp.This.Doc] {
				if tmp.This.In_title {
					tmp_score = 1.0
				} else {
					tmp_score = 0.0
				}

				scores[tmp.This.Doc][k] = (0.9 * tmp.This.Freq) + (0.1 * tmp_score)
			}
		}
	}

	i = 0
	results = &index.I_entry{nil, nil}
	root = &index.I_entry{nil, nil}
	results.Next = root

	j = 0

	for _, v := range sIndex {
		for tmp = v; tmp != nil; tmp = tmp.Next {
			if searchBool[tmp.This.Doc] {
				root.This = tmp.This
				docAdded[root.This.Doc] = false
				root.Next = &index.I_entry{nil, nil}
				root = root.Next
				j++
			}
		}
	}

	resultSort = make([]res, j)

	// Average the per-term scores for each matching document.
	i = 0
	for root = results.Next; root.Next != nil; root = root.Next {
		if !docAdded[root.This.Doc] {
			j = 0
			tmp_score = 0
			for _, v := range scores[root.This.Doc] {
				tmp_score += v
				j++
			}
			tmp_score /= float64(j)
			resultSort[i] = res{root.This.Doc, tmp_score}
			docAdded[root.This.Doc] = true
			i++
		}
	}
	resultSort = resultSort[:i]

	sort.Slice(resultSort, func(i, j int) bool {
		return resultSort[i].score > resultSort[j].score
	})

	fmt.Printf("Results: %d\n", len(resultSort))
	for i = 0; i < len(resultSort); i++ {
		fmt.Printf("\t%d. Doc: %s, Score: %.3f\n", i+1, resultSort[i].doc, resultSort[i].score)
	}
}

diff --git a/CSC2636/webCrawler2/README.rst b/CSC2636/webCrawler2/README.rst
new file mode 100644
index 0000000..1168fb9
--- /dev/null
+++ b/CSC2636/webCrawler2/README.rst

===========
Web Crawler
===========

Web crawler for the web science class.

Dependencies
============
- `GoQuery <https://github.com/PuerkitoBio/goquery>`_
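Usage
=====
The crawler takes its configuration as positional arguments. An assumed
example invocation, with the argument order taken from the usage string
in crawler.go:

.. code:: bash

   # crawler url depth max_links workers
   $ ./crawler http://www.example.com 2 100 4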
Authors
=======
- Tucker Evans

diff --git a/CSC2636/webCrawler2/crawler.go b/CSC2636/webCrawler2/crawler.go
new file mode 100644
index 0000000..5c4dba6
--- /dev/null
+++ b/CSC2636/webCrawler2/crawler.go

package main

import (
	"crypto/md5"
	"fmt"
	"log"
	"net/url"
	"os"
	"strconv"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/PuerkitoBio/goquery"
)

type link struct {
	u     *url.URL
	depth int
}

var mutex *sync.Mutex
var Prev map[string]bool
var base string
var links_visited uint64 = 0

func validLink(s string) bool {
	return true
	//return (strings.HasSuffix(s, ".html") || strings.HasSuffix(s, "/") || strings.HasSuffix(s, "\\"))
}

// addLinks queues every in-domain link on the page that has not been
// seen before.
func addLinks(doc *goquery.Document, jobs chan link, current link, depth int, worker_id int) {
	doc.Find("body a").Each(func(index int, item *goquery.Selection) {
		link_s, _ := item.Attr("href")

		d := depth + 1

		u, err := url.Parse(link_s)
		if err != nil {
			panic(err)
		}

		if !u.IsAbs() {
			u = current.u.ResolveReference(u)
		}
		if strings.Contains(u.String(), base) && validLink(u.String()) {
			mutex.Lock()
			if !Prev[u.String()] {
				jobs <- link{u, d}
				Prev[u.String()] = true
			}
			mutex.Unlock()
		}
	})
}

// consume writes the fetched page to ./pages, named by the MD5 of its URL.
func consume(doc *goquery.Document, url link, worker_id int) {
	f, _ := os.Create(fmt.Sprintf("./pages/%x", md5.Sum([]byte(url.u.String()))))
	s, _ := doc.Html()
	f.Write([]byte(s))
}

func worker(done chan bool, jobs chan link, depth int, id int, total uint64) {
	for {
		x := atomic.LoadUint64(&links_visited)
		if x >= total {
			done <- true
			return
		}

		select {
		case j := <-jobs:
			if j.depth < depth {
				atomic.AddUint64(&links_visited, 1)
				doc, err := goquery.NewDocument(j.u.String())
				if err != nil {
					log.Print("Error Reading Document: " + j.u.String() + err.Error())
					break
				}

				fmt.Printf("worker %d Working on %s...\n", id, j.u.String())

				consume(doc, j, id)
				addLinks(doc, jobs, j, j.depth, id)
			}
		case <-time.After(time.Second * 10):
			fmt.Printf("Worker %d done\n", id)
			done <- true
			return
		}
	}
}

func init() {
	mutex = &sync.Mutex{}
	Prev = make(map[string]bool)

	fi, err := os.Lstat("./pages")
	if err != nil {
		fmt.Printf("INIT ERROR: %s\n", err)
	}

	if fi == nil {
		os.Mkdir("./pages", 0755)
	} else if fi.Mode().IsRegular() {
		panic("pages is not a valid directory\n")
	}
}

func main() {
	var d, w, b int
	var t uint64

	if len(os.Args) < 5 {
		fmt.Printf("usage: crawler url depth max_links workers\n")
		os.Exit(1)
	}

	base = strings.TrimPrefix(os.Args[1], "http://www.")
	base = strings.TrimPrefix(base, "https://www.")
	if base == os.Args[1] {
		panic(base)
	}

	d, _ = strconv.Atoi(os.Args[2])
	b, _ = strconv.Atoi(os.Args[3])
	t = uint64(b)
	w, _ = strconv.Atoi(os.Args[4])

	jobs := make(chan link, 1024*1024)
	done := make(chan bool)

	u, err := url.Parse(os.Args[1])
	if err != nil {
		panic(err)
	}

	if !u.IsAbs() {
		panic("Cannot start with relative url")
	}

	// send first job
	jobs <- link{u, 0}

	for i := 0; i < w; i++ {
		go worker(done, jobs, d, i, t)
	}

	for i := 0; i < w; {
		select {
		case <-done:
			i++
		case <-time.After(1 * time.Second):
			if len(jobs) == (1024 * 1024) {
				i = w
			}
		}
	}

	close(done)
	close(jobs)
}
-- 
cgit v1.1