author     Tucker Evans <tuckerevans24@gmail.com>   2019-02-18 07:35:54 -0500
committer  Tucker Evans <tuckerevans24@gmail.com>   2019-02-18 07:35:54 -0500
commit     e8b1808eaf87a49e4c34ebbfb66854baa627418c (patch)
tree       8a4bb15321992702b6b26e34bd2ed3a55bb7b0d9 /webCrawler2
parent     6cc5652a8af3361288393718ec2adb2889c9af1e (diff)
Moves assignments to given course folder.
Diffstat (limited to 'webCrawler2')
-rw-r--r--  webCrawler2/README.rst   13
-rw-r--r--  webCrawler2/crawler.go  164
2 files changed, 0 insertions, 177 deletions
diff --git a/webCrawler2/README.rst b/webCrawler2/README.rst
deleted file mode 100644
index 1168fb9..0000000
--- a/webCrawler2/README.rst
+++ /dev/null
@@ -1,13 +0,0 @@
-===========
-Web Crawler
-===========
-
-Web crawler for Web Science class
-
-Dependencies
-============
-- `GoQuery <http://www.github.com/PuerkitoBio/goquery>`_.
-
-Authors
-=======
-- Tucker Evans
diff --git a/webCrawler2/crawler.go b/webCrawler2/crawler.go
deleted file mode 100644
index 5c4dba6..0000000
--- a/webCrawler2/crawler.go
+++ /dev/null
@@ -1,164 +0,0 @@
-package main
-
-import "crypto/md5"
-import "fmt"
-import "github.com/PuerkitoBio/goquery"
-import "log"
-import "net/url"
-import "os"
-import "strconv"
-import "strings"
-import "sync"
-import "sync/atomic"
-import "time"
-
-type link struct {
-	u     *url.URL
-	depth int
-}
-
-var mutex *sync.Mutex
-var Prev map[string]bool
-var base string
-var links_visited uint64 = 0
-
-func validLink(s string) bool {
-	return true
-	//return (strings.HasSuffix(s, ".html") || strings.HasSuffix(s, "/") || strings.HasSuffix(s, "\\"))
-}
-
-func addLinks(doc *goquery.Document, jobs chan link, current link, depth int, worker_id int) {
-	doc.Find("body a").Each(func(index int, item *goquery.Selection) {
-		link_s, _ := item.Attr("href")
-
-		d := depth + 1
-
-		u, err := url.Parse(link_s)
-		if err != nil {
-			panic(err)
-		}
-
-		if !u.IsAbs() {
-			u = current.u.ResolveReference(u)
-		}
-		if strings.Contains(u.String(), base) && validLink(u.String()) {
-			mutex.Lock()
-			if !Prev[u.String()] {
-				jobs <- link{u, d}
-				Prev[u.String()] = true
-			}
-			mutex.Unlock()
-		}
-	})
-}
-
-func consume(doc *goquery.Document, url link, worker_id int) {
-	f, _ := os.Create(fmt.Sprintf("./pages/%x", md5.Sum([]byte(url.u.String()))))
-	s, _ := doc.Html()
-	f.Write([]byte(s))
-}
-
-func worker(done chan bool, jobs chan link, depth int, id int, total uint64) {
-	for {
-		x := atomic.LoadUint64(&links_visited)
-		if x >= total {
-			done <- true
-			return
-		}
-
-		atomic.AddUint64(&links_visited, 1)
-		select {
-		case j := <-jobs:
-			if j.depth < depth {
-				doc, err := goquery.NewDocument(j.u.String())
-				if err != nil {
-					log.Print("Error Reading Document: " + j.u.String() + err.Error())
-					break
-				}
-
-				fmt.Printf("worker %d Working on %s...\n", id, j.u.String())
-
-				consume(doc, j, id)
-				addLinks(doc, jobs, j, j.depth, id)
-			}
-		case <-time.After(time.Second * 10):
-			fmt.Printf("Worker %d done\n", id)
-			done <- true
-			return
-		}
-	}
-}
-
-func init() {
-	mutex = &sync.Mutex{}
-	Prev = make(map[string]bool)
-	var err error
-
-	fi, err := os.Lstat("./pages");
-	if err != nil {
-		fmt.Printf("INIT ERROR: %s\n", err);
-	}
-
-	if (fi == nil) {
-		os.Mkdir("./pages", 0755);
-	} else if (fi.Mode().IsRegular()) {
-		panic("pages is not a valid directory\n")
-	}
-
-}
-
-func main() {
-	var d, w, b int
-	var t uint64
-
-	if len(os.Args) < 5 {
-		fmt.Printf("usage: crawler url depth max_links workers\n")
-		panic("test")
-	}
-
-	base = strings.TrimPrefix(os.Args[1], "http://www.")
-	base = strings.TrimPrefix(base, "https://www.")
-	if base == os.Args[1] {
-		panic(base)
-	}
-
-	d, _ = strconv.Atoi(os.Args[2])
-	b, _ = (strconv.Atoi(os.Args[3]))
-	t = uint64(b)
-	b, _ = (strconv.Atoi(os.Args[3]))
-	t = uint64(b)
-	w, _ = strconv.Atoi(os.Args[4])
-
-	jobs := make(chan link, 1024*1024)
-	done := make(chan bool)
-
-	u, err := url.Parse(os.Args[1])
-	if err != nil {
-		panic(err)
-	}
-
-	if !u.IsAbs() {
-		panic("Cannot start with relative url")
-	}
-	jobs <- link{u, 0}
-
-	//send first job
-
-	for i := 0; i < w; i++ {
-		go worker(done, jobs, d, i, t)
-	}
-
-	for i := 0; i < w; {
-		select {
-		case <-done:
-			i++
-		case <-time.After(1 * time.Second):
-			if len(jobs) == (1024 * 1024) {
-				i = w
-			}
-		}
-	}

-	close(done)
-	close(jobs)
-}
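The core pattern in the deleted crawler.go is a mutex-guarded visited map (Prev) that deduplicates URLs before they enter the jobs channel, paired with url.ResolveReference to absolutize relative hrefs against the current page. A minimal, self-contained sketch of that dedup-and-resolve pattern follows; the visited type, its add method, and the example.com URLs are illustrative additions, not part of the original file.

	package main

	import (
		"fmt"
		"net/url"
		"sync"
	)

	// visited is a mutex-guarded set of URL strings, mirroring the
	// Prev map + mutex pair in the crawler above. (Illustrative helper,
	// not from the original source.)
	type visited struct {
		mu   sync.Mutex
		seen map[string]bool
	}

	// add reports whether u was newly added (true) or already seen (false).
	// Checking and setting under one lock is what makes the dedup safe
	// when many workers call it concurrently.
	func (v *visited) add(u string) bool {
		v.mu.Lock()
		defer v.mu.Unlock()
		if v.seen[u] {
			return false
		}
		v.seen[u] = true
		return true
	}

	func main() {
		v := &visited{seen: make(map[string]bool)}

		base, _ := url.Parse("http://www.example.com/dir/")
		for _, href := range []string{"page.html", "page.html", "../other/"} {
			ref, err := url.Parse(href)
			if err != nil {
				continue
			}
			// Resolve relative hrefs against the current page, as
			// addLinks does with current.u.ResolveReference(u).
			abs := base.ResolveReference(ref)
			fmt.Println(abs, "new:", v.add(abs.String()))
		}
	}

Running the sketch prints the first page.html as new, rejects the second as a duplicate, and resolves ../other/ to http://www.example.com/other/. crawler.go inlines the same check-then-set under its global mutex before pushing a link onto the jobs channel, so each URL is enqueued at most once.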