diff options
Diffstat (limited to 'CSC2636/webCrawler2')
-rw-r--r-- | CSC2636/webCrawler2/README.rst | 13 | ||||
-rw-r--r-- | CSC2636/webCrawler2/crawler.go | 164 |
2 files changed, 177 insertions, 0 deletions
diff --git a/CSC2636/webCrawler2/README.rst b/CSC2636/webCrawler2/README.rst new file mode 100644 index 0000000..1168fb9 --- /dev/null +++ b/CSC2636/webCrawler2/README.rst @@ -0,0 +1,13 @@ +=========== +Web Crawler +=========== + +Web crawler for Web Science class + +Dependencies +============ +- `GoQuery <http://www.github.com/PuerkitoBio/goquery>`_. + +Authors +======= +- Tucker Evans diff --git a/CSC2636/webCrawler2/crawler.go b/CSC2636/webCrawler2/crawler.go new file mode 100644 index 0000000..5c4dba6 --- /dev/null +++ b/CSC2636/webCrawler2/crawler.go @@ -0,0 +1,164 @@ +package main + +import "crypto/md5" +import "fmt" +import "github.com/PuerkitoBio/goquery" +import "log" +import "net/url" +import "os" +import "strconv" +import "strings" +import "sync" +import "sync/atomic" +import "time" + +type link struct { + u *url.URL + depth int +} + +var mutex *sync.Mutex +var Prev map[string]bool +var base string +var links_visited uint64 = 0 + +func validLink(s string) bool { + return true + //return (strings.HasSuffix(s, ".html") || strings.HasSuffix(s, "/") || strings.HasSuffix(s, "\\")) +} + +func addLinks(doc *goquery.Document, jobs chan link, current link, depth int, worker_id int) { + doc.Find("body a").Each(func(index int, item *goquery.Selection) { + link_s, _ := item.Attr("href") + + d := depth + 1 + + u, err := url.Parse(link_s) + if err != nil { + panic(err) + } + + if !u.IsAbs() { + u = current.u.ResolveReference(u) + } + if strings.Contains(u.String(), base) && validLink(u.String()) { + mutex.Lock() + if !Prev[u.String()] { + jobs <- link{u, d} + Prev[u.String()] = true + } + mutex.Unlock() + } + }) +} + +func consume(doc *goquery.Document, url link, worker_id int) { + f, _ := os.Create(fmt.Sprintf("./pages/%x", md5.Sum([]byte(url.u.String())))) + s, _ := doc.Html() + f.Write([]byte(s)) +} + +func worker(done chan bool, jobs chan link, depth int, id int, total uint64) { + for { + x := atomic.LoadUint64(&links_visited) + if x >= total { + done <- true + return + } + + atomic.AddUint64(&links_visited, 1) + select { + case j := <-jobs: + if j.depth < depth { + doc, err := goquery.NewDocument(j.u.String()) + if err != nil { + log.Print("Error Reading Document: " + j.u.String() + err.Error()) + break + } + + fmt.Printf("worker %d Working on %s...\n", id, j.u.String()) + + consume(doc, j, id) + addLinks(doc, jobs, j, j.depth, id) + } + case <-time.After(time.Second * 10): + fmt.Printf("Worker %d done\n", id) + done <- true + return + } + } +} + +func init() { + mutex = &sync.Mutex{} + Prev = make(map[string]bool) + var err error + + fi, err := os.Lstat("./pages"); + if err != nil { + fmt.Printf("INIT ERROR: %s\n", err); + } + + if (fi == nil) { + os.Mkdir("./pages", 0755); + } else if (fi.Mode().IsRegular()) { + panic("pages is not a valid directory\n") + } + +} + +func main() { + var d, w, b int + var t uint64 + + if len(os.Args) < 5 { + fmt.Printf("usage: crawler url depth max_links workers\n") + panic("test") + } + + base = strings.TrimPrefix(os.Args[1], "http://www.") + base = strings.TrimPrefix(base, "https://www.") + if base == os.Args[1] { + panic(base) + } + + d, _ = strconv.Atoi(os.Args[2]) + b, _ = (strconv.Atoi(os.Args[3])) + t = uint64(b) + b, _ = (strconv.Atoi(os.Args[3])) + t = uint64(b) + w, _ = strconv.Atoi(os.Args[4]) + + jobs := make(chan link, 1024*1024) + done := make(chan bool) + + u, err := url.Parse(os.Args[1]) + if err != nil { + panic(err) + } + + if !u.IsAbs() { + panic("Cannot start with relative url") + } + jobs <- link{u, 0} + + //send first job + + for i := 0; i < w; i++ { + go worker(done, jobs, d, i, t) + } + + for i := 0; i < w; { + select { + case <-done: + i++ + case <-time.After(1 * time.Second): + if len(jobs) == (1024 * 1024) { + i = w + } + } + } + + close(done) + close(jobs) +} |