Diffstat (limited to 'CSC2636/webCrawler2')
-rw-r--r--   CSC2636/webCrawler2/README.rst    13
-rw-r--r--   CSC2636/webCrawler2/crawler.go   164
2 files changed, 177 insertions, 0 deletions
diff --git a/CSC2636/webCrawler2/README.rst b/CSC2636/webCrawler2/README.rst
new file mode 100644
index 0000000..1168fb9
--- /dev/null
+++ b/CSC2636/webCrawler2/README.rst
@@ -0,0 +1,13 @@
+===========
+Web Crawler
+===========
+
+Web crawler for Web Science class
+
+Dependencies
+============
+- `GoQuery <http://www.github.com/PuerkitoBio/goquery>`_.
+
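+Usage
+=====
+A minimal example invocation (the argument order is taken from the usage
+message in ``crawler.go``; the URL and numbers below are only placeholders)::
+
+    go get github.com/PuerkitoBio/goquery
+    go build -o crawler crawler.go
+    ./crawler http://www.example.com 2 1000 4
+
+The seed URL must start with ``http://www.`` or ``https://www.``, and fetched
+pages are written to ``./pages/`` under an MD5 hash of their URL.
+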
+Authors
+=======
+- Tucker Evans
diff --git a/CSC2636/webCrawler2/crawler.go b/CSC2636/webCrawler2/crawler.go
new file mode 100644
index 0000000..5c4dba6
--- /dev/null
+++ b/CSC2636/webCrawler2/crawler.go
@@ -0,0 +1,164 @@
+package main
+
+import (
+ "crypto/md5"
+ "fmt"
+ "log"
+ "net/url"
+ "os"
+ "strconv"
+ "strings"
+ "sync"
+ "sync/atomic"
+ "time"
+
+ "github.com/PuerkitoBio/goquery"
+)
+
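+// link pairs a URL with the depth at which it was discovered.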
+type link struct {
+ u *url.URL
+ depth int
+}
+
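+// Shared crawler state: Prev records URLs that have already been queued
+// (guarded by mutex), base keeps the crawl on the seed site, and
+// links_visited counts how many jobs the workers have taken.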
+var mutex *sync.Mutex
+var Prev map[string]bool
+var base string
+var links_visited uint64 = 0
+
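+// validLink reports whether a URL is worth crawling; the suffix filter is
+// currently disabled, so every URL is accepted.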
+func validLink(s string) bool {
+ return true
+ //return (strings.HasSuffix(s, ".html") || strings.HasSuffix(s, "/") || strings.HasSuffix(s, "\\"))
+}
+
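+// addLinks pulls every anchor out of the document body, resolves it against
+// the current page, and queues links on the same site that have not been
+// seen before.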
+func addLinks(doc *goquery.Document, jobs chan link, current link, depth int, worker_id int) {
+ doc.Find("body a").Each(func(index int, item *goquery.Selection) {
+ link_s, exists := item.Attr("href")
+ if !exists {
+ return
+ }
+
+ d := depth + 1
+
+ u, err := url.Parse(link_s)
+ if err != nil {
+ // Skip malformed hrefs instead of crashing the whole crawl.
+ log.Printf("worker %d: skipping bad href %q: %v", worker_id, link_s, err)
+ return
+ }
+
+ if !u.IsAbs() {
+ u = current.u.ResolveReference(u)
+ }
+ if strings.Contains(u.String(), base) && validLink(u.String()) {
+ mutex.Lock()
+ if !Prev[u.String()] {
+ jobs <- link{u, d}
+ Prev[u.String()] = true
+ }
+ mutex.Unlock()
+ }
+ })
+}
+
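+// consume saves the fetched document to ./pages/<md5 of the URL>.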
+func consume(doc *goquery.Document, url link, worker_id int) {
+ f, err := os.Create(fmt.Sprintf("./pages/%x", md5.Sum([]byte(url.u.String()))))
+ if err != nil {
+ log.Printf("worker %d: could not create page file: %v", worker_id, err)
+ return
+ }
+ defer f.Close()
+ s, _ := doc.Html()
+ f.Write([]byte(s))
+}
+
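+// worker takes jobs from the queue until the link budget is spent or no job
+// arrives for ten seconds, saving each page and queueing its out-links.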
+func worker(done chan bool, jobs chan link, depth int, id int, total uint64) {
+ for {
+ x := atomic.LoadUint64(&links_visited)
+ if x >= total {
+ done <- true
+ return
+ }
+
+ select {
+ case j := <-jobs:
+ // Count a link only once a job has actually been taken from the queue.
+ atomic.AddUint64(&links_visited, 1)
+ if j.depth < depth {
+ doc, err := goquery.NewDocument(j.u.String())
+ if err != nil {
+ log.Printf("error reading document %s: %s", j.u.String(), err.Error())
+ break
+ }
+
+ fmt.Printf("Worker %d working on %s...\n", id, j.u.String())
+
+ consume(doc, j, id)
+ addLinks(doc, jobs, j, j.depth, id)
+ }
+ case <-time.After(time.Second * 10):
+ fmt.Printf("Worker %d done\n", id)
+ done <- true
+ return
+ }
+ }
+}
+
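+// init sets up the shared state and makes sure the ./pages directory exists.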
+func init() {
+ mutex = &sync.Mutex{}
+ Prev = make(map[string]bool)
+
+ fi, err := os.Lstat("./pages")
+ if err != nil {
+ fmt.Printf("INIT ERROR: %s\n", err)
+ }
+
+ if fi == nil {
+ os.Mkdir("./pages", 0755)
+ } else if fi.Mode().IsRegular() {
+ panic("pages is not a valid directory\n")
+ }
+}
+
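+// main parses the command line (url, depth, max_links, workers), seeds the
+// job queue with the start URL, and waits for the worker pool to finish.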
+func main() {
+ var d, w, b int
+ var t uint64
+
+ if len(os.Args) < 5 {
+ fmt.Printf("usage: crawler url depth max_links workers\n")
+ os.Exit(1)
+ }
+
+ // base restricts the crawl to the seed site.
+ base = strings.TrimPrefix(os.Args[1], "http://www.")
+ base = strings.TrimPrefix(base, "https://www.")
+ if base == os.Args[1] {
+ panic("URL must begin with http://www. or https://www.")
+ }
+
+ d, _ = strconv.Atoi(os.Args[2])
+ b, _ = strconv.Atoi(os.Args[3])
+ t = uint64(b)
+ w, _ = strconv.Atoi(os.Args[4])
+
+ jobs := make(chan link, 1024*1024)
+ done := make(chan bool)
+
+ u, err := url.Parse(os.Args[1])
+ if err != nil {
+ panic(err)
+ }
+
+ if !u.IsAbs() {
+ panic("Cannot start with relative url")
+ }
+
+ // Send the first job: the seed URL at depth 0.
+ jobs <- link{u, 0}
+
+ for i := 0; i < w; i++ {
+ go worker(done, jobs, d, i, t)
+ }
+
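+ // Wait for every worker to report done; give up early if the job queue fills up.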
+ for i := 0; i < w; {
+ select {
+ case <-done:
+ i++
+ case <-time.After(1 * time.Second):
+ if len(jobs) == (1024 * 1024) {
+ i = w
+ }
+ }
+ }
+
+ close(done)
+ close(jobs)
+}