 webCrawler2/crawler.go | 162 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 162 insertions(+), 0 deletions(-)
diff --git a/webCrawler2/crawler.go b/webCrawler2/crawler.go
new file mode 100644
index 0000000..5c41b44
--- /dev/null
+++ b/webCrawler2/crawler.go
@@ -0,0 +1,162 @@
+package main
+
+import "os"
+import "fmt"
+import "log"
+import "time"
+import "sync"
+import "strconv"
+import "strings"
+import "net/url"
+import "github.com/PuerkitoBio/goquery"
+
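+// link pairs a parsed URL with the depth at which it was discovered.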
+type link struct {
+ u *url.URL
+ depth int
+}
+
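+// Shared crawler state: Prev records every URL already queued (guarded by
+// mutex), tFile is the output file opened in init, and base is the domain
+// substring that keeps the crawl on the starting site.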
+var mutex *sync.Mutex
+var Prev map[string]bool
+var tFile *os.File
+var base string
+
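+// validLink reports whether a URL should be crawled; the suffix filter below
+// is currently disabled, so every URL is accepted.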
+func validLink(s string) bool {
+ return true
+ //return (strings.HasSuffix(s, ".html") || strings.HasSuffix(s, "/") || strings.HasSuffix(s, "\\"))
+}
+
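+// addLinks scans every anchor in the page body, resolves relative hrefs
+// against the current page, and enqueues each unseen link on the base domain.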
+func addLinks(doc *goquery.Document, jobs chan link, current link, depth int, worker_id int) {
+ doc.Find("body a").Each(func(index int, item *goquery.Selection) {
+ link_s, _ := item.Attr("href")
+
+ d := depth + 1
+
+ u, err := url.Parse(link_s)
+ if err != nil {
+ panic(err)
+ }
+
+ if !u.IsAbs() {
+ u = current.u.ResolveReference(u)
+ }
+ if strings.Contains(u.String(), base) && validLink(u.String()) {
+ mutex.Lock()
+ if !Prev[u.String()] {
+ jobs <- link{u, d}
+ Prev[u.String()] = true
+ }
+ mutex.Unlock()
+ } else {
+ fmt.Printf("%s rejected: in base domain: %v, valid link: %v\n", u.String(), strings.Contains(u.String(), base), validLink(u.String()))
+ }
+ })
+}
+
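+// consume does the per-page work; for now it only prints the page title.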
+func consume(doc *goquery.Document, worker_id int) {
+ var title string
+ doc.Find("title").Each(func(index int, item *goquery.Selection) {
+ title = item.Text()
+ })
+ fmt.Println(title)
+}
+
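+// worker pulls links from jobs, fetches each page that is still within the
+// depth limit, consumes it, and enqueues its outgoing links. A worker exits
+// after ten seconds without receiving any work.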
+func worker(done chan bool, jobs chan link, depth int, id int) {
+ for {
+ fmt.Printf("%d Waiting... buffered: %d\n", id, len(jobs))
+ select {
+ case j := <-jobs:
+ if j.depth < depth {
+ doc, err := goquery.NewDocument(j.u.String())
+ if err != nil {
+ log.Printf("error reading document %s: %v", j.u.String(), err)
+ break
+ }
+
+ fmt.Printf("worker %d Working on %s depth: %d...\n", id, j.u.String(), j.depth)
+
+ consume(doc, id)
+ fmt.Println("Adding Links")
+ addLinks(doc, jobs, j, j.depth, id)
+ fmt.Println("Done Adding Links")
+ }
+ case <-time.After(time.Second * 10):
+ fmt.Printf("Worker %d done\n", id)
+ done <- true
+ return
+ }
+ }
+}
+
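+// init prepares the shared state: the visited-set mutex, the map itself, and
+// the output file.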
+func init() {
+ mutex = &sync.Mutex{}
+ Prev = make(map[string]bool)
+ var err error
+ tFile, err = os.Create("./test.txt")
+ if err != nil {
+ panic(err)
+ }
+}
+
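+// main parses the command line, seeds the job queue with the start URL,
+// launches the worker pool, and waits for the crawl to finish.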
+func main() {
+ var d, w int
+
+ if len(os.Args) < 3 {
+ fmt.Printf("usage: crawler url depth [workers]\n")
+ os.Exit(1)
+ }
+
+ // base is the bare domain (scheme and "www." stripped) used to keep the
+ // crawl on the starting site.
+ base = strings.TrimPrefix(os.Args[1], "http://www.")
+ base = strings.TrimPrefix(base, "https://www.")
+ if base == os.Args[1] {
+ panic("url must begin with http://www. or https://www.")
+ }
+ fmt.Println(base)
+
+ var err error
+ d, err = strconv.Atoi(os.Args[2])
+ if err != nil {
+ panic("depth must be an integer: " + err.Error())
+ }
+
+ if len(os.Args) == 4 {
+ w, err = strconv.Atoi(os.Args[3])
+ if err != nil {
+ panic("workers must be an integer: " + err.Error())
+ }
+ } else {
+ w = 4
+ }
+
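+ // A large buffer lets workers enqueue discovered links without blocking.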
+ jobs := make(chan link, 1024*1024)
+ done := make(chan bool)
+
+ u, err := url.Parse(os.Args[1])
+ if err != nil {
+ panic(err)
+ }
+
+ if !u.IsAbs() {
+ panic("Cannot start with relative url")
+ }
+ fmt.Println(u)
+ fmt.Println(u.RawPath)
+ fmt.Println(u.String())
+ fmt.Println("jobs")
+ // Send the first job: the start URL at depth 0.
+ jobs <- link{u, 0}
+
+ for i := 0; i < w; i++ {
+ go worker(done, jobs, d, i)
+ }
+
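+ // Wait for every worker to report done; the one-second tick also bails out
+ // early if the job queue has filled to capacity.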
+ for i := 0; i < w; {
+ select {
+ case <-done:
+ fmt.Printf("%d done\n", i)
+ i++
+ case <-time.After(1 * time.Second):
+ if len(jobs) == (1024 * 1024) {
+ i = w
+ }
+ }
+ }
+
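+ // Shut down: close the output file and both channels. Any goroutines still
+ // running end when main returns.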
+ tFile.Close()
+ close(done)
+ fmt.Println(len(jobs))
+ close(jobs)
+}