author    Tucker Evans <tuckerevans24@gmail.com>  2017-12-02 16:18:11 -0500
committer Tucker Evans <tuckerevans24@gmail.com>  2019-02-17 20:26:23 -0500
commit    de031c361413fccae6e9c516ca9866825d350253 (patch)
tree      bf6b080fd46bae6aee8b6d19b18c5ea9fec29fbe
parent    fb3b6cda7be526cca46d56e23a8d355d25c7287a (diff)
CSC2621/assignments/webCrawler2: Now will make pages directory; Removed some debugging print statements
-rw-r--r--  webCrawler2/crawler.go | 65
1 file changed, 32 insertions(+), 33 deletions(-)
diff --git a/webCrawler2/crawler.go b/webCrawler2/crawler.go
index 8725df8..1c7bc70 100644
--- a/webCrawler2/crawler.go
+++ b/webCrawler2/crawler.go
@@ -1,16 +1,16 @@
 package main
 
 import "crypto/md5"
-import "sync/atomic"
-import "os"
 import "fmt"
+import "github.com/PuerkitoBio/goquery"
 import "log"
-import "time"
-import "sync"
+import "net/url"
+import "os"
 import "strconv"
 import "strings"
-import "net/url"
-import "github.com/PuerkitoBio/goquery"
+import "sync"
+import "sync/atomic"
+import "time"
 
 type link struct {
 	u *url.URL
@@ -19,7 +19,6 @@ type link struct {
 
 var mutex *sync.Mutex
 var Prev map[string]bool
-var tFile *os.File
 var base string
 
 var links_visited uint64 = 0
@@ -63,7 +62,6 @@ func worker(done chan bool, jobs chan link, depth int, id int, total uint64) {
 	for {
 		x := atomic.LoadUint64(&links_visited)
 		if x >= total {
-			fmt.Printf("Max links reached\n\tlinks: %d total: %d\n", x, total)
 			done <- true
 			return
 		}
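
The guard above polls the shared links_visited counter through sync/atomic before handing out more work. A bare LoadUint64 check leaves a window between the read and the later increment; a minimal standalone sketch of the same bound, reserving with AddUint64 instead (visited and budget are illustrative names, not the crawler's):

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

func main() {
	const budget = 10 // stands in for the crawler's max_links total
	var visited uint64

	var wg sync.WaitGroup
	for id := 0; id < 4; id++ {
		wg.Add(1)
		go func(id int) {
			defer wg.Done()
			for {
				// Reserving with Add closes the gap between a bare Load
				// and a later increment: each worker claims exactly one
				// unit of the budget or stops.
				if atomic.AddUint64(&visited, 1) > budget {
					return
				}
				fmt.Printf("worker %d claimed one link\n", id)
			}
		}(id)
	}
	wg.Wait()
}
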
@@ -79,7 +77,7 @@ func worker(done chan bool, jobs chan link, depth int, id int, total uint64) {
 			break
 		}
 
-		fmt.Printf("Adding links from %s depth: %d...\n", j.u.String(), j.depth)
+		fmt.Printf("worker %d Working on %s...\n", id, j.u.String())
 		consume(doc, j, id)
 		addLinks(doc, jobs, j, j.depth, id)
 
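
consume and addLinks appear here only as context; their bodies are not part of this patch. Assuming addLinks walks anchor tags in the fetched goquery.Document and enqueues absolute URLs, a reduced, hypothetical sketch of that idea might be:

package main

import (
	"fmt"
	"log"
	"net/url"
	"strings"

	"github.com/PuerkitoBio/goquery"
)

func main() {
	// Fetch and parse one page (NewDocument is the era-appropriate helper).
	doc, err := goquery.NewDocument("https://example.com")
	if err != nil {
		log.Fatal(err)
	}
	base, _ := url.Parse("https://example.com")

	// Visit every anchor that carries an href and resolve it against
	// the page URL, the way a crawler would before queueing it.
	doc.Find("a[href]").Each(func(i int, s *goquery.Selection) {
		href, _ := s.Attr("href")
		u, err := url.Parse(strings.TrimSpace(href))
		if err != nil {
			return // skip malformed links
		}
		fmt.Println(base.ResolveReference(u)) // absolute URL to enqueue
	})
}
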
@@ -95,18 +93,26 @@ func init() {
 	mutex = &sync.Mutex{}
 	Prev = make(map[string]bool)
 	var err error
-	tFile, err = os.Create("./test.txt")
+
+	fi, err := os.Lstat("./pages");
 	if err != nil {
-		panic(err)
+		fmt.Printf("INIT ERROR: %s\n", err);
+	}
+
+	if (fi == nil) {
+		os.Mkdir("./pages", 0755);
+	} else if (fi.Mode().IsRegular()) {
+		panic("pages is not a valid directory\n")
 	}
+
 }
 
 func main() {
 	var d, w, b int
 	var t uint64
 
-	if len(os.Args) < 4 {
-		fmt.Printf("usage: crawler url [depth max_links] [workers]\n")
+	if len(os.Args) < 5 {
+		fmt.Printf("usage: crawler url depth max_links workers\n")
 		panic("test")
 	}
 
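
init() now probes ./pages with os.Lstat, creates the directory when the probe returns nothing, and panics when a regular file occupies the name. A sketch of the same ensure-directory check in the more conventional os.Stat/os.IsNotExist shape (not the committed code):

package main

import (
	"fmt"
	"os"
)

// ensureDir creates path if it is missing and rejects it if something
// other than a directory already sits there.
func ensureDir(path string) error {
	fi, err := os.Stat(path)
	switch {
	case os.IsNotExist(err):
		return os.Mkdir(path, 0755) // first run: create it
	case err != nil:
		return err // some other stat failure
	case !fi.IsDir():
		return fmt.Errorf("%s exists but is not a directory", path)
	}
	return nil
}

func main() {
	if err := ensureDir("./pages"); err != nil {
		panic(err)
	}
	fmt.Println("./pages is ready")
}
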
@@ -119,18 +125,9 @@ func main() {
 	d, _ = strconv.Atoi(os.Args[2])
 	b, _ = (strconv.Atoi(os.Args[3]))
 	t = uint64(b)
-	if len(os.Args) >= 4 {
-		b, _ = (strconv.Atoi(os.Args[3]))
-		t = uint64(b)
-
-	} else {
-		t = 10
-	}
-	if len(os.Args) == 5 {
-		w, _ = strconv.Atoi(os.Args[4])
-	} else {
-		w = 4
-	}
+	b, _ = (strconv.Atoi(os.Args[3]))
+	t = uint64(b)
+	w, _ = strconv.Atoi(os.Args[4])
 
 	links := make(chan link, 1024*1024)
 	docs := make(chan *goquery.Document, 100)
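
With the fallback defaults gone, all four positional arguments are mandatory, yet the strconv.Atoi errors are still discarded with _. A checked variant of the same positional parsing could look like this sketch (the real main goes on to wire these values into channels and workers):

package main

import (
	"fmt"
	"os"
	"strconv"
)

func main() {
	if len(os.Args) < 5 {
		fmt.Fprintln(os.Stderr, "usage: crawler url depth max_links workers")
		os.Exit(1)
	}
	// Each conversion is verified instead of silently falling back to 0.
	depth, err := strconv.Atoi(os.Args[2])
	if err != nil {
		fmt.Fprintf(os.Stderr, "bad depth %q: %v\n", os.Args[2], err)
		os.Exit(1)
	}
	maxLinks, err := strconv.ParseUint(os.Args[3], 10, 64) // uint64, like t
	if err != nil {
		fmt.Fprintf(os.Stderr, "bad max_links %q: %v\n", os.Args[3], err)
		os.Exit(1)
	}
	workers, err := strconv.Atoi(os.Args[4])
	if err != nil {
		fmt.Fprintf(os.Stderr, "bad workers %q: %v\n", os.Args[4], err)
		os.Exit(1)
	}
	fmt.Println(depth, maxLinks, workers)
}
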
@@ -152,15 +149,17 @@ func main() {
 		go worker(done, jobs, d, i, t)
 	}
 
-	for i := 0; i < w*2; {
-		<-done
-
-		fmt.Printf("%d done\n", i)
-		i++
+	for i := 0; i < w; {
+		select {
+		case <-done:
+			i++
+		case <-time.After(1 * time.Second):
+			if len(jobs) == (1024 * 1024) {
+				i = w
+			}
+		}
 	}
 
-	tFile.Close()
 	close(done)
-	close(links)
-	close(docs)
+	close(jobs)
 }
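
The replacement shutdown loop still expects one done signal per worker, but wakes every second and stops waiting early when the jobs channel has filled to capacity, the signal that workers are likely blocked on send. A standalone sketch of that select/time.After pattern (queue capacity shrunk from 1024*1024 for illustration):

package main

import (
	"fmt"
	"time"
)

func main() {
	const workers = 4
	done := make(chan bool, workers)
	jobs := make(chan int, 8) // stands in for the crawler's link queue

	for i := 0; i < workers; i++ {
		go func() { done <- true }()
	}

	for i := 0; i < workers; {
		select {
		case <-done:
			i++ // one worker finished cleanly
		case <-time.After(1 * time.Second):
			if len(jobs) == cap(jobs) {
				i = workers // queue saturated; stop waiting for the rest
			}
		}
	}
	fmt.Println("all workers accounted for")
	close(done)
	close(jobs)
}
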