diff options
author | Tucker Evans <tuckerevans24@gmail.com> | 2017-12-06 01:06:05 -0500 |
---|---|---|
committer | Tucker Evans <tuckerevans24@gmail.com> | 2017-12-17 13:20:31 -0500 |
commit | 1a2153af95331cd77803d40633f581e1f2b6507e (patch) | |
tree | 328432642dda4b66144e169458a904d93e348668 | |
parent | d0a50b453fab206905598181173a2cfbefb2eea5 (diff) |
CSC2621/assignments/search: Started on parse function, still errors: see TODO
-rw-r--r-- | search/assign.rst | 161 | ||||
-rw-r--r-- | search/index.go | 106 |
2 files changed, 211 insertions, 56 deletions
diff --git a/search/assign.rst b/search/assign.rst index 4f3ba45..70d0b08 100644 --- a/search/assign.rst +++ b/search/assign.rst @@ -4,18 +4,41 @@ Project 2: Search Engine **CS2621– Web Science** -**100 points** - -You are to create a web search engine that works at the command line. To do this, you will write two Python scripts, -indexer.py and search.py. Indexer.py should do the following: - -1. After performing a crawl (using your other Python script), read all the HTML files that were stored in the “pages” directory. For each document, extract the title and the text from the body of the page (read the Beautiful Soup documentation to find out how). Beautiful Soup will include the text of the page in the content of the page, and that is OK. Beautiful Soup may also break on some pages and include HTML as text, but we will not worry about these exceptions or bugs. - -2. All text should be converted to lowercase and non-alphanumeric characters should be ignored. So “123-456” would become “123” and “456”, and “joe@yahoo.com” would become “joe”, “yahoo”, “com”. Ignore the following stop words: a, an, and, are, as, at, be, by, for, from, has, he, in, is, it, its, of, on, that, the, to, was, were, will, with. Do not perform stemming. - -3. A single inverted index should be created for the document corpus which maintains the document ID (numbered 1…n in order of the pages found in the “pages” directory), a 1 or 0 if the text is found in the title, and the term frequency from the body (normalized by the total number of tokens in the document after removing stop words). - -4. After indexer.py has finished indexing all the web pages, it should output the index to index.dat which looks likethis: +*100 points* + +You are to create a web search engine that works at the command line. +To do this, you will write two Python scripts, indexer.py and +search.py. + +Indexer +======= + +Indexer.py should do the following: + +1. After performing a crawl (using your other Python script), read all + the HTML files that were stored in the “pages” directory. For each + document, extract the title and the text from the body of the page + (read the Beautiful Soup documentation to find out how). Beautiful + Soup will include the text of the page in the content of the page, + and that is OK. Beautiful Soup may also break on some pages and + include HTML as text, but we will not worry about these + exceptions or bugs. + +2. All text should be converted to lowercase and non-alphanumeric + characters should be ignored. So “123-456” would become “123” and + “456”, and “joe@yahoo.com” would become “joe”, “yahoo”, “com”. + Ignore the following stop words: a, an, and, are, as, at, be, by, + for, from, has, he, in, is, it, its, of, on, that, the, to, was, + were, will, with. Do not perform stemming. + +3. A single inverted index should be created for the document corpus + which maintains the document ID (numbered 1…n in order of the pages + found in the “pages” directory), a 1 or 0 if the text is found in + the title, and the term frequency from the body (normalized by the + total number of tokens in the document after removing stop words). + +4. After indexer.py has finished indexing all the web pages, it should + output the index to index.dat which looks likethis: :: @@ -32,17 +55,24 @@ indexer.py and search.py. Indexer.py should do the following: etc… .. note :: - The indexed words are alphabetized, and there are 3 spaces before sets of three numbers (each separated - by a single space) which are: doc ID, title (0 or 1), and normalized body TF (rounded to 3 decimal places). For - example, the term white was found only in document 2; it was somewhere in the title and made up 1.8% of all the - words in the document. + The indexed words are alphabetized, and there are 3 spaces before + sets of three numbers (each separated by a single space) which are: + doc ID, title (0 or 1), and normalized body TF (rounded to 3 decimal + places). For example, the term white was found only in document 2; + it was somewhere in the title and made up 1.8% of all the words in + the document. + +5. It may take some time for your program to run, so you should output + information about the program’s status as it indexes the crawled + pages. Outputting what file is being worked on would be helpful to + the user who is waiting for the program to finish its work. -5. It may take some time for your program to run, so you should output information about the program’s status as it -indexes the crawled pages. Outputting what file is being worked on would be helpful to the user who is waiting for -the program to finish its work. +Search +====== -After the index is written to index.dat, the search.py script will allow the user to search the corpus for specific words. -Here is how it should operate: +After the index is written to index.dat, the search.py script will +allow the user to search the corpus for specific words. Here is how +it should operate: 1. First, read the search phrase at the command line. Examples: @@ -51,31 +81,40 @@ Here is how it should operate: $ search.py bisons $ search.py "landmark college" -If no command line argument is supplied, the program should tell the user a search term is required and terminate. -Ignore any command-line arguments after the first. +If no command line argument is supplied, the program should tell the +user a search term is required and terminate. Ignore any command-line +arguments after the first. -2. Next, the program should read the index from index.dat into memory. Note that you may want to use similar data -structures used in indexer.py, so you should write your programs in a way where you share code without having -redundant code in each script. (It’s OK to introduce new .py files to your project.) +2. Next, the program should read the index from index.dat into memory. + Note that you may want to use similar data structures used in + indexer.py, so you should write your programs in a way where you + share code without having redundant code in each script. (It’s OK + to introduce new .py files to your project.) -3. For simplicity, all queries will be assumed to use boolean ANDs, and we will not implement phrase search. For -example, the query landmark college should generate a boolean search for landmark AND college, so only -documents containing both terms should be considered amatch. +3. For simplicity, all queries will be assumed to use boolean ANDs, + and we will not implement phrase search. For example, the query + landmark college should generate a boolean search for landmark AND + college, so only documents containing both terms should be + considered amatch. -4. Remove any stop words from the query as was done when indexing the documents. +4. Remove any stop words from the query as was done when indexing the + documents. -5. After determining which documents match the search terms, calculate the relevancy score for each document: -relevancy score = 0.9 * body TF + 0.1 * title score -Do this for each term, and compute the average relevancy score for all terms. So if the search was for landmark -college, you would compute the score for landmark and the score for college and compute the average to -determine the overall relevancy score. +5. After determining which documents match the search terms, calculate + the relevancy score for each document: relevancy score = 0.9 * body + TF + 0.1 * title score Do this for each term, and compute the + average relevancy score for all terms. So if the search was for + landmark college, you would compute the score for landmark and the + score for college and compute the average to determine the overall + relevancy score. -6. The total number of results should first be displayed. Then display every document ID and score (out to 3 decimal -places) ordered by score, and number the results. Example: -Results: +6. The total number of results should first be displayed. Then display + every document ID and score (out to 3 decimal places) ordered by + score, and number the results. Example: -:: +.. code:: bash + Results: 1. docID 2. docID 3. docID @@ -97,18 +136,21 @@ Results: 0.350 0.108 -Bonus: You can receive 5 bonus points by implementing phrase search. So when the user searches for “landmark -college”, assume they want only documents with that exact phrase. To accomplish this, you will need to store the -positions of the terms that are stored in the inverted index. Then use those positions to ensure the phrase matches -successive positions. -Zip your entire project directory and submit it to Canvas before it is due. Make sure your output matches the specifications -precisely to avoid losing any points. If you use any code you find in the Web, you must document the source in your +**Bonus:** You can receive 5 bonus points by implementing phrase search. +So when the user searches for “landmark college”, assume they want +only documents with that exact phrase. To accomplish this, you will +need to store the positions of the terms that are stored in the +inverted index. Then use those positions to ensure the phrase matches +successive positions. Zip your entire project directory and submit it +to Canvas before it is due. Make sure your output matches the +specifications precisely to avoid losing any points. If you use any +code you find in the Web, you must document the source in your program. Test Data ========= -a.html +*a.html* .. code:: html @@ -117,7 +159,7 @@ a.html this 123-456. </body> -b.html +*b.html* .. code:: html @@ -130,7 +172,7 @@ b.html </body> </html> -c.html +*c.html* .. code:: html @@ -138,7 +180,7 @@ c.html This is a test. </body> -Inverted index: +*Inverted index:* .. code:: @@ -163,15 +205,22 @@ Inverted index: 2 0 0.500 Search for "test this" results in the following: -Results: 2 -1. docID 2, score 0.450 -2. docID 1, score 0.230 + +:: + + Results: 2 + 1. docID 2, score 0.450 + 2. docID 1, score 0.230 Search for "test patriots go" results in the following: -Results: 1 -1. docID 3, score 0.310 + +:: + + Results: 1 + 1. docID 3, score 0.310 Search for "cool patriots" results in the following: -Results: 0 - +:: + + Results: 0 diff --git a/search/index.go b/search/index.go index 3df9547..7f202fb 100644 --- a/search/index.go +++ b/search/index.go @@ -1,10 +1,116 @@ package main + import "os" +import "log" import "fmt" import "github.com/PuerkitoBio/goquery" import "strings" +import "flag" +import "errors" + +type index struct { + doc string; + title bool; + freq int; +} + +type document struct{ + title []string; + text []string; +} + + +func newDocument() (*document) { + return &document{nil, nil}; +} + +func parseDoc(fd *os.File) (*document, error) { + var err error; + var text, t_text string; + var doc *goquery.Document; + var body, title *goquery.Selection; + var r_doc *document; + + doc, err = goquery.NewDocumentFromReader(fd); + if (err != nil) { + log.Printf("goquery error: %s\n", err); + return nil, errors.New("Can't create goquery documnt"); + } + + //TODO test kennygrant/sanatize instead of goquery + body = doc.Find("body").Not("style").Not("script"); + title = doc.Find("title"); + + text = body.Text(); + t_text = title.Text(); + + r_doc = newDocument(); + + r_doc.text = strings.Fields(text); + r_doc.title = strings.Fields(t_text); + if (len(r_doc.text) == 1) { + log.Printf("not splittin!!!!!!!!!!!\n"); + os.Exit(1); + } + + return r_doc, nil; +} + +func init() { + log.SetOutput(os.Stderr); +} func main() { +// var words map[string]index + var p_dir string//, fname string; + var err error; + var i int; + + var files []os.FileInfo; + var dir, fd *os.File; + var dir_info os.FileInfo; + var dir_mode os.FileMode; + + var doc *document; + + flag.StringVar(&p_dir, "d", "./pages", "pages directory"); + + flag.Parse(); + + dir, err = os.Open(p_dir); + if (err != nil) { + log.Printf("Error accessing \"%s\":\t%s\n", p_dir, err); + os.Exit(1); + } + + dir_info, err = dir.Stat(); + dir_mode = dir_info.Mode(); + + if (!dir_mode.IsDir()) { + log.Printf("\"%s\" is not a directory\n", p_dir); + os.Exit(1); + } + + files, err = dir.Readdir(0); + if (err != nil) { + log.Printf("Error reading %s\n", p_dir); + os.Exit(1); + } + + for i=0; i < len(files) && i < 1; i++ { + fd, err = os.Open(fmt.Sprintf("%s/%s", dir_info.Name(), files[i].Name())); + if (err != nil) { + log.Printf("Error reading %s/%s\n", dir_info.Name(), files[i].Name()); + } else { + doc, err = parseDoc(fd); + if (err != nil) { + log.Printf("Error parsing %s/%s\n", dir_info.Name(), files[i].Name()); + } else { + fmt.Println(doc.text); + fmt.Println(doc.title); + } + } + } } |