package main

import "os"
import "sort"
import "golang.org/x/net/html"
import "log"
import "fmt"
import "github.com/PuerkitoBio/goquery"
import "github.com/kennygrant/sanitize"
import "strings"
import "flag"
import "errors"
import "regexp"

// document holds what parseDoc extracted from a single input file.
type document struct {
	fname string   // name reported by the file's os.FileInfo
	title []string // title words, as split by parseDoc
	text  []string // body words, as split by parseDoc
}

// index is one posting: it ties a word to one document that contains it.
type index struct {
	doc   *document // document the word was found in
	title bool      // true once the word has been seen among the document's title words
	freq  int       // occurrence counter maintained while indexing
}

// wordSort pairs a word with the head of its posting list so that the
// index can be copied into a slice and ordered by word.
type wordSort struct {
	w    string    // the indexed word
	root *wordList // first node of the word's posting list
}

// wordList is a node of a singly linked list of postings.
type wordList struct {
	this *index    // posting stored in this node
	next *wordList // following node; nil marks the end of the list
}

// Patterns shared by the text clean-up code; all three are compiled in init.
var (
	r         *regexp.Regexp // matches "><": a tag end directly followed by a tag start
	nonAN     *regexp.Regexp // matches runs of characters other than a-z, A-Z and 0-9
	stopWords *regexp.Regexp // matches the stop words stripped from extracted text
)


// newDocument allocates an empty document: no file name and nil title and
// text slices.
func newDocument() *document {
	return new(document)
}

// RemoveNode detaches rn from the tree rooted at r.
//
// The direct children of r are examined first. If rn is one of them it is
// removed from its parent and nothing else is done; otherwise each child's
// subtree is searched in turn. r itself is never removed.
func RemoveNode(r, rn *html.Node) {
	// Snapshot of r's children, taken before any recursion happens.
	var children []*html.Node
	removed := false

	for child := r.FirstChild; child != nil; child = child.NextSibling {
		if child == rn {
			removed = true
			// NOTE(review): x/net/html documents that a removed node has no
			// siblings afterwards, so the walk stops after this iteration.
			child.Parent.RemoveChild(child)
		}
		children = append(children, child)
	}

	if removed {
		return
	}

	for _, child := range children {
		RemoveNode(child, rn)
	}
}
// RemoveTag deletes every node that doc.Find(tag) matches.
//
// Each match is removed through RemoveNode, searching from the first node
// of doc, so only matches located under that first node are detached.
func RemoveTag(doc *goquery.Selection, tag string) {
	matches := doc.Find(tag)
	matches.Each(func(_ int, match *goquery.Selection) {
		// The root is looked up per match, so an empty doc is never indexed.
		root := doc.Get(0)
		RemoveNode(root, match.Get(0))
	})
}

// logReg logs h with a "RegExp: " prefix and hands the very same slice back.
//
// NOTE(review): the []byte -> []byte shape fits regexp's ReplaceAllFunc
// callback, so this looks like a pass-through debugging hook; nothing in
// this file calls it.
func logReg(h []byte) []byte {
	log.Printf("RegExp: %s", h)
	return h
}

// parseDoc reads an HTML page from fd and reduces it to the word lists that
// get indexed.
//
// <script> and <noscript> elements are removed from the body, then the inner
// HTML of <body> and of <title> are each turned into words by extractWords.
//
// Parameters:
//   - fd:     open file positioned at the start of the HTML page.
//   - f_info: file information; only Name() is used, as the document name.
//
// Returns the populated document, or nil and an error when the page cannot
// be parsed or its body/title cannot be rendered back to HTML.
func parseDoc(fd *os.File, f_info os.FileInfo) (*document, error) {
	doc, err := goquery.NewDocumentFromReader(fd)
	if err != nil {
		log.Printf("goquery error: %s\n", err)
		return nil, errors.New("can't create goquery document: " + err.Error())
	}

	body := doc.Find("body")
	RemoveTag(body, "script")
	RemoveTag(body, "noscript")

	// Each rendering error is checked on its own; previously the second
	// call overwrote the first error and neither was inspected.
	text, err := body.Html()
	if err != nil {
		return nil, errors.New("can't render body: " + err.Error())
	}
	t_text, err := doc.Find("title").Html()
	if err != nil {
		return nil, errors.New("can't render title: " + err.Error())
	}

	r_doc := newDocument()
	r_doc.fname = f_info.Name()
	r_doc.text = extractWords(text)
	// The title is derived from the title markup. The earlier code fed the
	// body text into this step, so every title was a copy of the body.
	r_doc.title = extractWords(t_text)

	return r_doc, nil
}

// extractWords converts an HTML fragment into its alphanumeric words with
// the stop words removed.
func extractWords(markup string) []string {
	// Put a space between adjacent tags so that the text of neighbouring
	// elements does not run together once the markup is gone.
	s := r.ReplaceAllString(markup, "> <")
	// presumably strips the remaining markup; see the sanitize package docs
	s = sanitize.HTML(s)
	s = nonAN.ReplaceAllString(s, " ")

	// A stop-word match includes the space before the word and the
	// character after it, so it is replaced by a single space; replacing it
	// with nothing would glue the neighbouring words together. Matches
	// cannot overlap, which means the second of two adjacent stop words is
	// missed on a given pass; repeat until none are left. Every pass that
	// matches shortens s, so the loop terminates.
	for stopWords.MatchString(s) {
		s = stopWords.ReplaceAllString(s, " ")
	}
	// NOTE(review): the pattern is lower-case only and needs a leading
	// space, so capitalised stop words and one at the very start of s are
	// kept. Left as is; confirm whether that is intended.

	// Only letters, digits and spaces are left at this point.
	return strings.Fields(s)
}
// boolToInt maps true to 1 and false to 0.
func boolToInt(t bool) int {
	n := 0
	if t {
		n = 1
	}
	return n
}

// printIndex writes the index to fd in the order given by words.
//
// Each word is written on a line of its own, followed by one tab-indented
// line per posting holding the document's file name, 1 or 0 for the title
// flag, and the posting's freq divided by the number of text words in the
// document, printed with three decimals.
//
// Write errors are not reported.
func printIndex(words []wordSort, fd *os.File) {
	for _, entry := range words {
		fmt.Fprintf(fd, "%s\n", entry.w)

		for node := entry.root; node != nil; node = node.next {
			posting := node.this
			ratio := float64(posting.freq) / float64(len(posting.doc.text))

			fmt.Fprintf(fd, "\t%s %d %.3f\n", posting.doc.fname, boolToInt(posting.title), ratio)
		}
	}
}

// init directs log output to standard error and compiles the package-level
// patterns r, nonAN and stopWords.
//
// MustCompile is used so that a malformed pattern stops the program at
// start-up instead of leaving a nil *regexp.Regexp behind to fail at its
// first use.
func init() {
	log.SetOutput(os.Stderr)

	// Two tags with nothing between them.
	r = regexp.MustCompile("><")
	// Any run of characters that are not ASCII letters or digits.
	nonAN = regexp.MustCompile("[^a-zA-Z0-9]+")
	// A space, a lower-case stop word, then one non-word character.
	stopWords = regexp.MustCompile("( and\\W)|( a\\W)|( an\\W)|( and\\W)|( are\\W)|( as\\W)|( at\\W)|( be\\W)|( by\\W)|( for\\W)|( from\\W)|( has\\W)|( he\\W)|( in\\W)|( is\\W)|( it\\W)|( its\\W)|( of\\W)|( on\\W)|( that\\W)|( the\\W)|( to\\W)|( was\\W)|( were\\W)|( will\\W)|( with\\W)")
}

// main indexes every file in the pages directory (flag -d, default
// "./pages") and writes the resulting word index, ordered by word, to
// "index.dat" in the current directory.
//
// A file that cannot be opened or parsed is logged and skipped. Failing to
// read the directory or to write the index ends the program with status 1.
func main() {
	var p_dir string
	flag.StringVar(&p_dir, "d", "./pages", "pages directory")
	flag.Parse()

	dir, err := os.Open(p_dir)
	if err != nil {
		log.Fatalf("Error accessing \"%s\":\t%s\n", p_dir, err)
	}

	dirInfo, err := dir.Stat()
	if err != nil {
		log.Fatalf("Error accessing \"%s\":\t%s\n", p_dir, err)
	}
	if !dirInfo.IsDir() {
		log.Fatalf("\"%s\" is not a directory\n", p_dir)
	}

	files, err := dir.Readdir(0)
	dir.Close()
	if err != nil {
		log.Fatalf("Error reading %s\n", p_dir)
	}

	words := make(map[string]*wordList)
	for _, info := range files {
		// Sub-directories hold no page content of their own.
		if info.IsDir() {
			continue
		}
		indexFile(words, p_dir, info)
	}

	// Map iteration order is random; sort by word for a stable index file.
	sorted := make([]wordSort, 0, len(words))
	for w, postings := range words {
		sorted = append(sorted, wordSort{w: w, root: postings})
	}
	sort.Slice(sorted, func(i, j int) bool {
		return sorted[i].w < sorted[j].w
	})

	out, err := os.Create("index.dat")
	if err != nil {
		log.Fatalf("Error creating index.dat:\t%s\n", err)
	}
	printIndex(sorted, out)
	if err := out.Close(); err != nil {
		log.Fatalf("Error writing index.dat:\t%s\n", err)
	}
}

// indexFile parses the file described by info inside dir and adds its text
// and title words, lower-cased, to words. Failures are logged and the file
// is skipped.
func indexFile(words map[string]*wordList, dir string, info os.FileInfo) {
	// Build the path from the directory as given on the command line; the
	// directory's base name alone only resolves when it sits directly under
	// the current working directory.
	path := fmt.Sprintf("%s/%s", dir, info.Name())

	fd, err := os.Open(path)
	if err != nil {
		log.Printf("Error reading %s:\t%s\n", path, err)
		return
	}
	defer fd.Close()

	fmt.Printf("Indexing %s...\n", info.Name())
	doc, err := parseDoc(fd, info)
	if err != nil {
		log.Printf("Error parsing %s\n", path)
		return
	}

	for _, tok := range doc.text {
		indexWord(words, strings.ToLower(tok), doc, false)
	}
	for _, tok := range doc.title {
		indexWord(words, strings.ToLower(tok), doc, true)
	}
}

// indexWord records one occurrence of w in doc. inTitle marks an occurrence
// that comes from the document's title words.
func indexWord(words map[string]*wordList, w string, doc *document, inTitle bool) {
	head := words[w]
	if head == nil {
		// First sighting of the word anywhere: this occurrence is fully
		// accounted for by freq: 1, so return without incrementing again.
		words[w] = &wordList{this: &index{doc: doc, title: inTitle, freq: 1}}
		return
	}

	// Stop at doc's posting if there is one, otherwise at the last node.
	cur := head
	for cur.this.doc != doc && cur.next != nil {
		cur = cur.next
	}

	if cur.this.doc == doc {
		cur.this.freq++
		if inTitle {
			cur.this.title = true
		}
		return
	}

	cur.next = &wordList{this: &index{doc: doc, title: inTitle, freq: 1}}
}