Refactor into wire formats and processors for structs/interfaces and graph processors respectively
ronin13 committed Feb 12, 2017
1 parent 6addf01 commit 0378b8e
Showing 9 changed files with 227 additions and 194 deletions.
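Only four of the nine changed files are rendered below; the new dotler/wire and dotler/processor packages that these diffs import are presumably among the five that did not load.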
35 changes: 18 additions & 17 deletions dotler/crawl.go
@@ -9,6 +9,7 @@ import (
 	"github.com/PuerkitoBio/goquery"
 	"github.com/PuerkitoBio/purell"
 	"github.com/golang/glog"
+	wire "github.com/ronin13/dotler/wire"

 	"context"
 	"net/url"
@@ -21,9 +22,9 @@ import (
 // Iterates over attributes, parses the page,
 // gets URLs from same domain, gets static assets
 // sends new links onto reqChan.
-func updateAttr(item *goquery.Selection, inPage *Page, attribTypes []string, reqChan chan *Page, nodes NodeMapper) error {
+func updateAttr(item *goquery.Selection, inPage *wire.Page, attribTypes []string, reqChan chan *wire.Page, nodes wire.NodeMapper) error {

-	var nPage *Page
+	var nPage *wire.Page
 	var err error
 	var statTitle string
 	var parsedURL *url.URL
@@ -54,11 +55,11 @@ func updateAttr(item *goquery.Selection, inPage *Page, attribTypes []string, req
 		parsedURL.RawQuery = ""
 		parsedURL.Fragment = ""
 		if isStatic(parsedURL.String()) {
-			if _, exists := inPage.statList[parsedURL.String()]; !exists {
+			if _, exists := inPage.StatList[parsedURL.String()]; !exists {
 				statTitle = getStatTitle(parsedURL)
-				inPage.statList[parsedURL.String()] = StatPage{
-					staticURL: parsedURL,
-					pageTitle: statTitle}
+				inPage.StatList[parsedURL.String()] = wire.StatPage{
+					StaticURL: parsedURL,
+					PageTitle: statTitle}
 			}

 		} else if parsedURL.Host == base.Host {
@@ -72,7 +73,7 @@ func updateAttr(item *goquery.Selection, inPage *Page, attribTypes []string, req
 				// New discovery!

 				// Title not known at this point
-				nPage = &Page{PageURL: parsedURL}
+				nPage = &wire.Page{PageURL: parsedURL}

 				//TODO: go writeToChan?
 				writeToChan(nPage, reqChan)
@@ -89,12 +90,12 @@ func updateAttr(item *goquery.Selection, inPage *Page, attribTypes []string, req
 	return nil
 }

-func updateOutLinksWithCard(key string, iPage, nPage *Page) {
+func updateOutLinksWithCard(key string, iPage, nPage *wire.Page) {

-	if _, exists := iPage.outLinks[key]; exists {
-		iPage.outLinks[key].card++
+	if _, exists := iPage.OutLinks[key]; exists {
+		iPage.OutLinks[key].Card++
 	} else {
-		iPage.outLinks[key] = &PageWithCard{page: nPage, card: 1}
+		iPage.OutLinks[key] = &wire.PageWithCard{Page: nPage, Card: 1}
 	}
 }

@@ -103,7 +104,7 @@ func updateOutLinksWithCard(key string, iPage, nPage *Page) {
 // For attributes: href and src
 // Updates Page structure with static and outside links.
 // Uses goquery for parsing.
-func getAllLinks(cancelParse context.Context, inPage *Page, reqChan chan *Page, nodes NodeMapper) chan bool {
+func getAllLinks(cancelParse context.Context, inPage *wire.Page, reqChan chan *wire.Page, nodes wire.NodeMapper) chan bool {

 	doneChan := make(chan bool, 1)

@@ -112,17 +113,17 @@ func getAllLinks(cancelParse context.Context, inPage *Page, reqChan chan *Page,
 		body, err := getContent(inPage.PageURL)
 		if err != nil {
 			glog.Infof("Failed to crawl %s", inPage.PageURL.String())
-			inPage.failCount++
-			if inPage.failCount <= maxFetchFail {
+			inPage.FailCount++
+			if inPage.FailCount <= maxFetchFail {
 				//TODO: go writeToChan?
 				writeToChan(inPage, reqChan)
 			}
 			doneChan <- false
 			return
 		}

-		inPage.outLinks = make(map[string]*PageWithCard)
-		inPage.statList = make(map[string]StatPage)
+		inPage.OutLinks = make(map[string]*wire.PageWithCard)
+		inPage.StatList = make(map[string]wire.StatPage)
 		doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
 		panicCrawl(err)

@@ -160,7 +161,7 @@ func getAllLinks(cancelParse context.Context, inPage *Page, reqChan chan *Page,
 // Uses respChan for graph rendering.
 // Also has a timeout of crawlThreshold.
 // Uses a new child context noParse - used to terminate parsing.
-func Crawl(cancelCrawl context.Context, inPage *Page, reqChan chan *Page, respChan chan *Page, waiter *sync.WaitGroup, nodes NodeMapper) {
+func Crawl(cancelCrawl context.Context, inPage *wire.Page, reqChan chan *wire.Page, respChan chan *wire.Page, waiter *sync.WaitGroup, nodes wire.NodeMapper) {

 	defer waiter.Done()
 	if err := nodes.Add(inPage.PageURL.String(), inPage); err != nil {
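The crawl code now works against exported fields on types from the wire package. That package is not rendered on this page, but the accesses above (inPage.StatList, inPage.OutLinks, inPage.FailCount, the wire.StatPage and wire.PageWithCard literals) pin down its rough shape. A sketch reconstructed from those call sites only; field types are inferred, e.g. FailCount from its comparison against maxFetchFail, which is declared uint:

package wire

import "net/url"

// Page is a crawled page; fields are exported now that the type lives
// outside package dotler. Only fields visible in this diff are listed.
type Page struct {
	PageURL   *url.URL
	OutLinks  map[string]*PageWithCard // same-domain out-links, keyed by URL
	StatList  map[string]StatPage      // static assets, keyed by URL
	FailCount uint                     // fetch failures; retried while <= maxFetchFail
}

// StatPage describes a static asset (image/css/js) referenced by a page.
type StatPage struct {
	StaticURL *url.URL
	PageTitle string
}

// PageWithCard pairs an out-link with its cardinality; Card is bumped
// each time the same link is seen again on the page.
type PageWithCard struct {
	Page *Page
	Card uint
}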
43 changes: 19 additions & 24 deletions dotler/dotler.go
@@ -5,8 +5,9 @@
 package dotler

 import (
-	"github.com/awalterschulze/gographviz"
 	"github.com/golang/glog"
+	processor "github.com/ronin13/dotler/processor"
+	wire "github.com/ronin13/dotler/wire"

 	"context"
 	"fmt"
@@ -40,16 +41,14 @@ var (
 	ClientTimeout, crawlThreshold uint
 	domain string
 	termChannel chan struct{}
-	reqChan, dotChan chan *Page
+	reqChan, dotChan chan *wire.Page
 	maxFetchFail uint
 	crawlSuccess uint64
 	crawlFail uint64
 	crawlSkipped uint64
 	crawlCancelled uint64
 )

-var crawlGraph = gographviz.NewEscape()
-
 // Signal handler!
 // a) SIGTERM/SIGINT - gracefully shuts down the server.
 func handleSignal(schannel chan os.Signal) {
@@ -94,11 +93,12 @@ func StartCrawl(startURL string) int {
 	var endTime int64
 	var once sync.Once
 	var wg sync.WaitGroup
+	var nodeMap wire.NodeMapper

-	var printerChan chan struct{}
+	var printerChan wire.GraphProcessor

 	crawlDone := make(chan struct{}, 2)
-	reqChan = make(chan *Page, MAXWORKERS)
+	reqChan = make(chan *wire.Page, MAXWORKERS)
 	termChannel = make(chan struct{}, 2)

 	if numThreads > 0 {
@@ -107,10 +107,6 @@ func StartCrawl(startURL string) int {
 		runtime.GOMAXPROCS(runtime.NumCPU())
 	}

-	crawlGraph.SetName("dotler")
-	crawlGraph.SetDir(true)
-	crawlGraph.SetStrict(true)
-
 	if showProg != "" {
 		glog.Infoln("Turning on gen-image")
 		genImage = true
@@ -126,7 +122,7 @@ func StartCrawl(startURL string) int {
 	parentContext := context.Background()
 	noCrawl, terminate := context.WithCancel(parentContext)

-	nodeMap := &NodeMap{make(chan *stringPage, numThreads), make(chan *existsPage, numThreads)}
+	nodeMap = wire.NewNodeMapper(numThreads)
 	go nodeMap.RunLoop(noCrawl)

 	sigs := make(chan os.Signal, 1)
@@ -138,11 +134,12 @@ func StartCrawl(startURL string) int {
 	if err != nil {
 		panic(fmt.Sprintf("Failed in parsing root url %s", err))
 	}
-	reqChan <- &Page{PageURL: parsedURL}
+	reqChan <- &wire.Page{PageURL: parsedURL}

 	if genGraph {
-		dotChan = make(chan *Page, MAXWORKERS)
-		printerChan = dotPrinter(noCrawl, dotChan)
+		dotChan = make(chan *wire.Page, MAXWORKERS)
+		printerChan = processor.NewPrinter()
+		printerChan.ProcessLoop(noCrawl, dotChan)
 	}
 	go handleSignal(sigs)

@@ -167,34 +164,33 @@ func StartCrawl(startURL string) int {

 	go func() {
 		<-termChannel
+		var dotString string

 		status := 0

 		if crawlDone != nil {
 			crawlDone <- struct{}{}
 		}
 		terminate()
 		// Stops the dot printer.
 		// TODO: it is
 		// close(reqChan)
 		if genGraph {
-			<-printerChan
+			dotString = <-printerChan.Result()
 			close(dotChan)
 		}
 		// This is safe.
 		wg.Wait()

 		glog.Flush()
-		dotString := crawlGraph.String()

-		err = ioutil.WriteFile("dotler.dot", []byte(dotString), 0644)
-		panicCrawl(err)
-		glog.Infof("We are done, phew!, persisting graph to dotler.dot\n")
+		if genGraph {
+			err = ioutil.WriteFile("dotler.dot", []byte(dotString), 0644)
+			panicCrawl(err)
+			glog.Infof("We are done, phew!, persisting graph to dotler.dot\n")
+		}

 		printStats()

 		if genImage {
-			status = postProcess()
+			status = postProcess(dotString)
 		}
 		extStatus <- status

@@ -214,7 +210,6 @@ func StartCrawl(startURL string) int {
 	reqChan = nil
 	crawlDone = nil
 	termChannel <- struct{}{}
-	//return <-extStatus

 }
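printerChan changes type from chan struct{} to wire.GraphProcessor, and dotler.go touches exactly two of its methods. The interface itself is defined in a file this page does not show; judging from the call sites, where ProcessLoop is called without go (so it presumably starts its own goroutine) and Result() is drained once after terminate(), it looks roughly like:

package wire

import "context"

// GraphProcessor consumes crawled pages and hands back a rendered graph.
// A reconstruction from call sites in dotler.go; the real definition may differ.
type GraphProcessor interface {
	// ProcessLoop begins consuming pages from inChan until ctx is cancelled,
	// returning immediately and doing its work in a goroutine of its own.
	ProcessLoop(ctx context.Context, inChan chan *Page)
	// Result yields the rendered dot string once the loop has stopped.
	Result() chan string
}

processor.NewPrinter() would then return a concrete implementation, presumably wrapping the gographviz escape graph that used to be the package-level crawlGraph built by the removed dotPrinter.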
55 changes: 2 additions & 53 deletions dotler/helpers.go
@@ -6,71 +6,20 @@ package dotler

 import (
 	//"github.com/golang/glog"
+	wire "github.com/ronin13/dotler/wire"

-	"context"
-	"fmt"
 	"log"
 	"net/url"
-	"strings"
 )

-type NodeMapper interface {
-	Exists(string) *Page
-	Add(string, *Page) error
-}
-
-func (node *NodeMap) RunLoop(stopLoop context.Context) {
-
-	pages := make(map[string]*Page)
-	for {
-		select {
-		case <-stopLoop.Done():
-			return
-		case addPage := <-node.addChan:
-			if _, exists := pages[addPage.key]; exists {
-				//glog.Errorf("Key %s already exists, value %s", addPage.key, val)
-				addPage.err <- fmt.Errorf("Key exists")
-				continue
-			}
-			pages[addPage.key] = addPage.value
-			addPage.err <- nil
-		case checkPage := <-node.checkChan:
-			if value, exists := pages[checkPage.key]; exists {
-				checkPage.value <- value
-			} else {
-				checkPage.value <- nil
-			}
-		}
-	}
-
-}
-
-// Needed for http/https sites to create smaller graph.
-func httpStrip(input string) string {
-	return strings.Split(input, "//")[1]
-}
-
-func (node *NodeMap) Add(key string, value *Page) error {
-	skey := httpStrip(key)
-	sPage := &stringPage{skey, value, make(chan error, 1)}
-	node.addChan <- sPage
-	return <-sPage.err
-}
-
-func (node *NodeMap) Exists(key string) *Page {
-	skey := httpStrip(key)
-	sPage := &existsPage{key: skey, value: make(chan *Page, 1)}
-	node.checkChan <- sPage
-	return <-sPage.value
-}
-
 func panicCrawl(err error) {
 	if err != nil {
 		log.Fatalf("dotler has come to a halt due to %s", err)
 	}
 }

-func writeToChan(iPage *Page, inChan chan *Page) {
+func writeToChan(iPage *wire.Page, inChan chan *wire.Page) {
 	// To prevent panic from closed channel during shutdown
 	// Yes, there are other safeguards, but real world is not perfect :)
 	defer func() { recover() }()
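Everything deleted here (the NodeMapper interface, the channel-backed NodeMap, httpStrip, Add and Exists) resurfaces behind wire.NewNodeMapper(numThreads) in dotler.go. Since the variable there is declared as wire.NodeMapper and RunLoop is invoked on it, RunLoop must now be part of the interface. Assuming a straight move of the deleted code, the wire side would be roughly as follows; NewNodeMapper's signature, including the int parameter, is inferred from its single call site:

package wire

import "context"

// NodeMapper is the crawler's registry of visited pages. Add and Exists are
// served by one goroutine (RunLoop) that owns the map, so no mutex is needed.
type NodeMapper interface {
	Exists(string) *Page
	Add(string, *Page) error
	RunLoop(context.Context)
}

// Request payloads, field names as in the deleted helpers.go code.
type stringPage struct {
	key   string
	value *Page
	err   chan error
}

type existsPage struct {
	key   string
	value chan *Page
}

// NodeMap serializes map access through its two request channels.
type NodeMap struct {
	addChan   chan *stringPage
	checkChan chan *existsPage
}

// NewNodeMapper replaces the literal previously built inline in dotler.go:
// &NodeMap{make(chan *stringPage, numThreads), make(chan *existsPage, numThreads)}.
func NewNodeMapper(numThreads int) NodeMapper {
	return &NodeMap{
		addChan:   make(chan *stringPage, numThreads),
		checkChan: make(chan *existsPage, numThreads),
	}
}

RunLoop, httpStrip, Add and Exists would then move over unchanged from the code removed above.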
4 changes: 2 additions & 2 deletions dotler/postprocess.go
@@ -15,7 +15,7 @@ import (
 	"strings"
 )

-func postProcess() int {
+func postProcess(result string) int {

 	var err error
 	var graphPipe io.WriteCloser
@@ -31,7 +31,7 @@

 	panicCrawl(graphIt.Start())

-	_, err = graphPipe.Write([]byte(crawlGraph.String()))
+	_, err = graphPipe.Write([]byte(result))
 	panicCrawl(err)

 	panicCrawl(graphPipe.Close())
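Passing the rendered string into postProcess severs its dependency on the package-level crawlGraph, which no longer exists after this commit. The graph becomes a plain value with a one-way flow (the processor renders it, Result() hands it back, and the same string is persisted to dotler.dot and piped to the image tool) instead of shared mutable state updated during the crawl.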