Merge branch 'master' of github.com:sjdirect/abot
sjdirect committed Mar 24, 2015
2 parents b64afb3 + cf8f359 commit c35755c
Showing 4 changed files with 28 additions and 28 deletions.
18 changes: 9 additions & 9 deletions Abot/Core/CsQueryHyperLinkParser.cs
@@ -52,14 +52,6 @@ protected override IEnumerable<string> GetHrefValues(CrawledPage crawledPage)
return hrefValues.Concat(canonicalHref);
}

protected bool HasRelCanonicalPointingToDifferentUrl(IDomElement e, string orginalUrl)
{
return e.HasAttribute("rel") && !string.IsNullOrWhiteSpace(e.Attributes["rel"]) &&
string.Equals(e.Attributes["rel"], "canonical", StringComparison.OrdinalIgnoreCase) &&
e.HasAttribute("href") && !string.IsNullOrWhiteSpace(e.Attributes["href"]) &&
!string.Equals(e.Attributes["href"], orginalUrl, StringComparison.OrdinalIgnoreCase);
}

protected override string GetBaseHrefValue(CrawledPage crawledPage)
{
string baseTagValue = crawledPage.CsQueryDocument.Select("base").Attr("href") ?? "";
@@ -71,7 +63,15 @@ protected override string GetMetaRobotsValue(CrawledPage crawledPage)
return crawledPage.CsQueryDocument["meta[name]"].Filter(d => d.Name.ToLowerInvariant() == "robots").Attr("content");
}

private bool HasRelNoFollow(IDomElement e)
protected virtual bool HasRelCanonicalPointingToDifferentUrl(IDomElement e, string orginalUrl)
{
return e.HasAttribute("rel") && !string.IsNullOrWhiteSpace(e.Attributes["rel"]) &&
string.Equals(e.Attributes["rel"], "canonical", StringComparison.OrdinalIgnoreCase) &&
e.HasAttribute("href") && !string.IsNullOrWhiteSpace(e.Attributes["href"]) &&
!string.Equals(e.Attributes["href"], orginalUrl, StringComparison.OrdinalIgnoreCase);
}

protected virtual bool HasRelNoFollow(IDomElement e)
{
return _isRespectAnchorRelNoFollowEnabled && (e.HasAttribute("rel") && e.GetAttribute("rel").ToLower().Trim() == "nofollow");
}
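With HasRelCanonicalPointingToDifferentUrl and HasRelNoFollow now declared protected virtual, a derived parser can swap in its own link-filtering rules without copying the whole class. A minimal sketch of what that allows follows; the subclass name and its stricter nofollow policy are hypothetical, and it assumes the base class exposes a parameterless constructor:

using Abot.Core;
using CsQuery;

// Hypothetical subclass illustrating the extension point opened by this change.
public class StrictCsQueryHyperLinkParser : CsQueryHyperLinkParser
{
    protected override bool HasRelNoFollow(IDomElement e)
    {
        // Illustrative policy: honor rel="nofollow" unconditionally,
        // regardless of the crawler's configuration flag.
        return e.HasAttribute("rel") &&
               e.GetAttribute("rel").ToLower().Trim().Contains("nofollow");
    }
}

Such a parser would typically be handed to the crawler through its constructor in place of the default implementation.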
28 changes: 14 additions & 14 deletions Abot/Core/HapHyperLinkParser.cs
@@ -63,7 +63,17 @@ protected override string GetBaseHrefValue(CrawledPage crawledPage)
return hrefValue;
}

private List<string> GetLinks(HtmlNodeCollection nodes)
protected override string GetMetaRobotsValue(CrawledPage crawledPage)
{
string robotsMeta = null;
HtmlNode robotsNode = crawledPage.HtmlDocument.DocumentNode.SelectSingleNode("//meta[translate(@name,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz')='robots']");
if (robotsNode != null)
robotsMeta = robotsNode.GetAttributeValue("content", "");

return robotsMeta;
}

protected virtual List<string> GetLinks(HtmlNodeCollection nodes)
{
List<string> hrefs = new List<string>();

@@ -81,13 +91,13 @@ private List<string> GetLinks(HtmlNodeCollection nodes)
{
hrefValue = DeEntitize(hrefValue);
hrefs.Add(hrefValue);
}
}
}

return hrefs;
}

private string DeEntitize(string hrefValue)
protected virtual string DeEntitize(string hrefValue)
{
string dentitizedHref = hrefValue;

@@ -103,17 +113,7 @@ private string DeEntitize(string hrefValue)
return dentitizedHref;
}

protected override string GetMetaRobotsValue(CrawledPage crawledPage)
{
string robotsMeta = null;
HtmlNode robotsNode = crawledPage.HtmlDocument.DocumentNode.SelectSingleNode("//meta[translate(@name,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz')='robots']");
if (robotsNode != null)
robotsMeta = robotsNode.GetAttributeValue("content", "");

return robotsMeta;
}

private bool HasRelNoFollow(HtmlNode node)
protected virtual bool HasRelNoFollow(HtmlNode node)
{
HtmlAttribute attr = node.Attributes["rel"];
return _isRespectAnchorRelNoFollowEnabled && (attr != null && attr.Value.ToLower().Trim() == "nofollow");
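The same refactoring in HapHyperLinkParser makes GetLinks, DeEntitize, and HasRelNoFollow overridable, so the extraction step itself can be customized. A minimal sketch, again with a hypothetical subclass and filter rule, and assuming the base class's parameterless constructor:

using System;
using System.Collections.Generic;
using System.Linq;
using Abot.Core;
using HtmlAgilityPack;

// Hypothetical subclass: reuse the base extraction, then drop links to PDF documents.
public class PdfSkippingHapHyperLinkParser : HapHyperLinkParser
{
    protected override List<string> GetLinks(HtmlNodeCollection nodes)
    {
        // Illustrative filter, not part of this commit.
        return base.GetLinks(nodes)
                   .Where(href => !href.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase))
                   .ToList();
    }
}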
2 changes: 1 addition & 1 deletion Abot/Crawler/WebCrawler.cs
@@ -918,7 +918,7 @@ protected virtual void SchedulePageLinks(CrawledPage crawledPage)

protected virtual bool ShouldSchedulePageLink(PageToCrawl page)
{
if ((page.IsInternal == true || _crawlContext.CrawlConfiguration.IsExternalPageCrawlingEnabled == true) && (ShouldCrawlPage(page)))
if ((page.IsInternal || _crawlContext.CrawlConfiguration.IsExternalPageCrawlingEnabled) && (ShouldCrawlPage(page)))
return true;

return false;
8 changes: 4 additions & 4 deletions README.md
@@ -20,7 +20,7 @@ Abot is an open source C# web crawler built for speed and flexibility. It takes
* [Ask questions and search for answers on the Community Forum](http://groups.google.com/group/abot-web-crawler)
* [Report Bugs or Suggest Features](https://github.com/sjdirect/abot/issues)
* [Learn how you can contribute](https://github.com/sjdirect/abot/wiki/Contribute)
* [Get custom Abot development](https://github.com/sjdirect/abot/wiki/Custom-Development)
* [Need expert Abot customization?](https://github.com/sjdirect/abot/wiki/Custom-Development)
* [Take the usage survey](https://www.surveymonkey.com/s/JS5826F) to help prioritize features/improvements
* [Consider making a donation](https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=G6ZY6BZNBFVQJ)

@@ -284,7 +284,7 @@ PoliteWebCrawler crawler = new PoliteWebCrawler();

crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
{
CrawlDecision decision = new CrawlDecision();
CrawlDecision decision = new CrawlDecision{ Allow = true };
if(pageToCrawl.Uri.Authority == "google.com")
return new CrawlDecision{ Allow = false, Reason = "Dont want to crawl google pages" };

@@ -293,7 +293,7 @@ crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>

crawler.ShouldDownloadPageContent((crawledPage, crawlContext) =>
{
CrawlDecision decision = new CrawlDecision();
CrawlDecision decision = new CrawlDecision{ Allow = true };
if (!crawledPage.Uri.AbsoluteUri.Contains(".com"))
return new CrawlDecision { Allow = false, Reason = "Only download raw page content for .com tlds" };

@@ -302,7 +302,7 @@ crawler.ShouldDownloadPageContent((crawledPage, crawlContext) =>

crawler.ShouldCrawlPageLinks((crawledPage, crawlContext) =>
{
CrawlDecision decision = new CrawlDecision();
CrawlDecision decision = new CrawlDecision{ Allow = true };
if (crawledPage.PageSizeInBytes < 100)
return new CrawlDecision { Allow = false, Reason = "Just crawl links in pages that have at least 100 bytes" };

