From 68bf2989eedac063dbad33a39b7f2349b7e83d08 Mon Sep 17 00:00:00 2001
From: Steven
Date: Sat, 14 Mar 2015 14:28:45 -0700
Subject: [PATCH 1/4] Updated readme [skip ci]

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 9050c654..0acbfeae 100644
--- a/README.md
+++ b/README.md
@@ -284,7 +284,7 @@ PoliteWebCrawler crawler = new PoliteWebCrawler();
 
 crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
 {
-    CrawlDecision decision = new CrawlDecision();
+    CrawlDecision decision = new CrawlDecision{ Allow = true };
     if(pageToCrawl.Uri.Authority == "google.com")
         return new CrawlDecision{ Allow = false, Reason = "Dont want to crawl google pages" };
 
@@ -293,7 +293,7 @@ crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
 
 crawler.ShouldDownloadPageContent((crawledPage, crawlContext) =>
 {
-    CrawlDecision decision = new CrawlDecision();
+    CrawlDecision decision = new CrawlDecision{ Allow = true };
     if (!crawledPage.Uri.AbsoluteUri.Contains(".com"))
         return new CrawlDecision { Allow = false, Reason = "Only download raw page content for .com tlds" };
 
@@ -302,7 +302,7 @@ crawler.ShouldDownloadPageContent((crawledPage, crawlContext) =>
 
 crawler.ShouldCrawlPageLinks((crawledPage, crawlContext) =>
 {
-    CrawlDecision decision = new CrawlDecision();
+    CrawlDecision decision = new CrawlDecision{ Allow = true };
     if (crawledPage.PageSizeInBytes < 100)
         return new CrawlDecision { Allow = false, Reason = "Just crawl links in pages that have at least 100 bytes" };
 
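
The README examples patched above build a default CrawlDecision and only
override it to veto. A decision constructed with new CrawlDecision() leaves
its Allow property at the C# default of false, so the fall-through decision
must be created with Allow = true or every page is silently rejected. A
minimal end-to-end sketch of the same pattern, assuming the Abot 1.x API
shown in the README (the target URL is illustrative):

    using System;
    using Abot.Crawler;
    using Abot.Poco;

    class CrawlDecisionExample
    {
        static void Main()
        {
            PoliteWebCrawler crawler = new PoliteWebCrawler();

            crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
            {
                // Veto explicitly; otherwise fall through to an allow decision.
                if (pageToCrawl.Uri.Authority == "google.com")
                    return new CrawlDecision { Allow = false, Reason = "Dont want to crawl google pages" };

                return new CrawlDecision { Allow = true };
            });

            // Crawl synchronously; ErrorOccurred reports fatal crawl errors.
            CrawlResult result = crawler.Crawl(new Uri("http://example.com"));
            Console.WriteLine(result.ErrorOccurred ? "Crawl failed" : "Crawl completed");
        }
    }
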
From 9e44da875bd2edfe3f4d62e4086126c09ba3b2b6 Mon Sep 17 00:00:00 2001
From: Steven
Date: Sun, 15 Mar 2015 21:26:00 -0700
Subject: [PATCH 2/4] Updated readme [skip ci]

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 0acbfeae..50cd6744 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ Abot is an open source C# web crawler built for speed and flexibility. It takes
  * [Ask questions and search for answers on the Community Forum](http://groups.google.com/group/abot-web-crawler)
  * [Report Bugs or Suggest Features](https://github.com/sjdirect/abot/issues)
  * [Learn how you can contribute](https://github.com/sjdirect/abot/wiki/Contribute)
- * [Get custom Abot development](https://github.com/sjdirect/abot/wiki/Custom-Development)
+ * [Need expert Abot customization?](https://github.com/sjdirect/abot/wiki/Custom-Development)
  * [Take the usage survey](https://www.surveymonkey.com/s/JS5826F) to help prioritize features/improvements
  * [Consider making a donation](https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=G6ZY6BZNBFVQJ)
 

From 2853f8dedf2255de6d868240bce46e1e3c22a931 Mon Sep 17 00:00:00 2001
From: sjdirect
Date: Tue, 17 Mar 2015 14:01:55 -0700
Subject: [PATCH 3/4] made all hyperlinkparser methods protected virtual

---
 Abot/Core/CsQueryHyperLinkParser.cs | 18 +++++++++---------
 Abot/Core/HapHyperLinkParser.cs     | 28 ++++++++++++++--------------
 2 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/Abot/Core/CsQueryHyperLinkParser.cs b/Abot/Core/CsQueryHyperLinkParser.cs
index 579641ff..53d7ebaf 100644
--- a/Abot/Core/CsQueryHyperLinkParser.cs
+++ b/Abot/Core/CsQueryHyperLinkParser.cs
@@ -52,14 +52,6 @@ protected override IEnumerable<string> GetHrefValues(CrawledPage crawledPage)
             return hrefValues.Concat(canonicalHref);
         }
 
-        protected bool HasRelCanonicalPointingToDifferentUrl(IDomElement e, string orginalUrl)
-        {
-            return e.HasAttribute("rel") && !string.IsNullOrWhiteSpace(e.Attributes["rel"]) &&
-                string.Equals(e.Attributes["rel"], "canonical", StringComparison.OrdinalIgnoreCase) &&
-                e.HasAttribute("href") && !string.IsNullOrWhiteSpace(e.Attributes["href"]) &&
-                !string.Equals(e.Attributes["href"], orginalUrl, StringComparison.OrdinalIgnoreCase);
-        }
-
         protected override string GetBaseHrefValue(CrawledPage crawledPage)
         {
             string baseTagValue = crawledPage.CsQueryDocument.Select("base").Attr("href") ?? "";
@@ -71,7 +63,15 @@ protected override string GetMetaRobotsValue(CrawledPage crawledPage)
             return crawledPage.CsQueryDocument["meta[name]"].Filter(d => d.Name.ToLowerInvariant() == "robots").Attr("content");
         }
 
-        private bool HasRelNoFollow(IDomElement e)
+        protected virtual bool HasRelCanonicalPointingToDifferentUrl(IDomElement e, string orginalUrl)
+        {
+            return e.HasAttribute("rel") && !string.IsNullOrWhiteSpace(e.Attributes["rel"]) &&
+                string.Equals(e.Attributes["rel"], "canonical", StringComparison.OrdinalIgnoreCase) &&
+                e.HasAttribute("href") && !string.IsNullOrWhiteSpace(e.Attributes["href"]) &&
+                !string.Equals(e.Attributes["href"], orginalUrl, StringComparison.OrdinalIgnoreCase);
+        }
+
+        protected virtual bool HasRelNoFollow(IDomElement e)
         {
             return _isRespectAnchorRelNoFollowEnabled && (e.HasAttribute("rel") && e.GetAttribute("rel").ToLower().Trim() == "nofollow");
         }
diff --git a/Abot/Core/HapHyperLinkParser.cs b/Abot/Core/HapHyperLinkParser.cs
index aa8df2ed..990c0a5a 100644
--- a/Abot/Core/HapHyperLinkParser.cs
+++ b/Abot/Core/HapHyperLinkParser.cs
@@ -63,7 +63,17 @@ protected override string GetBaseHrefValue(CrawledPage crawledPage)
             return hrefValue;
         }
 
-        private List<string> GetLinks(HtmlNodeCollection nodes)
+        protected override string GetMetaRobotsValue(CrawledPage crawledPage)
+        {
+            string robotsMeta = null;
+            HtmlNode robotsNode = crawledPage.HtmlDocument.DocumentNode.SelectSingleNode("//meta[translate(@name,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz')='robots']");
+            if (robotsNode != null)
+                robotsMeta = robotsNode.GetAttributeValue("content", "");
+
+            return robotsMeta;
+        }
+
+        protected virtual List<string> GetLinks(HtmlNodeCollection nodes)
         {
             List<string> hrefs = new List<string>();
 
@@ -81,13 +91,13 @@ private List<string> GetLinks(HtmlNodeCollection nodes)
                 {
                     hrefValue = DeEntitize(hrefValue);
                     hrefs.Add(hrefValue);
-                    }
+                }
             }
 
             return hrefs;
         }
 
-        private string DeEntitize(string hrefValue)
+        protected virtual string DeEntitize(string hrefValue)
         {
             string dentitizedHref = hrefValue;
 
@@ -103,17 +113,7 @@ private string DeEntitize(string hrefValue)
             return dentitizedHref;
         }
 
-        protected override string GetMetaRobotsValue(CrawledPage crawledPage)
-        {
-            string robotsMeta = null;
-            HtmlNode robotsNode = crawledPage.HtmlDocument.DocumentNode.SelectSingleNode("//meta[translate(@name,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz')='robots']");
-            if (robotsNode != null)
-                robotsMeta = robotsNode.GetAttributeValue("content", "");
-
-            return robotsMeta;
-        }
-
-        private bool HasRelNoFollow(HtmlNode node)
+        protected virtual bool HasRelNoFollow(HtmlNode node)
         {
             HtmlAttribute attr = node.Attributes["rel"];
             return _isRespectAnchorRelNoFollowEnabled && (attr != null && attr.Value.ToLower().Trim() == "nofollow");
From cf8f359ec48655ec59d2d3e1d3ad6f1036275c78 Mon Sep 17 00:00:00 2001
From: sjdirect
Date: Sat, 21 Mar 2015 14:04:28 -0700
Subject: [PATCH 4/4] [skip ci]

---
 Abot/Crawler/WebCrawler.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Abot/Crawler/WebCrawler.cs b/Abot/Crawler/WebCrawler.cs
index f989f8a3..745a024e 100644
--- a/Abot/Crawler/WebCrawler.cs
+++ b/Abot/Crawler/WebCrawler.cs
@@ -918,7 +918,7 @@ protected virtual void SchedulePageLinks(CrawledPage crawledPage)
 
         protected virtual bool ShouldSchedulePageLink(PageToCrawl page)
         {
-            if ((page.IsInternal == true || _crawlContext.CrawlConfiguration.IsExternalPageCrawlingEnabled == true) && (ShouldCrawlPage(page)))
+            if ((page.IsInternal || _crawlContext.CrawlConfiguration.IsExternalPageCrawlingEnabled) && (ShouldCrawlPage(page)))
                 return true;
 
             return false;
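
ShouldSchedulePageLink is itself protected virtual, so the simplified
predicate above can be extended the same way. A hypothetical sketch that
layers a depth cap on top of the stock internal/external check (the subclass
and the limit of 5 are illustrative; it assumes the PageToCrawl.CrawlDepth
property):

    using Abot.Crawler;
    using Abot.Poco;

    // Illustrative subclass: never schedule links found more than 5 levels
    // deep, then defer to WebCrawler's default scheduling rules.
    public class DepthCappedCrawler : PoliteWebCrawler
    {
        protected override bool ShouldSchedulePageLink(PageToCrawl page)
        {
            if (page.CrawlDepth >= 5)
                return false;

            return base.ShouldSchedulePageLink(page);
        }
    }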