From 68bf2989eedac063dbad33a39b7f2349b7e83d08 Mon Sep 17 00:00:00 2001
From: Steven
Date: Sat, 14 Mar 2015 14:28:45 -0700
Subject: [PATCH 1/4] Updated readme [skip ci]

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 9050c654..0acbfeae 100644
--- a/README.md
+++ b/README.md
@@ -284,7 +284,7 @@ PoliteWebCrawler crawler = new PoliteWebCrawler();
 
 crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
 {
-    CrawlDecision decision = new CrawlDecision();
+    CrawlDecision decision = new CrawlDecision{ Allow = true };
     if(pageToCrawl.Uri.Authority == "google.com")
         return new CrawlDecision{ Allow = false, Reason = "Dont want to crawl google pages" };
 
@@ -293,7 +293,7 @@ crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
 
 crawler.ShouldDownloadPageContent((crawledPage, crawlContext) =>
 {
-    CrawlDecision decision = new CrawlDecision();
+    CrawlDecision decision = new CrawlDecision{ Allow = true };
     if (!crawledPage.Uri.AbsoluteUri.Contains(".com"))
         return new CrawlDecision { Allow = false, Reason = "Only download raw page content for .com tlds" };
 
@@ -302,7 +302,7 @@ crawler.ShouldDownloadPageContent((crawledPage, crawlContext) =>
 
 crawler.ShouldCrawlPageLinks((crawledPage, crawlContext) =>
 {
-    CrawlDecision decision = new CrawlDecision();
+    CrawlDecision decision = new CrawlDecision{ Allow = true };
     if (crawledPage.PageSizeInBytes < 100)
         return new CrawlDecision { Allow = false, Reason = "Just crawl links in pages that have at least 100 bytes" };
 
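
The README examples patched above build a default CrawlDecision and only
override it to veto. A decision constructed with new CrawlDecision() leaves
its Allow property at the C# default of false, so the fall-through decision
must be created with Allow = true or every page is silently rejected. A
minimal end-to-end sketch of the same pattern, assuming the Abot 1.x API
shown in the README (the target URL is illustrative):

    using System;
    using Abot.Crawler;
    using Abot.Poco;

    class CrawlDecisionExample
    {
        static void Main()
        {
            PoliteWebCrawler crawler = new PoliteWebCrawler();

            crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
            {
                // Veto explicitly; otherwise fall through to an allow decision.
                if (pageToCrawl.Uri.Authority == "google.com")
                    return new CrawlDecision { Allow = false, Reason = "Dont want to crawl google pages" };

                return new CrawlDecision { Allow = true };
            });

            // Crawl synchronously; ErrorOccurred reports fatal crawl errors.
            CrawlResult result = crawler.Crawl(new Uri("http://example.com"));
            Console.WriteLine(result.ErrorOccurred ? "Crawl failed" : "Crawl completed");
        }
    }
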
From 9e44da875bd2edfe3f4d62e4086126c09ba3b2b6 Mon Sep 17 00:00:00 2001
From: Steven
Date: Sun, 15 Mar 2015 21:26:00 -0700
Subject: [PATCH 2/4] Updated readme [skip ci]

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 0acbfeae..50cd6744 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ Abot is an open source C# web crawler built for speed and flexibility. It takes
  * [Ask questions and search for answers on the Community Forum](http://groups.google.com/group/abot-web-crawler)
  * [Report Bugs or Suggest Features](https://github.com/sjdirect/abot/issues)
  * [Learn how you can contribute](https://github.com/sjdirect/abot/wiki/Contribute)
- * [Get custom Abot development](https://github.com/sjdirect/abot/wiki/Custom-Development)
+ * [Need expert Abot customization?](https://github.com/sjdirect/abot/wiki/Custom-Development)
  * [Take the usage survey](https://www.surveymonkey.com/s/JS5826F) to help prioritize features/improvements
  * [Consider making a donation](https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=G6ZY6BZNBFVQJ)
 

From 2853f8dedf2255de6d868240bce46e1e3c22a931 Mon Sep 17 00:00:00 2001
From: sjdirect
Date: Tue, 17 Mar 2015 14:01:55 -0700
Subject: [PATCH 3/4] made all hyperlinkparser methods protected virtual

---
 Abot/Core/CsQueryHyperLinkParser.cs | 18 +++++++++---------
 Abot/Core/HapHyperLinkParser.cs     | 28 ++++++++++++++--------------
 2 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/Abot/Core/CsQueryHyperLinkParser.cs b/Abot/Core/CsQueryHyperLinkParser.cs
index 579641ff..53d7ebaf 100644
--- a/Abot/Core/CsQueryHyperLinkParser.cs
+++ b/Abot/Core/CsQueryHyperLinkParser.cs
@@ -52,14 +52,6 @@ protected override IEnumerable<string> GetHrefValues(CrawledPage crawledPage)
             return hrefValues.Concat(canonicalHref);
         }
 
-        protected bool HasRelCanonicalPointingToDifferentUrl(IDomElement e, string orginalUrl)
-        {
-            return e.HasAttribute("rel") && !string.IsNullOrWhiteSpace(e.Attributes["rel"]) &&
-                string.Equals(e.Attributes["rel"], "canonical", StringComparison.OrdinalIgnoreCase) &&
-                e.HasAttribute("href") && !string.IsNullOrWhiteSpace(e.Attributes["href"]) &&
-                !string.Equals(e.Attributes["href"], orginalUrl, StringComparison.OrdinalIgnoreCase);
-        }
-
         protected override string GetBaseHrefValue(CrawledPage crawledPage)
         {
             string baseTagValue = crawledPage.CsQueryDocument.Select("base").Attr("href") ?? "";
@@ -71,7 +63,15 @@ protected override string GetMetaRobotsValue(CrawledPage crawledPage)
             return crawledPage.CsQueryDocument["meta[name]"].Filter(d => d.Name.ToLowerInvariant() == "robots").Attr("content");
         }
 
-        private bool HasRelNoFollow(IDomElement e)
+        protected virtual bool HasRelCanonicalPointingToDifferentUrl(IDomElement e, string orginalUrl)
+        {
+            return e.HasAttribute("rel") && !string.IsNullOrWhiteSpace(e.Attributes["rel"]) &&
+                string.Equals(e.Attributes["rel"], "canonical", StringComparison.OrdinalIgnoreCase) &&
+                e.HasAttribute("href") && !string.IsNullOrWhiteSpace(e.Attributes["href"]) &&
+                !string.Equals(e.Attributes["href"], orginalUrl, StringComparison.OrdinalIgnoreCase);
+        }
+
+        protected virtual bool HasRelNoFollow(IDomElement e)
         {
             return _isRespectAnchorRelNoFollowEnabled && (e.HasAttribute("rel") && e.GetAttribute("rel").ToLower().Trim() == "nofollow");
         }
diff --git a/Abot/Core/HapHyperLinkParser.cs b/Abot/Core/HapHyperLinkParser.cs
index aa8df2ed..990c0a5a 100644
--- a/Abot/Core/HapHyperLinkParser.cs
+++ b/Abot/Core/HapHyperLinkParser.cs
@@ -63,7 +63,17 @@ protected override string GetBaseHrefValue(CrawledPage crawledPage)
             return hrefValue;
         }
 
-        private List<string> GetLinks(HtmlNodeCollection nodes)
+        protected override string GetMetaRobotsValue(CrawledPage crawledPage)
+        {
+            string robotsMeta = null;
+            HtmlNode robotsNode = crawledPage.HtmlDocument.DocumentNode.SelectSingleNode("//meta[translate(@name,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz')='robots']");
+            if (robotsNode != null)
+                robotsMeta = robotsNode.GetAttributeValue("content", "");
+
+            return robotsMeta;
+        }
+
+        protected virtual List<string> GetLinks(HtmlNodeCollection nodes)
         {
             List<string> hrefs = new List<string>();
 
@@ -81,13 +91,13 @@ private List<string> GetLinks(HtmlNodeCollection nodes)
                 {
                     hrefValue = DeEntitize(hrefValue);
                     hrefs.Add(hrefValue);
-                    }
+                }
             }
 
             return hrefs;
         }
 
-        private string DeEntitize(string hrefValue)
+        protected virtual string DeEntitize(string hrefValue)
         {
             string dentitizedHref = hrefValue;
 
@@ -103,17 +113,7 @@ private string DeEntitize(string hrefValue)
             return dentitizedHref;
         }
 
-        protected override string GetMetaRobotsValue(CrawledPage crawledPage)
-        {
-            string robotsMeta = null;
-            HtmlNode robotsNode = crawledPage.HtmlDocument.DocumentNode.SelectSingleNode("//meta[translate(@name,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz')='robots']");
-            if (robotsNode != null)
-                robotsMeta = robotsNode.GetAttributeValue("content", "");
-
-            return robotsMeta;
-        }
-
-        private bool HasRelNoFollow(HtmlNode node)
+        protected virtual bool HasRelNoFollow(HtmlNode node)
         {
             HtmlAttribute attr = node.Attributes["rel"];
             return _isRespectAnchorRelNoFollowEnabled && (attr != null && attr.Value.ToLower().Trim() == "nofollow");
From cf8f359ec48655ec59d2d3e1d3ad6f1036275c78 Mon Sep 17 00:00:00 2001
From: sjdirect
Date: Sat, 21 Mar 2015 14:04:28 -0700
Subject: [PATCH 4/4] [skip ci]

---
 Abot/Crawler/WebCrawler.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Abot/Crawler/WebCrawler.cs b/Abot/Crawler/WebCrawler.cs
index f989f8a3..745a024e 100644
--- a/Abot/Crawler/WebCrawler.cs
+++ b/Abot/Crawler/WebCrawler.cs
@@ -918,7 +918,7 @@ protected virtual void SchedulePageLinks(CrawledPage crawledPage)
 
         protected virtual bool ShouldSchedulePageLink(PageToCrawl page)
         {
-            if ((page.IsInternal == true || _crawlContext.CrawlConfiguration.IsExternalPageCrawlingEnabled == true) && (ShouldCrawlPage(page)))
+            if ((page.IsInternal || _crawlContext.CrawlConfiguration.IsExternalPageCrawlingEnabled) && (ShouldCrawlPage(page)))
                 return true;
 
             return false;
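
ShouldSchedulePageLink is itself protected virtual, so the simplified
predicate above can be extended the same way. A hypothetical sketch that
layers a depth cap on top of the stock internal/external check (the subclass
and the limit of 5 are illustrative; it assumes the PageToCrawl.CrawlDepth
property):

    using Abot.Crawler;
    using Abot.Poco;

    // Illustrative subclass: never schedule links found more than 5 levels
    // deep, then defer to WebCrawler's default scheduling rules.
    public class DepthCappedCrawler : PoliteWebCrawler
    {
        protected override bool ShouldSchedulePageLink(PageToCrawl page)
        {
            if (page.CrawlDepth >= 5)
                return false;

            return base.ShouldSchedulePageLink(page);
        }
    }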