Merge branch 'master' of github.com:sjdirect/abot
sjdirect committed Mar 24, 2015
2 parents b64afb3 + cf8f359 commit c35755c
Showing 4 changed files with 28 additions and 28 deletions.
18 changes: 9 additions & 9 deletions Abot/Core/CsQueryHyperLinkParser.cs
@@ -52,14 +52,6 @@ protected override IEnumerable<string> GetHrefValues(CrawledPage crawledPage)
return hrefValues.Concat(canonicalHref);
}

protected bool HasRelCanonicalPointingToDifferentUrl(IDomElement e, string orginalUrl)
{
return e.HasAttribute("rel") && !string.IsNullOrWhiteSpace(e.Attributes["rel"]) &&
string.Equals(e.Attributes["rel"], "canonical", StringComparison.OrdinalIgnoreCase) &&
e.HasAttribute("href") && !string.IsNullOrWhiteSpace(e.Attributes["href"]) &&
!string.Equals(e.Attributes["href"], orginalUrl, StringComparison.OrdinalIgnoreCase);
}

protected override string GetBaseHrefValue(CrawledPage crawledPage)
{
string baseTagValue = crawledPage.CsQueryDocument.Select("base").Attr("href") ?? "";
@@ -71,7 +63,15 @@ protected override string GetMetaRobotsValue(CrawledPage crawledPage)
return crawledPage.CsQueryDocument["meta[name]"].Filter(d => d.Name.ToLowerInvariant() == "robots").Attr("content");
}

private bool HasRelNoFollow(IDomElement e)
protected virtual bool HasRelCanonicalPointingToDifferentUrl(IDomElement e, string orginalUrl)
{
return e.HasAttribute("rel") && !string.IsNullOrWhiteSpace(e.Attributes["rel"]) &&
string.Equals(e.Attributes["rel"], "canonical", StringComparison.OrdinalIgnoreCase) &&
e.HasAttribute("href") && !string.IsNullOrWhiteSpace(e.Attributes["href"]) &&
!string.Equals(e.Attributes["href"], orginalUrl, StringComparison.OrdinalIgnoreCase);
}

protected virtual bool HasRelNoFollow(IDomElement e)
{
return _isRespectAnchorRelNoFollowEnabled && (e.HasAttribute("rel") && e.GetAttribute("rel").ToLower().Trim() == "nofollow");
}
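With HasRelCanonicalPointingToDifferentUrl and HasRelNoFollow now declared protected virtual, a derived parser can swap in its own link-filtering rules without copying the whole class. A minimal sketch of what that allows follows; the subclass name and its stricter nofollow policy are hypothetical, and it assumes the base class exposes a parameterless constructor:

using Abot.Core;
using CsQuery;

// Hypothetical subclass illustrating the extension point opened by this change.
public class StrictCsQueryHyperLinkParser : CsQueryHyperLinkParser
{
    protected override bool HasRelNoFollow(IDomElement e)
    {
        // Illustrative policy: honor rel="nofollow" unconditionally,
        // regardless of the crawler's configuration flag.
        return e.HasAttribute("rel") &&
               e.GetAttribute("rel").ToLower().Trim().Contains("nofollow");
    }
}

Such a parser would typically be handed to the crawler through its constructor in place of the default implementation.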
28 changes: 14 additions & 14 deletions Abot/Core/HapHyperLinkParser.cs
@@ -63,7 +63,17 @@ protected override string GetBaseHrefValue(CrawledPage crawledPage)
return hrefValue;
}

private List<string> GetLinks(HtmlNodeCollection nodes)
protected override string GetMetaRobotsValue(CrawledPage crawledPage)
{
string robotsMeta = null;
HtmlNode robotsNode = crawledPage.HtmlDocument.DocumentNode.SelectSingleNode("//meta[translate(@name,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz')='robots']");
if (robotsNode != null)
robotsMeta = robotsNode.GetAttributeValue("content", "");

return robotsMeta;
}

protected virtual List<string> GetLinks(HtmlNodeCollection nodes)
{
List<string> hrefs = new List<string>();

@@ -81,13 +91,13 @@ private List<string> GetLinks(HtmlNodeCollection nodes)
{
hrefValue = DeEntitize(hrefValue);
hrefs.Add(hrefValue);
}
}
}

return hrefs;
}

private string DeEntitize(string hrefValue)
protected virtual string DeEntitize(string hrefValue)
{
string dentitizedHref = hrefValue;

@@ -103,17 +113,7 @@ private string DeEntitize(string hrefValue)
return dentitizedHref;
}

protected override string GetMetaRobotsValue(CrawledPage crawledPage)
{
string robotsMeta = null;
HtmlNode robotsNode = crawledPage.HtmlDocument.DocumentNode.SelectSingleNode("//meta[translate(@name,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz')='robots']");
if (robotsNode != null)
robotsMeta = robotsNode.GetAttributeValue("content", "");

return robotsMeta;
}

private bool HasRelNoFollow(HtmlNode node)
protected virtual bool HasRelNoFollow(HtmlNode node)
{
HtmlAttribute attr = node.Attributes["rel"];
return _isRespectAnchorRelNoFollowEnabled && (attr != null && attr.Value.ToLower().Trim() == "nofollow");
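The same refactoring in HapHyperLinkParser makes GetLinks, DeEntitize, and HasRelNoFollow overridable, so the extraction step itself can be customized. A minimal sketch, again with a hypothetical subclass and filter rule, and assuming the base class's parameterless constructor:

using System;
using System.Collections.Generic;
using System.Linq;
using Abot.Core;
using HtmlAgilityPack;

// Hypothetical subclass: reuse the base extraction, then drop links to PDF documents.
public class PdfSkippingHapHyperLinkParser : HapHyperLinkParser
{
    protected override List<string> GetLinks(HtmlNodeCollection nodes)
    {
        // Illustrative filter, not part of this commit.
        return base.GetLinks(nodes)
                   .Where(href => !href.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase))
                   .ToList();
    }
}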
2 changes: 1 addition & 1 deletion Abot/Crawler/WebCrawler.cs
@@ -918,7 +918,7 @@ protected virtual void SchedulePageLinks(CrawledPage crawledPage)

protected virtual bool ShouldSchedulePageLink(PageToCrawl page)
{
if ((page.IsInternal == true || _crawlContext.CrawlConfiguration.IsExternalPageCrawlingEnabled == true) && (ShouldCrawlPage(page)))
if ((page.IsInternal || _crawlContext.CrawlConfiguration.IsExternalPageCrawlingEnabled) && (ShouldCrawlPage(page)))
return true;

return false;
8 changes: 4 additions & 4 deletions README.md
@@ -20,7 +20,7 @@ Abot is an open source C# web crawler built for speed and flexibility. It takes
* [Ask questions and search for answers on the Community Forum](http://groups.google.com/group/abot-web-crawler)
* [Report Bugs or Suggest Features](https://github.com/sjdirect/abot/issues)
* [Learn how you can contribute](https://github.com/sjdirect/abot/wiki/Contribute)
* [Get custom Abot development](https://github.com/sjdirect/abot/wiki/Custom-Development)
* [Need expert Abot customization?](https://github.com/sjdirect/abot/wiki/Custom-Development)
* [Take the usage survey](https://www.surveymonkey.com/s/JS5826F) to help prioritize features/improvements
* [Consider making a donation](https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=G6ZY6BZNBFVQJ)

@@ -284,7 +284,7 @@ PoliteWebCrawler crawler = new PoliteWebCrawler();

crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
{
CrawlDecision decision = new CrawlDecision();
CrawlDecision decision = new CrawlDecision{ Allow = true };
if(pageToCrawl.Uri.Authority == "google.com")
return new CrawlDecision{ Allow = false, Reason = "Dont want to crawl google pages" };

@@ -293,7 +293,7 @@ crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>

crawler.ShouldDownloadPageContent((crawledPage, crawlContext) =>
{
CrawlDecision decision = new CrawlDecision();
CrawlDecision decision = new CrawlDecision{ Allow = true };
if (!crawledPage.Uri.AbsoluteUri.Contains(".com"))
return new CrawlDecision { Allow = false, Reason = "Only download raw page content for .com tlds" };

@@ -302,7 +302,7 @@ crawler.ShouldDownloadPageContent((crawledPage, crawlContext) =>

crawler.ShouldCrawlPageLinks((crawledPage, crawlContext) =>
{
CrawlDecision decision = new CrawlDecision();
CrawlDecision decision = new CrawlDecision{ Allow = true };
if (crawledPage.PageSizeInBytes < 100)
return new CrawlDecision { Allow = false, Reason = "Just crawl links in pages that have at least 100 bytes" };

