
Commit

added more test cases, fixed source and article bugs
codelucas committed Nov 29, 2013
1 parent 657ed20 commit 13c2e77
Showing 11 changed files with 100,238 additions and 145 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -2,6 +2,7 @@

.DS_Store # pycharm
.idea # pycharm
+1M_urls.txt

# C extensions
*.so
9 changes: 0 additions & 9 deletions README.rst
@@ -5,18 +5,10 @@ Newspaper: Article scraping & curation
    :target: http://badge.fury.io/py/textblob
    :alt: Latest version

.. image:: https://travis-ci.org/sloria/TextBlob.png?branch=master
    :target: https://travis-ci.org/sloria/TextBlob
    :alt: Travis-CI

.. image:: https://pypip.in/d/textblob/badge.png
    :target: https://crate.io/packages/textblob/
    :alt: Number of PyPI downloads

.. image:: https://badge.waffle.io/sloria/TextBlob.png?label=Ready
    :target: https://waffle.io/sloria/TextBlob
    :alt: Issues in Ready


Homepage: `https://textblob.readthedocs.org/ <https://textblob.readthedocs.org/>`_

@@ -63,7 +55,6 @@ Newspaper utilizes async io and caching for speed. Everything is in unicode :)
    print cnn_paper.brand
    # u'cnn'
    # Alternatively, you can use newspaper's lower level Article api
    from newspaper import Article
7 changes: 3 additions & 4 deletions newspaper/article.py
@@ -22,7 +22,7 @@

class Article(object):

-    def __init__(self, url, title=u'', source_url=None, from_feed=False):
+    def __init__(self, url, title=u'', source_url=None):
        if source_url is None:
            source_url = get_scheme(url) + '://' + get_domain(url)

@@ -45,7 +45,6 @@ def __init__(self, url, title=u'', source_url=None, from_feed=False):
        self.domain = get_domain(source_url)
        self.scheme = get_scheme(source_url)
        self.rejected = False
-        self.from_feed = from_feed

self.html = u''
self.lxml_root = None
@@ -54,8 +53,8 @@ def __init__(self, url, title=u'', source_url=None, from_feed=False):

        # If a url is from a feed, we know it's pre-validated,
        # otherwise, we need to make sure its a news article.
-        if not from_feed:
-            self.verify_url()
+        # if not from_feed: TODO Once we figure out feedparser again, restore this
+        self.verify_url()

    def build(self):
        """build a lone article from a url independent of the
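With the from_feed shortcut removed, verify_url() now runs for every Article at construction time. A minimal usage sketch (the URL is hypothetical; rejected and build() come from the constructor and class shown above):

    from newspaper import Article

    # verify_url() always runs in __init__ now; the assumption here is that
    # a url failing verification leaves the instance marked as rejected.
    url = 'http://www.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html'
    article = Article(url)
    print article.rejected   # False when the url is accepted as a news article
    article.build()          # build the lone article from its url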
21 changes: 19 additions & 2 deletions newspaper/network.py
@@ -2,14 +2,17 @@

import logging
import requests
+import grequests

from .settings import cj, USERAGENT

log = logging.getLogger(__name__)

-def get_html(url, timeout=7):
-    """downloads the html of a url"""
+def get_html(url, response=None, timeout=7):
+    """retrieves the html for either a url or a response object"""

+    if response is not None:
+        return response.text
    try:
        req_kwargs = {
            'headers' : {'User-Agent': USERAGENT},
@@ -24,3 +27,17 @@ def get_html(url, timeout=7):
    except Exception, e:
        log.debug('%s on %s' % (e, url))
        return u''

+def async_request(urls, timeout=7):
+    """receives a list of requests and sends them all
+    asynchronously at once"""
+
+    req_kwargs = {
+        'headers' : {'User-Agent': USERAGENT},
+        'cookies' : cj(),
+        'timeout' : timeout,
+        'allow_redirects' : True
+    }
+    rs = (grequests.request('GET', url, **req_kwargs) for url in urls)
+    responses = grequests.map(rs) # send all requests at once async
+    return responses
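The new response parameter and async_request can be used together: fire the whole batch of GETs at once via grequests, then hand each response back to get_html instead of re-downloading. A rough usage sketch with hypothetical urls, assuming the module is importable as newspaper.network:

    from newspaper.network import async_request, get_html

    urls = ['http://www.cnn.com/world', 'http://www.cnn.com/tech']

    # One async batch; grequests.map() keeps input order and yields None
    # for any request that failed.
    responses = async_request(urls, timeout=7)

    # get_html() short-circuits on a response object; a None response falls
    # through to the original synchronous download path.
    htmls = [get_html(url, response=r) for url, r in zip(urls, responses)]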
24 changes: 16 additions & 8 deletions newspaper/parsers.py
@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-

+import re
import logging
import urlparse

@@ -44,18 +45,25 @@ def get_lxml_root(html):
        print str(e)
        return None

-def get_urls(root_or_html, titles=True):
+def get_urls(_input, titles=True, istext=False):
    """returns a list of urls on the html page or lxml_root"""

-    if root_or_html is None:
-        log.critical('Must extract urls from either html or lxml_root!')
+    if _input is None:
+        log.critical('Must extract urls from either html, text or lxml_root!')
        return []

+    # If we are extracting from raw text
+    if istext:
+        _input = re.sub('<[^<]+?>', ' ', _input)
+        _input = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', _input)
+        _input = [i.strip() for i in _input]
+        return _input or []

    # If the input is html, parse it into a root
-    if isinstance(root_or_html, str) or isinstance(root_or_html, unicode):
-        lxml_root = get_lxml_root(root_or_html)
+    if isinstance(_input, str) or isinstance(_input, unicode):
+        lxml_root = get_lxml_root(_input)
    else:
-        lxml_root = root_or_html
+        lxml_root = _input

    if lxml_root is None:
        return []
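The istext branch skips lxml entirely: re.sub blanks out anything tag-shaped, then the findall regex pulls bare http(s) urls from whatever text remains. A small sketch of that behaviour (hypothetical input string, assuming the module is importable as newspaper.parsers):

    from newspaper.parsers import get_urls

    # Tags are blanked out before matching, so only urls visible as text
    # survive; the copy inside href="..." disappears with its tag.
    text = 'Read more at <a href="http://cnn.com/world">http://cnn.com/world</a> now'
    print get_urls(text, istext=True)
    # ['http://cnn.com/world']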
@@ -169,8 +177,8 @@ def get_feed_urls(source):
    we extract category urls first and then feeds"""

    feed_urls = []
-    for category_obj in source.category_objs:
-        root = category_obj[2]
+    for category in source.categories:
+        root = category.lxml_root
        feed_urls.extend(root.xpath('//*[@type="application/rss+xml"]/@href'))

    feed_urls = feed_urls[:50]
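get_feed_urls now iterates source.categories, where each category object exposes its own parsed lxml_root, instead of indexing position 2 of a tuple. The feed discovery itself is a single xpath over each category page, which can be exercised standalone (made-up HTML, lxml only):

    import lxml.html

    # Hypothetical category page advertising its RSS feed via a <link> tag.
    html = '''<html><head>
    <link rel="alternate" type="application/rss+xml"
          href="http://example.com/rss/world.xml">
    </head><body></body></html>'''

    root = lxml.html.fromstring(html)
    print root.xpath('//*[@type="application/rss+xml"]/@href')
    # ['http://example.com/rss/world.xml']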
