
Commit

added more test cases, fixed source and article bugs
codelucas committed Nov 29, 2013
1 parent 657ed20 commit 13c2e77
Showing 11 changed files with 100,238 additions and 145 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -2,6 +2,7 @@

.DS_Store # pycharm
.idea # pycharm
+1M_urls.txt

# C extensions
*.so
9 changes: 0 additions & 9 deletions README.rst
@@ -5,18 +5,10 @@ Newspaper: Article scraping & curation
    :target: http://badge.fury.io/py/textblob
    :alt: Latest version

.. image:: https://travis-ci.org/sloria/TextBlob.png?branch=master
    :target: https://travis-ci.org/sloria/TextBlob
    :alt: Travis-CI

.. image:: https://pypip.in/d/textblob/badge.png
    :target: https://crate.io/packages/textblob/
    :alt: Number of PyPI downloads

.. image:: https://badge.waffle.io/sloria/TextBlob.png?label=Ready
    :target: https://waffle.io/sloria/TextBlob
    :alt: Issues in Ready


Homepage: `https://textblob.readthedocs.org/ <https://textblob.readthedocs.org/>`_

@@ -63,7 +55,6 @@ Newspaper utilizes async io and caching for speed. Everything is in unicode :)
    print cnn_paper.brand
    # u'cnn'
    # Alternatively, you can use newspaper's lower level Article api
    from newspaper import Article
7 changes: 3 additions & 4 deletions newspaper/article.py
@@ -22,7 +22,7 @@

class Article(object):

-    def __init__(self, url, title=u'', source_url=None, from_feed=False):
+    def __init__(self, url, title=u'', source_url=None):
        if source_url is None:
            source_url = get_scheme(url) + '://' + get_domain(url)

@@ -45,7 +45,6 @@ def __init__(self, url, title=u'', source_url=None, from_feed=False):
        self.domain = get_domain(source_url)
        self.scheme = get_scheme(source_url)
        self.rejected = False
-        self.from_feed = from_feed

self.html = u''
self.lxml_root = None
@@ -54,8 +53,8 @@ def __init__(self, url, title=u'', source_url=None, from_feed=False):

        # If a url is from a feed, we know it's pre-validated,
        # otherwise, we need to make sure its a news article.
-        if not from_feed:
-            self.verify_url()
+        # if not from_feed: TODO Once we figure out feedparser again, restore this
+        self.verify_url()

    def build(self):
        """build a lone article from a url independent of the
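With the from_feed shortcut removed, verify_url() now runs for every Article at construction time. A minimal usage sketch (the URL is hypothetical; rejected and build() come from the constructor and class shown above):

    from newspaper import Article

    # verify_url() always runs in __init__ now; the assumption here is that
    # a url failing verification leaves the instance marked as rejected.
    url = 'http://www.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html'
    article = Article(url)
    print article.rejected   # False when the url is accepted as a news article
    article.build()          # build the lone article from its url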
21 changes: 19 additions & 2 deletions newspaper/network.py
@@ -2,14 +2,17 @@

import logging
import requests
+import grequests

from .settings import cj, USERAGENT

log = logging.getLogger(__name__)

-def get_html(url, timeout=7):
-    """downloads the html of a url"""
+def get_html(url, response=None, timeout=7):
+    """retrieves the html for either a url or a response object"""

+    if response is not None:
+        return response.text
    try:
        req_kwargs = {
            'headers' : {'User-Agent': USERAGENT},
@@ -24,3 +27,17 @@ def get_html(url, timeout=7):
    except Exception, e:
        log.debug('%s on %s' % (e, url))
        return u''

+def async_request(urls, timeout=7):
+    """receives a list of requests and sends them all
+    asynchronously at once"""
+
+    req_kwargs = {
+        'headers' : {'User-Agent': USERAGENT},
+        'cookies' : cj(),
+        'timeout' : timeout,
+        'allow_redirects' : True
+    }
+    rs = (grequests.request('GET', url, **req_kwargs) for url in urls)
+    responses = grequests.map(rs) # send all requests at once async
+    return responses
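The new response parameter and async_request can be used together: fire the whole batch of GETs at once via grequests, then hand each response back to get_html instead of re-downloading. A rough usage sketch with hypothetical urls, assuming the module is importable as newspaper.network:

    from newspaper.network import async_request, get_html

    urls = ['http://www.cnn.com/world', 'http://www.cnn.com/tech']

    # One async batch; grequests.map() keeps input order and yields None
    # for any request that failed.
    responses = async_request(urls, timeout=7)

    # get_html() short-circuits on a response object; a None response falls
    # through to the original synchronous download path.
    htmls = [get_html(url, response=r) for url, r in zip(urls, responses)]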
24 changes: 16 additions & 8 deletions newspaper/parsers.py
@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-

+import re
import logging
import urlparse

@@ -44,18 +45,25 @@ def get_lxml_root(html):
        print str(e)
        return None

-def get_urls(root_or_html, titles=True):
+def get_urls(_input, titles=True, istext=False):
    """returns a list of urls on the html page or lxml_root"""

-    if root_or_html is None:
-        log.critical('Must extract urls from either html or lxml_root!')
+    if _input is None:
+        log.critical('Must extract urls from either html, text or lxml_root!')
        return []

+    # If we are extracting from raw text
+    if istext:
+        _input = re.sub('<[^<]+?>', ' ', _input)
+        _input = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', _input)
+        _input = [i.strip() for i in _input]
+        return _input or []

    # If the input is html, parse it into a root
-    if isinstance(root_or_html, str) or isinstance(root_or_html, unicode):
-        lxml_root = get_lxml_root(root_or_html)
+    if isinstance(_input, str) or isinstance(_input, unicode):
+        lxml_root = get_lxml_root(_input)
    else:
-        lxml_root = root_or_html
+        lxml_root = _input

    if lxml_root is None:
        return []
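The istext branch skips lxml entirely: re.sub blanks out anything tag-shaped, then the findall regex pulls bare http(s) urls from whatever text remains. A small sketch of that behaviour (hypothetical input string, assuming the module is importable as newspaper.parsers):

    from newspaper.parsers import get_urls

    # Tags are blanked out before matching, so only urls visible as text
    # survive; the copy inside href="..." disappears with its tag.
    text = 'Read more at <a href="http://cnn.com/world">http://cnn.com/world</a> now'
    print get_urls(text, istext=True)
    # ['http://cnn.com/world']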
@@ -169,8 +177,8 @@ def get_feed_urls(source):
    we extract category urls first and then feeds"""

    feed_urls = []
-    for category_obj in source.category_objs:
-        root = category_obj[2]
+    for category in source.categories:
+        root = category.lxml_root
        feed_urls.extend(root.xpath('//*[@type="application/rss+xml"]/@href'))

    feed_urls = feed_urls[:50]
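get_feed_urls now iterates source.categories, where each category object exposes its own parsed lxml_root, instead of indexing position 2 of a tuple. The feed discovery itself is a single xpath over each category page, which can be exercised standalone (made-up HTML, lxml only):

    import lxml.html

    # Hypothetical category page advertising its RSS feed via a <link> tag.
    html = '''<html><head>
    <link rel="alternate" type="application/rss+xml"
          href="http://example.com/rss/world.xml">
    </head><body></body></html>'''

    root = lxml.html.fromstring(html)
    print root.xpath('//*[@type="application/rss+xml"]/@href')
    # ['http://example.com/rss/world.xml']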
