# -*- coding: utf-8 -*- """ All unit tests for the newspaper library should be contained in this file. """ import sys import os import unittest import time import traceback from collections import defaultdict, OrderedDict import concurrent.futures TEST_DIR = os.path.abspath(os.path.dirname(__file__)) PARENT_DIR = os.path.join(TEST_DIR, '..') # newspaper's unit tests are in their own separate module, so # insert the parent directory manually to gain scope of the # core module sys.path.insert(0, PARENT_DIR) TEXT_FN = os.path.join(TEST_DIR, 'data', 'text') HTML_FN = os.path.join(TEST_DIR, 'data', 'html') URLS_FILE = os.path.join(TEST_DIR, 'data', 'fulltext_url_list.txt') import newspaper from newspaper import Article, fulltext, Source, ArticleException, news_pool from newspaper.configuration import Configuration from newspaper.urls import get_domain def print_test(method): """ Utility method for print verbalizing test suite, prints out time taken for test and functions name, and status """ def run(*args, **kw): ts = time.time() print('\ttesting function %r' % method.__name__) method(*args, **kw) te = time.time() print('\t[OK] in %r %2.2f sec' % (method.__name__, te - ts)) return run def mock_resource_with(filename, resource_type): """ Mocks an HTTP request by pulling text from a pre-downloaded file """ VALID_RESOURCES = ['html', 'txt'] if resource_type not in VALID_RESOURCES: raise Exception('Mocked resource must be one of: %s' % ', '.join(VALID_RESOURCES)) subfolder = 'text' if resource_type == 'txt' else 'html' resource_path = os.path.join(TEST_DIR, "data/%s/%s.%s" % (subfolder, filename, resource_type)) with open(resource_path, 'r') as f: return f.read() def get_base_domain(url): """ For example, the base url of uk.reuters.com => reuters.com """ domain = get_domain(url) tld = '.'.join(domain.split('.')[-2:]) if tld in ['co.uk', 'com.au', 'au.com']: # edge cases end_chunks = domain.split('.')[-3:] else: end_chunks = domain.split('.')[-2:] base_domain = '.'.join(end_chunks) return base_domain def check_url(*args, **kwargs): return ExhaustiveFullTextCase.check_url(*args, **kwargs) @unittest.skipIf('fulltext' not in sys.argv, 'Skipping fulltext tests') class ExhaustiveFullTextCase(unittest.TestCase): @staticmethod def check_url(args): """ :param (basestr, basestr) url, res_filename: :return: (pubdate_failed, fulltext_failed) """ url, res_filename = args pubdate_failed, fulltext_failed = False, False html = mock_resource_with(res_filename, 'html') try: a = Article(url) a.download(html) a.parse() if a.publish_date is None: pubdate_failed = True except Exception: print('<< URL: %s parse ERROR >>' % url) traceback.print_exc() pubdate_failed, fulltext_failed = True, True else: correct_text = mock_resource_with(res_filename, 'txt') if not (a.text == correct_text): # print('Diff: ', simplediff.diff(correct_text, a.text)) # `correct_text` holds the reason of failure if failure print('%s -- %s -- %s' % ('Fulltext failed', res_filename, correct_text.strip())) fulltext_failed = True # TODO: assert statements are commented out for full-text # extraction tests because we are constantly tweaking the # algorithm and improving # assert a.text == correct_text return pubdate_failed, fulltext_failed @print_test def test_exhaustive(self): with open(URLS_FILE, 'r') as f: urls = [d.strip() for d in f.readlines() if d.strip()] domain_counters = {} def get_filename(url): domain = get_base_domain(url) domain_counters[domain] = domain_counters.get(domain, 0) + 1 return '{}{}'.format(domain, domain_counters[domain]) filenames = map(get_filename, urls) with concurrent.futures.ProcessPoolExecutor() as executor: test_results = list(executor.map(check_url, zip(urls, filenames))) total_pubdates_failed, total_fulltext_failed = \ list(map(sum, zip(*test_results))) print('%s fulltext extractions failed out of %s' % (total_fulltext_failed, len(urls))) print('%s pubdate extractions failed out of %s' % (total_pubdates_failed, len(urls))) self.assertGreaterEqual(47, total_pubdates_failed) self.assertGreaterEqual(20, total_fulltext_failed) class ArticleTestCase(unittest.TestCase): def setup_stage(self, stage_name): stages = OrderedDict([ ('initial', lambda: None), ('download', lambda: self.article.download( mock_resource_with('cnn_article', 'html'))), ('parse', lambda: self.article.parse()), ('meta', lambda: None), # Alias for nlp ('nlp', lambda: self.article.nlp()) ]) assert stage_name in stages for name, action in stages.items(): if name == stage_name: break action() def setUp(self): """Called before the first test case of this unit begins """ self.article = Article( url='http://www.cnn.com/2013/11/27/travel/weather-' 'thanksgiving/index.html?iref=allsearch') @print_test def test_url(self): self.assertEqual( 'http://www.cnn.com/2013/11/27/travel/weather-' 'thanksgiving/index.html?iref=allsearch', self.article.url) @print_test def test_download_html(self): self.setup_stage('download') html = mock_resource_with('cnn_article', 'html') self.article.download(html) self.assertEqual(75406, len(self.article.html)) @print_test def test_meta_refresh_redirect(self): # TODO: We actually hit example.com in this unit test ... which is bad # Figure out how to mock an actual redirect config = Configuration() config.follow_meta_refresh = True article = Article( '', config=config) html = mock_resource_with('google_meta_refresh', 'html') article.download(input_html=html) article.parse() self.assertEqual(article.title, 'Example Domain') @print_test def test_meta_refresh_no_url_redirect(self): config = Configuration() config.follow_meta_refresh = True article = Article( '', config=config) html = mock_resource_with('ap_meta_refresh', 'html') article.download(input_html=html) article.parse() self.assertEqual(article.title, 'News from The Associated Press') @print_test def test_pre_download_parse(self): """Calling `parse()` before `download()` should yield an error """ article = Article(self.article.url) self.assertRaises(ArticleException, article.parse) @print_test def test_parse_html(self): self.setup_stage('parse') AUTHORS = ['Chien-Ming Wang', 'Dana A. Ford', 'James S.A. Corey', 'Tom Watkins'] TITLE = 'After storm, forecasters see smooth sailing for Thanksgiving' LEN_IMGS = 46 META_LANG = 'en' self.article.parse() self.article.nlp() text = mock_resource_with('cnn', 'txt') self.assertEqual(text, self.article.text) self.assertEqual(text, fulltext(self.article.html)) # NOTE: top_img extraction requires an internet connection # unlike the rest of this test file TOP_IMG = ('http://i2.cdn.turner.com/cnn/dam/assets/131129200805-' '01-weather-1128-story-top.jpg') self.assertEqual(TOP_IMG, self.article.top_img) self.assertCountEqual(AUTHORS, self.article.authors) self.assertEqual(TITLE, self.article.title) self.assertEqual(LEN_IMGS, len(self.article.imgs)) self.assertEqual(META_LANG, self.article.meta_lang) self.assertEqual('2013-11-27 00:00:00', str(self.article.publish_date)) @print_test def test_meta_type_extraction(self): self.setup_stage('meta') meta_type = self.article.extractor.get_meta_type( self.article.clean_doc) self.assertEqual('article', meta_type) @print_test def test_meta_extraction(self): self.setup_stage('meta') meta = self.article.extractor.get_meta_data(self.article.clean_doc) META_DATA = defaultdict(dict, { 'medium': 'news', 'googlebot': 'noarchive', 'pubdate': '2013-11-27T08:36:32Z', 'title': 'After storm, forecasters see smooth sailing for Thanksgiving - CNN.com', 'og': {'site_name': 'CNN', 'description': 'A strong storm struck much of the eastern United States on Wednesday, complicating holiday plans for many of the 43 million Americans expected to travel.', 'title': 'After storm, forecasters see smooth sailing for Thanksgiving', 'url': 'http://www.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html', 'image': 'http://i2.cdn.turner.com/cnn/dam/assets/131129200805-01-weather-1128-story-top.jpg', 'type': 'article'}, 'section': 'travel', 'author': 'Dana A. Ford, James S.A. Corey, Chien-Ming Wang, and Tom Watkins, CNN', 'robots': 'index,follow', 'vr': { 'canonical': 'http://edition.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html'}, 'source': 'CNN', 'fb': {'page_id': 18793419640, 'app_id': 80401312489}, 'keywords': 'winter storm,holiday travel,Thanksgiving storm,Thanksgiving winter storm', 'article': { 'publisher': 'https://www.facebook.com/cnninternational'}, 'lastmod': '2013-11-28T02:03:23Z', 'twitter': {'site': {'identifier': '@CNNI', 'id': 2097571}, 'card': 'summary', 'creator': {'identifier': '@cnntravel', 'id': 174377718}}, 'viewport': 'width=1024', 'news_keywords': 'winter storm,holiday travel,Thanksgiving storm,Thanksgiving winter storm' }) self.assertDictEqual(META_DATA, meta) # if the value for a meta key is another dict, that dict ought to be # filled with keys and values dict_values = [v for v in list(meta.values()) if isinstance(v, dict)] self.assertTrue(all([len(d) > 0 for d in dict_values])) # there are exactly 5 top-level "og:type" type keys is_dict = lambda v: isinstance(v, dict) self.assertEqual(5, len([i for i in meta.values() if is_dict(i)])) # there are exactly 12 top-level "pubdate" type keys is_string = lambda v: isinstance(v, str) self.assertEqual(12, len([i for i in meta.values() if is_string(i)])) @print_test def test_pre_download_nlp(self): """Test running NLP algos before even downloading the article """ self.setup_stage('initial') new_article = Article(self.article.url) self.assertRaises(ArticleException, new_article.nlp) @print_test def test_pre_parse_nlp(self): """Test running NLP algos before parsing the article """ self.setup_stage('parse') self.assertRaises(ArticleException, self.article.nlp) @print_test def test_nlp_body(self): self.setup_stage('nlp') self.article.nlp() KEYWORDS = ['balloons', 'delays', 'flight', 'forecasters', 'good', 'sailing', 'smooth', 'storm', 'thanksgiving', 'travel', 'weather', 'winds', 'york'] SUMMARY = mock_resource_with('cnn_summary', 'txt') self.assertEqual(SUMMARY, self.article.summary) self.assertCountEqual(KEYWORDS, self.article.keywords) class ContentExtractorTestCase(unittest.TestCase): """Test specific element extraction cases""" def setUp(self): self.extractor = newspaper.extractors.ContentExtractor(Configuration()) self.parser = newspaper.parsers.Parser def _get_title(self, html): doc = self.parser.fromstring(html) return self.extractor.get_title(doc) def test_get_title_basic(self): html = 'Test title' self.assertEqual(self._get_title(html), 'Test title') def test_get_title_split(self): html = 'Test page » Test title' self.assertEqual(self._get_title(html), 'Test title') def test_get_title_split_escaped(self): html = 'Test page » Test title' self.assertEqual(self._get_title(html), 'Test title') def test_get_title_quotes(self): title = 'Test page and «something in quotes»' html = '{}'.format(title) self.assertEqual(self._get_title(html), title) def _get_canonical_link(self, article_url, html): doc = self.parser.fromstring(html) return self.extractor.get_canonical_link(article_url, doc) def test_get_canonical_link_rel_canonical(self): url = 'http://www.example.com/article.html' html = ''.format(url) self.assertEqual(self._get_canonical_link('', html), url) def test_get_canonical_link_rel_canonical_absolute_url(self): url = 'http://www.example.com/article.html' html = '' article_url = 'http://www.example.com/article?foo=bar' self.assertEqual(self._get_canonical_link(article_url, html), url) def test_get_canonical_link_og_url_absolute_url(self): url = 'http://www.example.com/article.html' html = '' article_url = 'http://www.example.com/article?foo=bar' self.assertEqual(self._get_canonical_link(article_url, html), url) def test_get_canonical_link_hostname_og_url_absolute_url(self): url = 'http://www.example.com/article.html' html = '' article_url = 'http://www.example.com/article?foo=bar' self.assertEqual(self._get_canonical_link(article_url, html), url) class SourceTestCase(unittest.TestCase): @print_test def test_source_url_input_none(self): with self.assertRaises(Exception): Source(url=None) @unittest.skip("Need to mock download") @print_test def test_source_build(self): """ builds a source object, validates it has no errors, prints out all valid categories and feed urls """ DESC = ('CNN.com International delivers breaking news from across ' 'the globe and information on the latest top stories, ' 'business, sports and entertainment headlines. Follow the ' 'news as it happens through: special reports, videos, ' 'audio, photo galleries plus interactive maps and timelines.') CATEGORY_URLS = [ 'http://cnn.com/ASIA', 'http://connecttheworld.blogs.cnn.com', 'http://cnn.com/HLN', 'http://cnn.com/MIDDLEEAST', 'http://cnn.com', 'http://ireport.cnn.com', 'http://cnn.com/video', 'http://transcripts.cnn.com', 'http://cnn.com/espanol', 'http://partners.cnn.com', 'http://www.cnn.com', 'http://cnn.com/US', 'http://cnn.com/EUROPE', 'http://cnn.com/TRAVEL', 'http://cnn.com/cnni', 'http://cnn.com/SPORT', 'http://cnn.com/mostpopular', 'http://arabic.cnn.com', 'http://cnn.com/WORLD', 'http://cnn.com/LATINAMERICA', 'http://us.cnn.com', 'http://travel.cnn.com', 'http://mexico.cnn.com', 'http://cnn.com/SHOWBIZ', 'http://edition.cnn.com', 'http://amanpour.blogs.cnn.com', 'http://money.cnn.com', 'http://cnn.com/tools/index.html', 'http://cnnespanol.cnn.com', 'http://cnn.com/CNNI', 'http://business.blogs.cnn.com', 'http://cnn.com/AFRICA', 'http://cnn.com/TECH', 'http://cnn.com/BUSINESS'] FEEDS = ['http://rss.cnn.com/rss/edition.rss'] BRAND = 'cnn' s = Source('http://cnn.com', verbose=False, memoize_articles=False) # html = mock_resource_with('http://cnn.com', 'cnn_main_site') s.clean_memo_cache() s.build() # TODO: The rest of the source extraction features will be fully tested # after I figure out a way to sensibly mock the HTTP requests for all # of the category and feeed URLs # assert s.brand == BRAND # assert s.description == DESC # assert s.size() == 266 # assert s.category_urls() == CATEGORY_URLS # TODO: A lot of the feed extraction is NOT being tested because feeds # are primarly extracted from the HTML of category URLs. We lose this # effect by just mocking CNN's main page HTML. Warning: tedious fix. # assert s.feed_urls() == FEEDS @unittest.skip("Need to mock download") @print_test def test_cache_categories(self): """Builds two same source objects in a row examines speeds of both """ url = 'http://uk.yahoo.com' html = mock_resource_with('yahoo_main_site', 'html') s = Source(url) s.download() s.parse() s.set_categories() saved_urls = s.category_urls() s.categories = [] s.set_categories() self.assertCountEqual(saved_urls, s.category_urls()) class UrlTestCase(unittest.TestCase): @print_test def test_valid_urls(self): """Prints out a list of urls with our heuristic guess if it is a valid news url purely based on the url """ from newspaper.urls import valid_url with open(os.path.join(TEST_DIR, 'data/test_urls.txt'), 'r') as f: lines = f.readlines() test_tuples = [tuple(l.strip().split(' ')) for l in lines] # tuples are ('1', 'url_goes_here') form, '1' means valid, # '0' otherwise for lst, url in test_tuples: truth_val = bool(int(lst)) try: self.assertEqual(truth_val, valid_url(url, test=True)) except AssertionError: print('\t\turl: %s is supposed to be %s' % (url, truth_val)) raise @unittest.skip("Need to write an actual test") @print_test def test_prepare_url(self): """Normalizes a url, removes arguments, hashtags. If a relative url, it merges it with the source domain to make an abs url, etc """ pass class APITestCase(unittest.TestCase): @print_test def test_hot_trending(self): """Grab google trending, just make sure this runs """ newspaper.hot() @print_test def test_popular_urls(self): """Just make sure this method runs """ newspaper.popular_urls() @unittest.skip("Need to mock download") class MThreadingTestCase(unittest.TestCase): @print_test def test_download_works(self): config = Configuration() config.memoize_articles = False slate_paper = newspaper.build('http://slate.com', config=config) tc_paper = newspaper.build('http://techcrunch.com', config=config) espn_paper = newspaper.build('http://espn.com', config=config) print(('Slate has %d articles TC has %d articles ESPN has %d articles' % (slate_paper.size(), tc_paper.size(), espn_paper.size()))) papers = [slate_paper, tc_paper, espn_paper] news_pool.set(papers, threads_per_source=2) news_pool.join() print('Downloaded Slate mthread len', len(slate_paper.articles[0].html)) print('Downloaded ESPN mthread len', len(espn_paper.articles[-1].html)) print('Downloaded TC mthread len', len(tc_paper.articles[1].html)) class ConfigBuildTestCase(unittest.TestCase): """Test if our **kwargs to config building setup actually works. NOTE: No need to mock responses as we are just initializing the objects, not actually calling download(..) """ @print_test def test_article_default_params(self): a = Article(url='http://www.cnn.com/2013/11/27/' 'travel/weather-thanksgiving/index.html') self.assertEqual('en', a.config.language) self.assertTrue(a.config.memoize_articles) self.assertTrue(a.config.use_meta_language) @print_test def test_article_custom_params(self): a = Article(url='http://www.cnn.com/2013/11/27/travel/' 'weather-thanksgiving/index.html', language='zh', memoize_articles=False) self.assertEqual('zh', a.config.language) self.assertFalse(a.config.memoize_articles) self.assertFalse(a.config.use_meta_language) @print_test def test_source_default_params(self): s = Source(url='http://cnn.com') self.assertEqual('en', s.config.language) self.assertEqual(20000, s.config.MAX_FILE_MEMO) self.assertTrue(s.config.memoize_articles) self.assertTrue(s.config.use_meta_language) @print_test def test_source_custom_params(self): s = Source(url="http://cnn.com", memoize_articles=False, MAX_FILE_MEMO=10000, language='en') self.assertFalse(s.config.memoize_articles) self.assertEqual(10000, s.config.MAX_FILE_MEMO) self.assertEqual('en', s.config.language) self.assertFalse(s.config.use_meta_language) class MultiLanguageTestCase(unittest.TestCase): @print_test def test_chinese_fulltext_extract(self): url = 'http://news.sohu.com/20050601/n225789219.shtml' article = Article(url=url, language='zh') html = mock_resource_with('chinese_article', 'html') article.download(html) article.parse() text = mock_resource_with('chinese', 'txt') self.assertEqual(text, article.text) self.assertEqual(text, fulltext(article.html, 'zh')) @print_test def test_arabic_fulltext_extract(self): url = 'http://arabic.cnn.com/2013/middle_east/8/3/syria.clashes/' \ 'index.html' article = Article(url=url) html = mock_resource_with('arabic_article', 'html') article.download(html) article.parse() self.assertEqual('ar', article.meta_lang) text = mock_resource_with('arabic', 'txt') self.assertEqual(text, article.text) self.assertEqual(text, fulltext(article.html, 'ar')) @print_test def test_spanish_fulltext_extract(self): url = 'http://ultimahora.es/mallorca/noticia/noticias/local/fiscal' \ 'ia-anticorrupcion-estudia-recurre-imputacion-infanta.html' article = Article(url=url, language='es') html = mock_resource_with('spanish_article', 'html') article.download(html) article.parse() text = mock_resource_with('spanish', 'txt') self.assertEqual(text, article.text) self.assertEqual(text, fulltext(article.html, 'es')) if __name__ == '__main__': argv = list(sys.argv) if 'fulltext' in argv: argv.remove('fulltext') # remove it here, so it doesn't pass to unittest unittest.main(verbosity=0, argv=argv)