From ed80bbce1bcd7398a0f47fa1f355f4cc21c9142c Mon Sep 17 00:00:00 2001 From: palewire Date: Sun, 4 Dec 2022 06:50:40 -0800 Subject: [PATCH] Rework other extract commands to use the archive.org files --- newshomepages/extract/accessibility.py | 2 +- newshomepages/extract/hyperlinks.py | 13 +++++++------ newshomepages/extract/lighthouse.py | 13 +++++++------ newshomepages/extract/utils.py | 9 +-------- newshomepages/extract/wayback.py | 2 +- newshomepages/utils.py | 16 +++++++++++----- 6 files changed, 28 insertions(+), 27 deletions(-) diff --git a/newshomepages/extract/accessibility.py b/newshomepages/extract/accessibility.py index c894ccd14ef..3873873e075 100644 --- a/newshomepages/extract/accessibility.py +++ b/newshomepages/extract/accessibility.py @@ -31,7 +31,7 @@ def accessibility(handle): print(f"{len(site_df)} accessibility files found") # Read in the output file - output_path = utils.ANALYSIS_DIR / f"{handle.lower()}-accessibility.csv" + output_path = utils.THIS_DIR / f"{handle.lower()}-accessibility.csv" try: output_df = pd.read_csv(output_path) downloaded_files = set(output_df.file_url.unique()) diff --git a/newshomepages/extract/hyperlinks.py b/newshomepages/extract/hyperlinks.py index f8aeb231c2d..4147c1afef6 100644 --- a/newshomepages/extract/hyperlinks.py +++ b/newshomepages/extract/hyperlinks.py @@ -21,14 +21,14 @@ def cli(): @click.option("--language", "language", default=None) @click.option("--bundle", "bundle", default=None) @click.option("--days", "days", default="90") -@click.option("--output-path", "output_path", default=None) +@click.option("-o", "--output-path", "output_path", default=None) def hyperlinks( site: typing.Optional[str] = None, country: typing.Optional[str] = None, language: typing.Optional[str] = None, bundle: typing.Optional[str] = None, days: typing.Optional[str] = None, - output_path: typing.Optional[str] = None, + output_path: typing.Optional[typing.Any] = None, ): """Download and parse the provided site's hyperlinks files.""" # Get all lighthouse files @@ -84,8 +84,9 @@ def _parse_hyperlinks(row): # Write out the file if output_path is None: - output_path_obj = utils.ANALYSIS_DIR / f"{slug}-hyperlinks.csv" + output_path = f"{slug}-hyperlinks.csv" else: - output_path_obj = pathlib.Path(output_path) - print(f":pencil: Writing {len(flat_df)} rows to {output_path_obj}") - flat_df.to_csv(output_path_obj, index=False) + output_path = pathlib.Path(output_path) + output_path.parent.mkdir(parents=True, exist_ok=True) + print(f":pencil: Writing {len(flat_df)} rows to {output_path}") + flat_df.to_csv(output_path, index=False) diff --git a/newshomepages/extract/lighthouse.py b/newshomepages/extract/lighthouse.py index 615ff392bd0..094b67e0792 100644 --- a/newshomepages/extract/lighthouse.py +++ b/newshomepages/extract/lighthouse.py @@ -21,14 +21,14 @@ def cli(): @click.option("--language", "language", default=None) @click.option("--bundle", "bundle", default=None) @click.option("--days", "days", default=None) -@click.option("--output-path", "output_path", default=None) +@click.option("-o", "--output-path", "output_path", default=None) def lighthouse( site: typing.Optional[str] = None, country: typing.Optional[str] = None, language: typing.Optional[str] = None, bundle: typing.Optional[str] = None, days: typing.Optional[str] = None, - output_path: typing.Optional[str] = None, + output_path: typing.Optional[typing.Any] = None, ): """Download and parse the provided site's Lighthouse files.""" # Get all lighthouse files @@ -123,8 +123,9 @@ def _parse_metrics(row): # Write out the file if output_path is None: - output_path_obj = utils.ANALYSIS_DIR / f"{slug}-lighthouse.csv" + output_path = f"{slug}-lighthouse.csv" else: - output_path_obj = pathlib.Path(output_path) - print(f":pencil: Writing {len(trimmed_df)} rows to {output_path_obj}") - trimmed_df.to_csv(output_path_obj, index=False) + output_path = pathlib.Path(output_path) + output_path.parent.mkdir(parents=True, exist_ok=True) + print(f":pencil: Writing {len(trimmed_df)} rows to {output_path}") + trimmed_df.to_csv(output_path, index=False) diff --git a/newshomepages/extract/utils.py b/newshomepages/extract/utils.py index 3a5a77fa9f7..b678e226088 100644 --- a/newshomepages/extract/utils.py +++ b/newshomepages/extract/utils.py @@ -2,8 +2,6 @@ from urllib.parse import urlparse import pandas as pd -import requests -from requests.adapters import HTTPAdapter, Retry from rich import print from .. import utils @@ -21,12 +19,7 @@ def _get_json_url(url): return pd.read_json(output_path) else: # Get the URL - print(f":link: Downloading {url}") - s = requests.Session() - retries = Retry(total=3, backoff_factor=1) - s.mount("https://", HTTPAdapter(max_retries=retries)) - r = s.get(url) - data = r.json() + data = utils.get_json_url(url) # Parse as a dataframe df = pd.DataFrame(data) diff --git a/newshomepages/extract/wayback.py b/newshomepages/extract/wayback.py index afcc1d18682..5970352a82b 100644 --- a/newshomepages/extract/wayback.py +++ b/newshomepages/extract/wayback.py @@ -29,7 +29,7 @@ def wayback(handle): print(f"{len(site_df)} wayback files found") # Read in the output file - output_path = utils.ANALYSIS_DIR / f"{handle.lower()}-wayback.csv" + output_path = utils.THIS_DIR / f"{handle.lower()}-wayback.csv" try: output_df = pd.read_csv(output_path) downloaded_files = set(output_df.file_url.unique()) diff --git a/newshomepages/utils.py b/newshomepages/utils.py index b05de0273db..afdb1aa55ba 100644 --- a/newshomepages/utils.py +++ b/newshomepages/utils.py @@ -16,6 +16,7 @@ import tldextract from playwright.sync_api._generated import BrowserContext, Playwright from retry import retry +from rich import print # Set paths for key files THIS_DIR = Path(__file__).parent.absolute() @@ -26,7 +27,6 @@ EXTRACT_DIR = THIS_DIR.parent / "extracts" NOTEBOOKS_DIR = THIS_DIR.parent / "notebooks" SITE_DIR = THIS_DIR.parent / "_site" -ANALYSIS_DIR = THIS_DIR.parent / "_analysis" # Regular expressions LEADING_UNDERSCORES = re.compile("^(_+)") @@ -129,10 +129,13 @@ def parse_archive_artifact(url_list: typing.List) -> typing.Dict: return d +@retry(tries=3, delay=15, backoff=2) def get_extract_df(name: str, **kwargs) -> pd.DataFrame: """Read in the requests extracts CSV as a dataframe.""" - base_url = "https://news-homepages.s3.us-west-1.amazonaws.com/extracts/csv/" - return pd.read_csv(f"{base_url}{name}", **kwargs) + base_url = "https://archive.org/download/news-homepages-extracts/" + url = f"{base_url}{name}" + print(f"Fetching {url}") + return pd.read_csv(url, **kwargs) def get_user_agent() -> str: @@ -404,10 +407,13 @@ def get_wayback_df() -> pd.DataFrame: return _get_extract_files_df("wayback-files.csv") +@retry(tries=3, delay=15, backoff=2) def _get_extract_files_df(name) -> pd.DataFrame: - base_url = "https://news-homepages.s3.us-west-1.amazonaws.com/extracts/csv/" + base_url = "https://archive.org/download/news-homepages-extracts/" + url = f"{base_url}{name}" + print(f"Fetching {url}") df = pd.read_csv( - f"{base_url}{name}", + url, parse_dates=["mtime"], usecols=[ "identifier",