
Commit

Rework other extract commands to use the archive.org files
palewire committed Dec 4, 2022
1 parent 02264c7 commit ed80bbc
Showing 6 changed files with 28 additions and 27 deletions.
newshomepages/extract/accessibility.py (2 changes: 1 addition & 1 deletion)

@@ -31,7 +31,7 @@ def accessibility(handle):
     print(f"{len(site_df)} accessibility files found")
 
     # Read in the output file
-    output_path = utils.ANALYSIS_DIR / f"{handle.lower()}-accessibility.csv"
+    output_path = utils.THIS_DIR / f"{handle.lower()}-accessibility.csv"
     try:
         output_df = pd.read_csv(output_path)
         downloaded_files = set(output_df.file_url.unique())
newshomepages/extract/hyperlinks.py (13 changes: 7 additions & 6 deletions)

@@ -21,14 +21,14 @@ def cli():
 @click.option("--language", "language", default=None)
 @click.option("--bundle", "bundle", default=None)
 @click.option("--days", "days", default="90")
-@click.option("--output-path", "output_path", default=None)
+@click.option("-o", "--output-path", "output_path", default=None)
 def hyperlinks(
     site: typing.Optional[str] = None,
     country: typing.Optional[str] = None,
     language: typing.Optional[str] = None,
     bundle: typing.Optional[str] = None,
     days: typing.Optional[str] = None,
-    output_path: typing.Optional[str] = None,
+    output_path: typing.Optional[typing.Any] = None,
 ):
     """Download and parse the provided site's hyperlinks files."""
     # Get all lighthouse files

@@ -84,8 +84,9 @@ def _parse_hyperlinks(row):
 
     # Write out the file
     if output_path is None:
-        output_path_obj = utils.ANALYSIS_DIR / f"{slug}-hyperlinks.csv"
+        output_path = f"{slug}-hyperlinks.csv"
     else:
-        output_path_obj = pathlib.Path(output_path)
-    print(f":pencil: Writing {len(flat_df)} rows to {output_path_obj}")
-    flat_df.to_csv(output_path_obj, index=False)
+        output_path = pathlib.Path(output_path)
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+    print(f":pencil: Writing {len(flat_df)} rows to {output_path}")
+    flat_df.to_csv(output_path, index=False)
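
The reworked output handling in hyperlinks.py (and lighthouse.py below) defaults to a slug-named CSV in the current working directory and only creates parent directories when the caller passes an explicit --output-path. A minimal standalone sketch of that pattern; the helper name and sample dataframe are illustrative and not part of the repository:

import pathlib
import typing

import pandas as pd


def write_extract_csv(
    df: pd.DataFrame,
    slug: str,
    output_path: typing.Optional[str] = None,
) -> pathlib.Path:
    """Write a dataframe the way the reworked extract commands do (illustrative)."""
    if output_path is None:
        # No --output-path: fall back to a slug-named file in the current directory
        path = pathlib.Path(f"{slug}-hyperlinks.csv")
    else:
        # Explicit path: make sure its parent directories exist before writing
        path = pathlib.Path(output_path)
        path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(path, index=False)
    return path


# Writes ./example-hyperlinks.csv
write_extract_csv(pd.DataFrame({"url": ["https://example.com"]}), "example")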
newshomepages/extract/lighthouse.py (13 changes: 7 additions & 6 deletions)

@@ -21,14 +21,14 @@ def cli():
 @click.option("--language", "language", default=None)
 @click.option("--bundle", "bundle", default=None)
 @click.option("--days", "days", default=None)
-@click.option("--output-path", "output_path", default=None)
+@click.option("-o", "--output-path", "output_path", default=None)
 def lighthouse(
     site: typing.Optional[str] = None,
     country: typing.Optional[str] = None,
     language: typing.Optional[str] = None,
     bundle: typing.Optional[str] = None,
     days: typing.Optional[str] = None,
-    output_path: typing.Optional[str] = None,
+    output_path: typing.Optional[typing.Any] = None,
 ):
     """Download and parse the provided site's Lighthouse files."""
     # Get all lighthouse files

@@ -123,8 +123,9 @@ def _parse_metrics(row):
 
     # Write out the file
     if output_path is None:
-        output_path_obj = utils.ANALYSIS_DIR / f"{slug}-lighthouse.csv"
+        output_path = f"{slug}-lighthouse.csv"
     else:
-        output_path_obj = pathlib.Path(output_path)
-    print(f":pencil: Writing {len(trimmed_df)} rows to {output_path_obj}")
-    trimmed_df.to_csv(output_path_obj, index=False)
+        output_path = pathlib.Path(output_path)
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+    print(f":pencil: Writing {len(trimmed_df)} rows to {output_path}")
+    trimmed_df.to_csv(output_path, index=False)
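
With the short -o flag added above, the command can be exercised in-process with click's test runner. This is only a hypothetical invocation: the console entry point is not shown in this diff, and the --site option is assumed to be defined just above the hunk displayed here.

from click.testing import CliRunner

from newshomepages.extract.lighthouse import lighthouse

runner = CliRunner()
# Assumed options; a real run downloads Lighthouse files before writing the CSV.
result = runner.invoke(
    lighthouse, ["--site", "example", "-o", "output/example-lighthouse.csv"]
)
print(result.exit_code)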
newshomepages/extract/utils.py (9 changes: 1 addition & 8 deletions)

@@ -2,8 +2,6 @@
 from urllib.parse import urlparse
 
 import pandas as pd
-import requests
-from requests.adapters import HTTPAdapter, Retry
 from rich import print
 
 from .. import utils

@@ -21,12 +19,7 @@ def _get_json_url(url):
         return pd.read_json(output_path)
     else:
         # Get the URL
-        print(f":link: Downloading {url}")
-        s = requests.Session()
-        retries = Retry(total=3, backoff_factor=1)
-        s.mount("https://", HTTPAdapter(max_retries=retries))
-        r = s.get(url)
-        data = r.json()
+        data = utils.get_json_url(url)
 
         # Parse as a dataframe
         df = pd.DataFrame(data)
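
The retry-mounted requests session deleted here presumably now lives behind utils.get_json_url in newshomepages/utils.py. That helper is not shown in this commit, so the following is only a sketch that mirrors the removed logic rather than the actual implementation:

import requests
from requests.adapters import HTTPAdapter, Retry
from rich import print


def get_json_url(url: str):
    """Fetch a JSON URL with retries, mirroring the code removed above (sketch)."""
    print(f":link: Downloading {url}")
    s = requests.Session()
    # Retry failed HTTPS requests up to three times with exponential backoff
    retries = Retry(total=3, backoff_factor=1)
    s.mount("https://", HTTPAdapter(max_retries=retries))
    r = s.get(url)
    return r.json()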
newshomepages/extract/wayback.py (2 changes: 1 addition & 1 deletion)

@@ -29,7 +29,7 @@ def wayback(handle):
     print(f"{len(site_df)} wayback files found")
 
     # Read in the output file
-    output_path = utils.ANALYSIS_DIR / f"{handle.lower()}-wayback.csv"
+    output_path = utils.THIS_DIR / f"{handle.lower()}-wayback.csv"
     try:
         output_df = pd.read_csv(output_path)
         downloaded_files = set(output_df.file_url.unique())
newshomepages/utils.py (16 changes: 11 additions & 5 deletions)

@@ -16,6 +16,7 @@
 import tldextract
 from playwright.sync_api._generated import BrowserContext, Playwright
 from retry import retry
+from rich import print
 
 # Set paths for key files
 THIS_DIR = Path(__file__).parent.absolute()

@@ -26,7 +27,6 @@
 EXTRACT_DIR = THIS_DIR.parent / "extracts"
 NOTEBOOKS_DIR = THIS_DIR.parent / "notebooks"
 SITE_DIR = THIS_DIR.parent / "_site"
-ANALYSIS_DIR = THIS_DIR.parent / "_analysis"
 
 # Regular expressions
 LEADING_UNDERSCORES = re.compile("^(_+)")

@@ -129,10 +129,13 @@ def parse_archive_artifact(url_list: typing.List) -> typing.Dict:
     return d
 
 
+@retry(tries=3, delay=15, backoff=2)
 def get_extract_df(name: str, **kwargs) -> pd.DataFrame:
     """Read in the requests extracts CSV as a dataframe."""
-    base_url = "https://news-homepages.s3.us-west-1.amazonaws.com/extracts/csv/"
-    return pd.read_csv(f"{base_url}{name}", **kwargs)
+    base_url = "https://archive.org/download/news-homepages-extracts/"
+    url = f"{base_url}{name}"
+    print(f"Fetching {url}")
+    return pd.read_csv(url, **kwargs)
 
 
 def get_user_agent() -> str:

@@ -404,10 +407,13 @@ def get_wayback_df() -> pd.DataFrame:
     return _get_extract_files_df("wayback-files.csv")
 
 
+@retry(tries=3, delay=15, backoff=2)
 def _get_extract_files_df(name) -> pd.DataFrame:
-    base_url = "https://news-homepages.s3.us-west-1.amazonaws.com/extracts/csv/"
+    base_url = "https://archive.org/download/news-homepages-extracts/"
+    url = f"{base_url}{name}"
+    print(f"Fetching {url}")
     df = pd.read_csv(
-        f"{base_url}{name}",
+        url,
         parse_dates=["mtime"],
         usecols=[
             "identifier",
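
The @retry decorator added to both readers comes from the retry package already imported at the top of utils.py. With tries=3, delay=15 and backoff=2, a failed read of an archive.org CSV is attempted up to three times, sleeping 15 and then 30 seconds between attempts. A small sketch of that behavior, using the wayback-files.csv name that appears in get_wayback_df above; the function name is illustrative:

import pandas as pd
from retry import retry


@retry(tries=3, delay=15, backoff=2)
def read_extract(name: str) -> pd.DataFrame:
    # Retried twice more on failure, waiting 15s and then 30s between attempts.
    url = f"https://archive.org/download/news-homepages-extracts/{name}"
    print(f"Fetching {url}")
    return pd.read_csv(url)


df = read_extract("wayback-files.csv")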
