
Commit

Rework other extract commands to use the archive.org files
palewire committed Dec 4, 2022
1 parent 02264c7 commit ed80bbc
Showing 6 changed files with 28 additions and 27 deletions.
newshomepages/extract/accessibility.py (2 changes: 1 addition & 1 deletion)

@@ -31,7 +31,7 @@ def accessibility(handle):
     print(f"{len(site_df)} accessibility files found")
 
     # Read in the output file
-    output_path = utils.ANALYSIS_DIR / f"{handle.lower()}-accessibility.csv"
+    output_path = utils.THIS_DIR / f"{handle.lower()}-accessibility.csv"
     try:
         output_df = pd.read_csv(output_path)
         downloaded_files = set(output_df.file_url.unique())
newshomepages/extract/hyperlinks.py (13 changes: 7 additions & 6 deletions)

@@ -21,14 +21,14 @@ def cli():
 @click.option("--language", "language", default=None)
 @click.option("--bundle", "bundle", default=None)
 @click.option("--days", "days", default="90")
-@click.option("--output-path", "output_path", default=None)
+@click.option("-o", "--output-path", "output_path", default=None)
 def hyperlinks(
     site: typing.Optional[str] = None,
     country: typing.Optional[str] = None,
     language: typing.Optional[str] = None,
     bundle: typing.Optional[str] = None,
     days: typing.Optional[str] = None,
-    output_path: typing.Optional[str] = None,
+    output_path: typing.Optional[typing.Any] = None,
 ):
     """Download and parse the provided site's hyperlinks files."""
     # Get all lighthouse files

@@ -84,8 +84,9 @@ def _parse_hyperlinks(row):
 
     # Write out the file
     if output_path is None:
-        output_path_obj = utils.ANALYSIS_DIR / f"{slug}-hyperlinks.csv"
+        output_path = f"{slug}-hyperlinks.csv"
     else:
-        output_path_obj = pathlib.Path(output_path)
-    print(f":pencil: Writing {len(flat_df)} rows to {output_path_obj}")
-    flat_df.to_csv(output_path_obj, index=False)
+        output_path = pathlib.Path(output_path)
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+    print(f":pencil: Writing {len(flat_df)} rows to {output_path}")
+    flat_df.to_csv(output_path, index=False)
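
The reworked output handling in hyperlinks.py (and lighthouse.py below) defaults to a slug-named CSV in the current working directory and only creates parent directories when the caller passes an explicit --output-path. A minimal standalone sketch of that pattern; the helper name and sample dataframe are illustrative and not part of the repository:

import pathlib
import typing

import pandas as pd


def write_extract_csv(
    df: pd.DataFrame,
    slug: str,
    output_path: typing.Optional[str] = None,
) -> pathlib.Path:
    """Write a dataframe the way the reworked extract commands do (illustrative)."""
    if output_path is None:
        # No --output-path: fall back to a slug-named file in the current directory
        path = pathlib.Path(f"{slug}-hyperlinks.csv")
    else:
        # Explicit path: make sure its parent directories exist before writing
        path = pathlib.Path(output_path)
        path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(path, index=False)
    return path


# Writes ./example-hyperlinks.csv
write_extract_csv(pd.DataFrame({"url": ["https://example.com"]}), "example")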
newshomepages/extract/lighthouse.py (13 changes: 7 additions & 6 deletions)

@@ -21,14 +21,14 @@ def cli():
 @click.option("--language", "language", default=None)
 @click.option("--bundle", "bundle", default=None)
 @click.option("--days", "days", default=None)
-@click.option("--output-path", "output_path", default=None)
+@click.option("-o", "--output-path", "output_path", default=None)
 def lighthouse(
     site: typing.Optional[str] = None,
     country: typing.Optional[str] = None,
     language: typing.Optional[str] = None,
     bundle: typing.Optional[str] = None,
     days: typing.Optional[str] = None,
-    output_path: typing.Optional[str] = None,
+    output_path: typing.Optional[typing.Any] = None,
 ):
     """Download and parse the provided site's Lighthouse files."""
     # Get all lighthouse files

@@ -123,8 +123,9 @@ def _parse_metrics(row):
 
     # Write out the file
     if output_path is None:
-        output_path_obj = utils.ANALYSIS_DIR / f"{slug}-lighthouse.csv"
+        output_path = f"{slug}-lighthouse.csv"
     else:
-        output_path_obj = pathlib.Path(output_path)
-    print(f":pencil: Writing {len(trimmed_df)} rows to {output_path_obj}")
-    trimmed_df.to_csv(output_path_obj, index=False)
+        output_path = pathlib.Path(output_path)
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+    print(f":pencil: Writing {len(trimmed_df)} rows to {output_path}")
+    trimmed_df.to_csv(output_path, index=False)
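
With the short -o flag added above, the command can be exercised in-process with click's test runner. This is only a hypothetical invocation: the console entry point is not shown in this diff, and the --site option is assumed to be defined just above the hunk displayed here.

from click.testing import CliRunner

from newshomepages.extract.lighthouse import lighthouse

runner = CliRunner()
# Assumed options; a real run downloads Lighthouse files before writing the CSV.
result = runner.invoke(
    lighthouse, ["--site", "example", "-o", "output/example-lighthouse.csv"]
)
print(result.exit_code)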
newshomepages/extract/utils.py (9 changes: 1 addition & 8 deletions)

@@ -2,8 +2,6 @@
 from urllib.parse import urlparse
 
 import pandas as pd
-import requests
-from requests.adapters import HTTPAdapter, Retry
 from rich import print
 
 from .. import utils

@@ -21,12 +19,7 @@ def _get_json_url(url):
         return pd.read_json(output_path)
     else:
         # Get the URL
-        print(f":link: Downloading {url}")
-        s = requests.Session()
-        retries = Retry(total=3, backoff_factor=1)
-        s.mount("https://", HTTPAdapter(max_retries=retries))
-        r = s.get(url)
-        data = r.json()
+        data = utils.get_json_url(url)
 
         # Parse as a dataframe
         df = pd.DataFrame(data)
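
The retry-mounted requests session deleted here presumably now lives behind utils.get_json_url in newshomepages/utils.py. That helper is not shown in this commit, so the following is only a sketch that mirrors the removed logic rather than the actual implementation:

import requests
from requests.adapters import HTTPAdapter, Retry
from rich import print


def get_json_url(url: str):
    """Fetch a JSON URL with retries, mirroring the code removed above (sketch)."""
    print(f":link: Downloading {url}")
    s = requests.Session()
    # Retry failed HTTPS requests up to three times with exponential backoff
    retries = Retry(total=3, backoff_factor=1)
    s.mount("https://", HTTPAdapter(max_retries=retries))
    r = s.get(url)
    return r.json()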
newshomepages/extract/wayback.py (2 changes: 1 addition & 1 deletion)

@@ -29,7 +29,7 @@ def wayback(handle):
     print(f"{len(site_df)} wayback files found")
 
     # Read in the output file
-    output_path = utils.ANALYSIS_DIR / f"{handle.lower()}-wayback.csv"
+    output_path = utils.THIS_DIR / f"{handle.lower()}-wayback.csv"
     try:
         output_df = pd.read_csv(output_path)
         downloaded_files = set(output_df.file_url.unique())
newshomepages/utils.py (16 changes: 11 additions & 5 deletions)

@@ -16,6 +16,7 @@
 import tldextract
 from playwright.sync_api._generated import BrowserContext, Playwright
 from retry import retry
+from rich import print
 
 # Set paths for key files
 THIS_DIR = Path(__file__).parent.absolute()

@@ -26,7 +27,6 @@
 EXTRACT_DIR = THIS_DIR.parent / "extracts"
 NOTEBOOKS_DIR = THIS_DIR.parent / "notebooks"
 SITE_DIR = THIS_DIR.parent / "_site"
-ANALYSIS_DIR = THIS_DIR.parent / "_analysis"
 
 # Regular expressions
 LEADING_UNDERSCORES = re.compile("^(_+)")

@@ -129,10 +129,13 @@ def parse_archive_artifact(url_list: typing.List) -> typing.Dict:
     return d
 
 
+@retry(tries=3, delay=15, backoff=2)
 def get_extract_df(name: str, **kwargs) -> pd.DataFrame:
     """Read in the requests extracts CSV as a dataframe."""
-    base_url = "https://news-homepages.s3.us-west-1.amazonaws.com/extracts/csv/"
-    return pd.read_csv(f"{base_url}{name}", **kwargs)
+    base_url = "https://archive.org/download/news-homepages-extracts/"
+    url = f"{base_url}{name}"
+    print(f"Fetching {url}")
+    return pd.read_csv(url, **kwargs)
 
 
 def get_user_agent() -> str:

@@ -404,10 +407,13 @@ def get_wayback_df() -> pd.DataFrame:
     return _get_extract_files_df("wayback-files.csv")
 
 
+@retry(tries=3, delay=15, backoff=2)
 def _get_extract_files_df(name) -> pd.DataFrame:
-    base_url = "https://news-homepages.s3.us-west-1.amazonaws.com/extracts/csv/"
+    base_url = "https://archive.org/download/news-homepages-extracts/"
+    url = f"{base_url}{name}"
+    print(f"Fetching {url}")
     df = pd.read_csv(
-        f"{base_url}{name}",
+        url,
         parse_dates=["mtime"],
         usecols=[
             "identifier",
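
The @retry decorator added to both readers comes from the retry package already imported at the top of utils.py. With tries=3, delay=15 and backoff=2, a failed read of an archive.org CSV is attempted up to three times, sleeping 15 and then 30 seconds between attempts. A small sketch of that behavior, using the wayback-files.csv name that appears in get_wayback_df above; the function name is illustrative:

import pandas as pd
from retry import retry


@retry(tries=3, delay=15, backoff=2)
def read_extract(name: str) -> pd.DataFrame:
    # Retried twice more on failure, waiting 15s and then 30s between attempts.
    url = f"https://archive.org/download/news-homepages-extracts/{name}"
    print(f"Fetching {url}")
    return pd.read_csv(url)


df = read_extract("wayback-files.csv")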
