
Commit

Start breaking up the big extract command into segments. Refactor the items download a bit. Added a test
palewire committed Nov 19, 2022
1 parent 64a0aee commit 8f57121
Showing 6 changed files with 138 additions and 69 deletions.
3 changes: 3 additions & 0 deletions newshomepages/extract/__init__.py
@@ -0,0 +1,3 @@
from .cli import cli_group as cli

__all__ = ("cli",)
4 changes: 4 additions & 0 deletions newshomepages/extract/__main__.py
@@ -0,0 +1,4 @@
from .cli import cli_group

if __name__ == "__main__":
cli_group()
74 changes: 5 additions & 69 deletions newshomepages/extract.py → newshomepages/extract/cli.py
@@ -2,22 +2,20 @@
import json
import os
import pathlib
import re
import time
import typing
from datetime import datetime
from urllib.parse import urlparse

import click
import internetarchive
import pandas as pd
import requests
from requests.adapters import HTTPAdapter, Retry
from retry import retry
from rich import print
from rich.progress import track

from . import utils
from .. import utils
from .items import cli as download_items

IA_ACCESS_KEY = os.getenv("IA_ACCESS_KEY")
IA_SECRET_KEY = os.getenv("IA_SECRET_KEY")
@@ -32,70 +30,6 @@ def cli():
pass


@cli.command()
@click.option("-y", "--year", "year", default=CURRENT_YEAR)
@click.option("--site", "site", default=None)
@click.option("--country", "country", default=None)
@click.option("--language", "language", default=None)
@click.option("--bundle", "bundle", default=None)
@click.option("--batch", "batch", default=None)
@click.option("-o", "--output-path", "output_path", default=utils.EXTRACT_DIR / "json")
@click.option("--wait", "wait", default=5, help="How long to pause between requests")
def download_items(
year: str,
site: typing.Optional[str] = None,
country: typing.Optional[str] = None,
language: typing.Optional[str] = None,
bundle: typing.Optional[str] = None,
batch: typing.Optional[str] = None,
output_path=utils.EXTRACT_DIR / "json",
wait: float = 5,
):
"""Download the full list of Internet Archive items as JSON."""
assert IA_COLLECTION

@retry(tries=3, delay=30, backoff=2)
def _save_item(item):
# Save it locally
output_obj = pathlib.Path(output_path)
output_obj.mkdir(parents=True, exist_ok=True)
with open(output_obj / f"{item.identifier}.json", "w") as fh:
json.dump(item.item_metadata, fh, indent=2)
time.sleep(wait)

@retry(tries=3, delay=30, backoff=2)
def _site_search(s):
s = s["handle"].lower()
# Replace any leading underscores, which don't work on archive.org
s = re.sub("^(_+)", "", s)
search = f"collection:{IA_COLLECTION} AND identifier:({s}-{year})"
return internetarchive.search_items(search).iter_as_items()

# If the user has provided a way to filter to a subset of sites, pull em out
if site:
site_list = [utils.get_site(site)]
elif country:
site_list = utils.get_sites_in_country(country)
elif language:
site_list = utils.get_sites_in_language(language)
elif bundle:
site_list = utils.get_sites_in_bundle(bundle)
elif batch:
site_list = utils.get_sites_in_batch(int(batch))
else:
site_list = None

# If we're filtering go get those
if site_list:
for obj in track(site_list):
[_save_item(i) for i in _site_search(obj)]
# Otherwise, go get all items in the collection from this year
else:
search = f"collection:{IA_COLLECTION} AND identifier:(*-{year})"
item_list = internetarchive.search_items(search).iter_as_items()
[_save_item(i) for i in item_list]


@cli.command()
@click.argument("handle")
def download_accessibility(handle):
@@ -547,5 +481,7 @@ def _get_json_url(url):
return df


cli_group = click.CommandCollection(sources=[cli, download_items])

if __name__ == "__main__":
cli()
cli_group()
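
For readers unfamiliar with the pattern, `click.CommandCollection` merges the commands of several groups behind a single entry point, which is how the relocated `items` download and the commands remaining in `cli.py` stay available under one CLI. A minimal, self-contained sketch of the idea; the group and command names below are illustrative, not the project's real ones:

```python
import click


@click.group()
def local_commands():
    """Stand-in for the group of commands that stay in cli.py."""


@local_commands.command()
def accessibility():
    """Stand-in for a command kept in the original module."""
    click.echo("running an extract command")


@click.group()
def download_commands():
    """Stand-in for the group split out into its own module."""


@download_commands.command()
def items():
    """Stand-in for the relocated items download."""
    click.echo("downloading items")


# Every command from both groups is exposed under a single CLI, so
# `python sketch.py items` and `python sketch.py accessibility` both resolve.
cli_group = click.CommandCollection(sources=[local_commands, download_commands])

if __name__ == "__main__":
    cli_group()
```

This is also why the test at the bottom of the diff can invoke `items` through the top-level `cli` object imported from `newshomepages.extract`.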
107 changes: 107 additions & 0 deletions newshomepages/extract/items.py
@@ -0,0 +1,107 @@
import os
import time
import typing
from datetime import datetime
from pathlib import Path

import click
import internetarchive
from retry import retry
from rich import print

from .. import utils

IA_COLLECTION = os.getenv("IA_COLLECTION")


@click.group()
def cli():
"""Download items from our archive.org collection as JSON."""
pass


@cli.command()
@click.option("-y", "--year", "year", default=None)
@click.option("--site", "site", default=None)
@click.option("--country", "country", default=None)
@click.option("--language", "language", default=None)
@click.option("--bundle", "bundle", default=None)
@click.option("--batch", "batch", default=None)
@click.option("-o", "--output-path", "output_path", default="./")
@click.option("--wait", "wait", default="0", help="How long to pause between requests")
def items(
year: typing.Optional[typing.Any] = None,
site: typing.Optional[str] = None,
country: typing.Optional[str] = None,
language: typing.Optional[str] = None,
bundle: typing.Optional[str] = None,
batch: typing.Optional[str] = None,
output_path: str = "./",
wait: typing.Any = "0",
):
"""Download items from our archive.org collection as JSON."""
# Set some variables for later
assert IA_COLLECTION
wait = float(wait)
if year:
year = int(year)
else:
year = datetime.now().year

@retry(tries=3, delay=30, backoff=2)
def _save_item(item):
"""Save an item as JSON to disk."""
utils.write_json(
item.item_metadata, Path(output_path) / f"{item.identifier}.json"
)
if wait:
print(f"Waiting {wait} seconds")
time.sleep(wait)

@retry(tries=3, delay=30, backoff=2)
def _site_search(s):
"""Search archive.org for items to download."""
s = utils.safe_ia_handle(s["handle"])
search = f"collection:{IA_COLLECTION} AND identifier:({s}-{year})"
return internetarchive.search_items(search).iter_as_items()

# If the user has provided a way to filter to a subset of sites, pull em out
if site:
site_list = [utils.get_site(site)]
elif country:
print(
f"Downloading items for country `{country}` from archive.org collection `{IA_COLLECTION}`"
)
site_list = utils.get_sites_in_country(country)
elif language:
print(
f"Downloading items for language `{language}` from archive.org collection `{IA_COLLECTION}`"
)
site_list = utils.get_sites_in_language(language)
elif bundle:
print(
f"Downloading items for bundle `{bundle}` from archive.org collection `{IA_COLLECTION}`"
)
site_list = utils.get_sites_in_bundle(bundle)
elif batch:
print(
f"Downloading items for batch `{batch}` from archive.org collection `{IA_COLLECTION}`"
)
site_list = utils.get_sites_in_batch(int(batch))
else:
print(f"Downloading all items from archive.org collection `{IA_COLLECTION}`")
site_list = None

# If we're filtering go get those
if site_list:
for obj in site_list:
print(
f"Downloading items for site `{obj['handle']}` from archive.org collection `{IA_COLLECTION}`"
)
[_save_item(i) for i in _site_search(obj)]

# Otherwise, go get all items in the collection from this year
else:
search = f"collection:{IA_COLLECTION} AND identifier:(*-{year})"
item_list = internetarchive.search_items(search).iter_as_items()
[_save_item(i) for i in item_list]
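
One detail worth calling out: the `@retry(tries=3, delay=30, backoff=2)` decorator on `_save_item` and `_site_search` comes from the `retry` package, so a failed call is attempted up to three times, waiting 30 seconds before the second try and 60 before the third. A self-contained sketch of that behavior, using a made-up download function and much shorter delays:

```python
from retry import retry

attempts = {"count": 0}


@retry(tries=3, delay=1, backoff=2)  # demo delays; the real code waits 30s, then 60s
def flaky_download():
    """Fail twice, then succeed, to show the decorator re-invoking the call."""
    attempts["count"] += 1
    if attempts["count"] < 3:
        raise ConnectionError("temporary archive.org hiccup")
    return "ok"


print(flaky_download())  # returns "ok" on the third attempt
```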
9 changes: 9 additions & 0 deletions newshomepages/utils.py
@@ -1,4 +1,5 @@
import csv
import json
import re
import tempfile
import time
@@ -41,6 +42,14 @@ def safe_ia_handle(s):
return s


def write_json(data: typing.Any, path: Path, indent: int = 2):
"""Write JSON data to the provided path."""
path.parent.mkdir(parents=True, exist_ok=True)
print(f"📥 Writing JSON to {path}")
with open(path, "w") as fh:
json.dump(data, fh, indent=indent)


def parse_archive_url(url: str):
"""Parse the handle and timestamp from an archive.org URL."""
o = urlparse(url)
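
The new `utils.write_json` helper creates any missing parent directories before writing, which is why `_save_item` in `items.py` no longer needs its own `mkdir` call. A quick sketch of how it is used; the payload and path here are made up:

```python
from pathlib import Path

from newshomepages import utils

# Hypothetical metadata and output location; parent folders are created as needed.
metadata = {"identifier": "example-site-2022", "mediatype": "web"}
utils.write_json(metadata, Path("./json/example-site-2022.json"))
```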
10 changes: 10 additions & 0 deletions tests/test_extract.py
@@ -0,0 +1,10 @@
from click.testing import CliRunner

from newshomepages.extract import cli


def test_item(tmp_path):
"""Test a site's item download."""
runner = CliRunner()
result = runner.invoke(cli, ["items", "--site=latimes", f"-o={tmp_path}"])
assert result.exit_code == 0
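
The same `CliRunner` approach can exercise the command's other filters; the country code and wait value below are purely illustrative and may not match the project's site data:

```python
from click.testing import CliRunner

from newshomepages.extract import cli

runner = CliRunner()

# Hypothetical filter values; real country and bundle slugs live in the project's source lists.
result = runner.invoke(cli, ["items", "--country=us", "--wait=1", "-o=./json"])
print(result.exit_code, result.output)
```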
