diff --git a/newshomepages/extract/__init__.py b/newshomepages/extract/__init__.py
new file mode 100644
index 00000000000..35165ad62ae
--- /dev/null
+++ b/newshomepages/extract/__init__.py
@@ -0,0 +1,3 @@
+from .cli import cli_group as cli
+
+__all__ = ("cli",)
diff --git a/newshomepages/extract/__main__.py b/newshomepages/extract/__main__.py
new file mode 100644
index 00000000000..76eebfa5dd6
--- /dev/null
+++ b/newshomepages/extract/__main__.py
@@ -0,0 +1,4 @@
+from .cli import cli_group
+
+if __name__ == "__main__":
+    cli_group()
diff --git a/newshomepages/extract.py b/newshomepages/extract/cli.py
similarity index 85%
rename from newshomepages/extract.py
rename to newshomepages/extract/cli.py
index e132064ddbb..32ff5222bb7 100644
--- a/newshomepages/extract.py
+++ b/newshomepages/extract/cli.py
@@ -2,22 +2,20 @@
 import json
 import os
 import pathlib
-import re
 import time
 import typing
 from datetime import datetime
 from urllib.parse import urlparse
 
 import click
-import internetarchive
 import pandas as pd
 import requests
 from requests.adapters import HTTPAdapter, Retry
-from retry import retry
 from rich import print
 from rich.progress import track
 
-from . import utils
+from .. import utils
+from .items import cli as download_items
 
 IA_ACCESS_KEY = os.getenv("IA_ACCESS_KEY")
 IA_SECRET_KEY = os.getenv("IA_SECRET_KEY")
@@ -32,70 +30,6 @@ def cli():
     pass
 
 
-@cli.command()
-@click.option("-y", "--year", "year", default=CURRENT_YEAR)
-@click.option("--site", "site", default=None)
-@click.option("--country", "country", default=None)
-@click.option("--language", "language", default=None)
-@click.option("--bundle", "bundle", default=None)
-@click.option("--batch", "batch", default=None)
-@click.option("-o", "--output-path", "output_path", default=utils.EXTRACT_DIR / "json")
-@click.option("--wait", "wait", default=5, help="How long to pause between requests")
-def download_items(
-    year: str,
-    site: typing.Optional[str] = None,
-    country: typing.Optional[str] = None,
-    language: typing.Optional[str] = None,
-    bundle: typing.Optional[str] = None,
-    batch: typing.Optional[str] = None,
-    output_path=utils.EXTRACT_DIR / "json",
-    wait: float = 5,
-):
-    """Download the full list of Internet Archive items as JSON."""
-    assert IA_COLLECTION
-
-    @retry(tries=3, delay=30, backoff=2)
-    def _save_item(item):
-        # Save it locally
-        output_obj = pathlib.Path(output_path)
-        output_obj.mkdir(parents=True, exist_ok=True)
-        with open(output_obj / f"{item.identifier}.json", "w") as fh:
-            json.dump(item.item_metadata, fh, indent=2)
-        time.sleep(wait)
-
-    @retry(tries=3, delay=30, backoff=2)
-    def _site_search(s):
-        s = s["handle"].lower()
-        # Replace any leading underscores, which don't work on archive.org
-        s = re.sub("^(_+)", "", s)
-        search = f"collection:{IA_COLLECTION} AND identifier:({s}-{year})"
-        return internetarchive.search_items(search).iter_as_items()
-
-    # If the user has provided a way to filter to a subset of sites, pull em out
-    if site:
-        site_list = [utils.get_site(site)]
-    elif country:
-        site_list = utils.get_sites_in_country(country)
-    elif language:
-        site_list = utils.get_sites_in_language(language)
-    elif bundle:
-        site_list = utils.get_sites_in_bundle(bundle)
-    elif batch:
-        site_list = utils.get_sites_in_batch(int(batch))
-    else:
-        site_list = None
-
-    # If we're filtering go get those
-    if site_list:
-        for obj in track(site_list):
-            [_save_item(i) for i in _site_search(obj)]
-    # Otherwise, go get all items in the collection from this year
-    else:
-        search = f"collection:{IA_COLLECTION} AND identifier:(*-{year})"
-        item_list = internetarchive.search_items(search).iter_as_items()
-        [_save_item(i) for i in item_list]
-
-
 @cli.command()
 @click.argument("handle")
 def download_accessibility(handle):
@@ -547,5 +481,7 @@ def _get_json_url(url):
     return df
 
 
+cli_group = click.CommandCollection(sources=[cli, download_items])
+
 if __name__ == "__main__":
-    cli()
+    cli_group()
diff --git a/newshomepages/extract/items.py b/newshomepages/extract/items.py
new file mode 100644
index 00000000000..2ba5a1fb631
--- /dev/null
+++ b/newshomepages/extract/items.py
@@ -0,0 +1,107 @@
+import os
+import time
+import typing
+from datetime import datetime
+from pathlib import Path
+
+import click
+import internetarchive
+from retry import retry
+from rich import print
+
+from .. import utils
+
+IA_COLLECTION = os.getenv("IA_COLLECTION")
+
+
+@click.group()
+def cli():
+    """Download items from our archive.org collection as JSON."""
+    pass
+
+
+@cli.command()
+@click.option("-y", "--year", "year", default=None)
+@click.option("--site", "site", default=None)
+@click.option("--country", "country", default=None)
+@click.option("--language", "language", default=None)
+@click.option("--bundle", "bundle", default=None)
+@click.option("--batch", "batch", default=None)
+@click.option("-o", "--output-path", "output_path", default="./")
+@click.option("--wait", "wait", default="0", help="How long to pause between requests")
+def items(
+    year: typing.Optional[typing.Any] = None,
+    site: typing.Optional[str] = None,
+    country: typing.Optional[str] = None,
+    language: typing.Optional[str] = None,
+    bundle: typing.Optional[str] = None,
+    batch: typing.Optional[str] = None,
+    output_path: str = "./",
+    wait: typing.Any = "0",
+):
+    """Download items from our archive.org collection as JSON."""
+    # Set some variables for later
+    assert IA_COLLECTION
+    wait = float(wait)
+    if year:
+        year = int(year)
+    else:
+        year = datetime.now().year
+
+    @retry(tries=3, delay=30, backoff=2)
+    def _save_item(item):
+        """Save an item as JSON to disk."""
+        utils.write_json(
+            item.item_metadata, Path(output_path) / f"{item.identifier}.json"
+        )
+        if wait:
+            print(f"Waiting {wait} seconds")
+            time.sleep(wait)
+
+    @retry(tries=3, delay=30, backoff=2)
+    def _site_search(s):
+        """Search archive.org for items to download."""
+        s = utils.safe_ia_handle(s["handle"])
+        search = f"collection:{IA_COLLECTION} AND identifier:({s}-{year})"
+        return internetarchive.search_items(search).iter_as_items()
+
+    # If the user has provided a way to filter to a subset of sites, pull em out
+    if site:
+        site_list = [utils.get_site(site)]
+    elif country:
+        print(
+            f"Downloading items for country `{country}` from archive.org collection `{IA_COLLECTION}`"
+        )
+        site_list = utils.get_sites_in_country(country)
+    elif language:
+        print(
+            f"Downloading items for language `{language}` from archive.org collection `{IA_COLLECTION}`"
+        )
+        site_list = utils.get_sites_in_language(language)
+    elif bundle:
+        print(
+            f"Downloading items for bundle `{bundle}` from archive.org collection `{IA_COLLECTION}`"
+        )
+        site_list = utils.get_sites_in_bundle(bundle)
+    elif batch:
+        print(
+            f"Downloading items for batch `{batch}` from archive.org collection `{IA_COLLECTION}`"
+        )
+        site_list = utils.get_sites_in_batch(int(batch))
+    else:
+        print(f"Downloading all items from archive.org collection `{IA_COLLECTION}`")
+        site_list = None
+
+    # If we're filtering go get those
+    if site_list:
+        for obj in site_list:
+            print(
+                f"Downloading items for site `{obj['handle']}` from archive.org collection `{IA_COLLECTION}`"
+            )
+            [_save_item(i) for i in _site_search(obj)]
+
+    # Otherwise, go get all items in the collection from this year
+    else:
+        search = f"collection:{IA_COLLECTION} AND identifier:(*-{year})"
+        item_list = internetarchive.search_items(search).iter_as_items()
+        [_save_item(i) for i in item_list]
diff --git a/newshomepages/utils.py b/newshomepages/utils.py
index 017528b3d51..9c08e7de3e5 100644
--- a/newshomepages/utils.py
+++ b/newshomepages/utils.py
@@ -1,4 +1,5 @@
 import csv
+import json
 import re
 import tempfile
 import time
@@ -41,6 +42,14 @@ def safe_ia_handle(s):
     return s
 
 
+def write_json(data: typing.Any, path: Path, indent: int = 2):
+    """Write JSON data to the provided path with the given indent."""
+    path.parent.mkdir(parents=True, exist_ok=True)
+    print(f"📥 Writing JSON to {path}")
+    with open(path, "w") as fh:
+        json.dump(data, fh, indent=indent)
+
+
 def parse_archive_url(url: str):
     """Parse the handle and timestamp from an archive.org URL."""
     o = urlparse(url)
diff --git a/tests/test_extract.py b/tests/test_extract.py
new file mode 100644
index 00000000000..bdad76302e8
--- /dev/null
+++ b/tests/test_extract.py
@@ -0,0 +1,10 @@
+from click.testing import CliRunner
+
+from newshomepages.extract import cli
+
+
+def test_item(tmp_path):
+    """Test a site's item download."""
+    runner = CliRunner()
+    result = runner.invoke(cli, ["items", "--site=latimes", f"-o={tmp_path}"])
+    assert result.exit_code == 0