Skip to content

Commit

Permalink
Add ads.txt download CLI
Browse files Browse the repository at this point in the history
  • Loading branch information
asg017 committed Oct 21, 2023
1 parent 40094c0 commit 7c2ab89
Show file tree
Hide file tree
Showing 2 changed files with 98 additions and 0 deletions.
86 changes: 86 additions & 0 deletions newshomepages/adstxt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
from __future__ import annotations

from pathlib import Path
from urllib.parse import urlparse

import click
import requests
from retry import retry
from rich import print

from . import utils


@click.command()
@click.argument("handle")
@click.option("-o", "--output-dir", "output_dir", default="./")
@click.option("--timeout", "timeout", default=5, type=int)
@click.option("--verbose", "verbose", default=False, is_flag=True)
def cli(handle: str, output_dir: str, timeout: int = 5, verbose: bool = False):
    """Save the raw ads.txt of the provided site.

    The file is written to OUTPUT_DIR as "<safe handle>.ads.txt". When the
    site returns a 404 for ads.txt, a placeholder line is written instead so
    that an output file always exists.
    """
    # Look up the site record for this handle; only its "url" entry is used.
    site = utils.get_site(handle)

    # Fetch the ads.txt; click has already validated the timeout as an int.
    adstxt = _get_adstxt(site["url"], timeout, verbose=verbose)

    if adstxt is None:
        # The site has no ads.txt (404). We do not exit here: a placeholder
        # is written below so the missing file is recorded on disk.
        print(f":robot: No ads.txt for {handle}")
        adstxt = "404: No file found"

    # Set the output path, creating the output directory if necessary
    output_path = Path(output_dir) / f"{utils.safe_ia_handle(handle)}.ads.txt"
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Write it out
    if verbose:
        print(f":robot: Writing {output_path}")
    # Use an explicit encoding so non-ASCII content is written the same way
    # regardless of the platform's default locale.
    with output_path.open("w", encoding="utf-8") as f:
        f.write(adstxt)


@retry(tries=3, delay=15, backoff=2)
def _get_adstxt(
    site_url: str,
    timeout: int = 5,
    verbose: bool = False,
) -> str | None:
    """Get the raw ads.txt for a site.

    Args:
        site_url: URL of the site; only its scheme and host are reused.
        timeout: Timeout, in seconds, passed to the HTTP request.
        verbose: When true, print the URL being fetched and any failure
            message.

    Returns:
        The response body as text, or None when the server answers 404.

    Raises:
        AssertionError: If the response is neither a 404 nor successful.
            Exceptions raised here (including those from the request itself)
            trigger the retry decorator, which makes up to three attempts.
    """
    # ads.txt lives at the root of the host, so drop everything from the
    # site URL except the scheme and network location.
    adstxt_url = (
        urlparse(site_url)
        ._replace(path="/ads.txt", params="", query="", fragment="")
        .geturl()
    )
    if verbose:
        print(f":robot: Fetching {adstxt_url}")

    # Set the headers
    headers = {"User-Agent": utils.get_user_agent()}

    # Make the request
    r = requests.get(adstxt_url, timeout=timeout, headers=headers)

    # A 404 means the site has no ads.txt; this is an expected outcome,
    # so return None rather than raising (and therefore do not retry).
    if r.status_code == 404:
        return None

    # Any other unsuccessful status is an error. This is an explicit check
    # rather than an `assert` statement so it still runs under `python -O`;
    # AssertionError is kept so existing callers see the same exception type.
    if not r.ok:
        msg = f"Request failed with status code {r.status_code}"
        if verbose:
            print(msg)
        raise AssertionError(msg)

    return r.text


# Run the click command when this module is executed as a script.
if __name__ == "__main__":
    cli()
12 changes: 12 additions & 0 deletions tests/test_adstxt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import pytest
from click.testing import CliRunner

from newshomepages import adstxt


@pytest.mark.vcr()
def test_adstxt_cli(tmp_path):
    """Test a single ads.txt request."""
    runner = CliRunner()
    # CLI arguments are strings, so convert the temporary Path explicitly
    # instead of relying on click to coerce it.
    result = runner.invoke(adstxt.cli, ["latimes", "-o", str(tmp_path)])
    # Include the captured output so a failure shows what the CLI printed.
    assert result.exit_code == 0, result.output

0 comments on commit 7c2ab89

Please sign in to comment.