Generate handle on the fly from the twitter column using our safe function (#422)

* Generate handle on the fly from the twitter column using our safe function
* Lowercase the twitter handles and resort the sheet
palewire · Oct 27, 2023 · 654d9dd · 654d9dd
1 parent cac62d4
commit 654d9dd
Show file tree

Hide file tree

Showing 35 changed files with 743 additions and 762 deletions.
diff --git a/_site/_templates/accessibility.md.tmpl b/_site/_templates/accessibility.md.tmpl
@@ -155,7 +155,7 @@ Here's a ranking of all sites from best to worst.
   <tr>
     <td>{{ obj.accessibility_rank }}</td>
     <td>
-        <a href="https://palewi.re/docs/news-homepages/sites/{{ obj.handle.lower() }}.html">
+        <a href="https://palewi.re/docs/news-homepages/sites/{{ obj.handle }}.html">
             {{ obj.name }}
         </a>
     </td>

diff --git a/_site/_templates/bundle_detail.md.tmpl b/_site/_templates/bundle_detail.md.tmpl
@@ -12,12 +12,12 @@ The most recent homepages from {{ site_list|length }} news sites in this bundle.
 <div class="latest-parent">
 {% for obj in site_list %}
 <div class="latest-child">
- <a href="https://palewi.re/docs/news-homepages/sites/{{ obj.handle.lower() }}.html">
-  <img src="https://archive.org/download/latest-homepages/{{ obj.handle|lower }}.jpg"
+ <a href="https://palewi.re/docs/news-homepages/sites/{{ obj.handle }}.html">
+  <img src="https://archive.org/download/latest-homepages/{{ obj.handle }}.jpg"
      alt="{{ obj.name }}"
      loading="lazy">
  </a>
- <p><a href="../sites/{{ obj.handle.lower() }}.html">{{ obj.name }}</a></p>
+ <p><a href="../sites/{{ obj.handle }}.html">{{ obj.name }}</a></p>
 </div>
 {% endfor %}
 </div>
@@ -36,5 +36,5 @@ The most recent homepages from {{ site_list|length }} news sites in this bundle.
 | Site  | Latest screenshot | RSS |
 | :---- | :---------------: | :-- |
 {% for obj in site_list -%}
-|[{{ obj.name }}](https://palewi.re/docs/news-homepages/sites/{{ obj.handle.lower() }}.html)|[🔗](https://raw.githubusercontent.com/palewire/news-homepages/main/latest-screenshots/{{ obj.handle|lower }}.jpg)|[🔗](https://palewi.re/docs/news-homepages/rss/sites/{{ obj.handle.lower() }}.xml)|
+|[{{ obj.name }}](https://palewi.re/docs/news-homepages/sites/{{ obj.handle }}.html)|[🔗](https://raw.githubusercontent.com/palewire/news-homepages/main/latest-screenshots/{{ obj.handle }}.jpg)|[🔗](https://palewi.re/docs/news-homepages/rss/sites/{{ obj.handle }}.xml)|
 {% endfor %}
diff --git a/_site/_templates/country_detail.md.tmpl b/_site/_templates/country_detail.md.tmpl
@@ -12,12 +12,12 @@ The most recent homepages from {{ site_list|length }} news sites in this country
 <div class="latest-parent">
 {% for obj in site_list %}
 <div class="latest-child">
- <a href="https://palewi.re/docs/news-homepages/sites/{{ obj.handle.lower() }}.html">
-  <img src="https://palewi.re/news-homepages/latest-screenshots/{{ obj.handle|lower }}.jpg"
+ <a href="https://palewi.re/docs/news-homepages/sites/{{ obj.handle }}.html">
+  <img src="https://palewi.re/news-homepages/latest-screenshots/{{ obj.handle }}.jpg"
      alt="{{ obj.name }}"
      loading="lazy">
  </a>
- <p><a href="../sites/{{ obj.handle.lower() }}.html">{{ obj.name }}</a></p>
+ <p><a href="../sites/{{ obj.handle }}.html">{{ obj.name }}</a></p>
 </div>
 {% endfor %}
 </div>
@@ -35,5 +35,5 @@ The most recent homepages from {{ site_list|length }} news sites in this country
 | Site  | Latest screenshot | RSS |
 | :---- | :---------------: | :-- |
 {% for obj in site_list -%}
-|[{{ obj.name }}](https://palewi.re/docs/news-homepages/sites/{{ obj.handle.lower() }}.html)|[🔗](https://raw.githubusercontent.com/palewire/news-homepages/main/latest-screenshots/{{ obj.handle|lower }}.jpg)|[🔗](https://palewi.re/docs/news-homepages/rss/sites/{{ obj.handle.lower() }}.xml)|
+|[{{ obj.name }}](https://palewi.re/docs/news-homepages/sites/{{ obj.handle }}.html)|[🔗](https://raw.githubusercontent.com/palewire/news-homepages/main/latest-screenshots/{{ obj.handle }}.jpg)|[🔗](https://palewi.re/docs/news-homepages/rss/sites/{{ obj.handle }}.xml)|
 {% endfor %}
diff --git a/_site/_templates/language_detail.md.tmpl b/_site/_templates/language_detail.md.tmpl
@@ -12,12 +12,12 @@ The most recent homepages from {{ site_list|length }} news sites in this languag
 <div class="latest-parent">
 {% for obj in site_list %}
 <div class="latest-child">
- <a href="https://palewi.re/docs/news-homepages/sites/{{ obj.handle.lower() }}.html">
-  <img src="https://palewi.re/news-homepages/latest-screenshots/{{ obj.handle|lower }}.jpg"
+ <a href="https://palewi.re/docs/news-homepages/sites/{{ obj.handle }}.html">
+  <img src="https://palewi.re/news-homepages/latest-screenshots/{{ obj.handle }}.jpg"
      alt="{{ obj.name }}"
      loading="lazy">
  </a>
- <p><a href="../sites/{{ obj.handle.lower() }}.html">{{ obj.name }}</a></p>
+ <p><a href="../sites/{{ obj.handle }}.html">{{ obj.name }}</a></p>
 </div>
 {% endfor %}
 </div>
@@ -33,5 +33,5 @@ The most recent homepages from {{ site_list|length }} news sites in this languag
 | Site  | Latest screenshot | RSS |
 | :---- | :---------------: | :-- |
 {% for obj in site_list -%}
-|[{{ obj.name }}](https://palewi.re/docs/news-homepages/sites/{{ obj.handle.lower() }}.html)|[🔗](https://raw.githubusercontent.com/palewire/news-homepages/main/latest-screenshots/{{ obj.handle|lower }}.jpg)|[🔗](https://palewi.re/docs/news-homepages/rss/sites/{{ obj.handle.lower() }}.xml)|
+|[{{ obj.name }}](https://palewi.re/docs/news-homepages/sites/{{ obj.handle }}.html)|[🔗](https://raw.githubusercontent.com/palewire/news-homepages/main/latest-screenshots/{{ obj.handle }}.jpg)|[🔗](https://palewi.re/docs/news-homepages/rss/sites/{{ obj.handle }}.xml)|
 {% endfor %}
diff --git a/_site/_templates/latest.md.tmpl b/_site/_templates/latest.md.tmpl
@@ -8,12 +8,12 @@ The most recent homepages captured from {{ site_list|length }} news sites.
 <div class="latest-parent">
 {% for obj in site_list %}
 <div class="latest-child">
- <a href="https://palewi.re/docs/news-homepages/sites/{{ obj.handle.lower() }}.html">
-  <img src="https://archive.org/download/latest-homepages/{{ obj.handle|lower }}.jpg"
+ <a href="https://palewi.re/docs/news-homepages/sites/{{ obj.handle }}.html">
+  <img src="https://archive.org/download/latest-homepages/{{ obj.handle }}.jpg"
      alt="{{ obj.name }}"
      loading="lazy">
  </a>
- <p><a href="./sites/{{ obj.handle.lower() }}.html">{{ obj.name }}</a></p>
+ <p><a href="./sites/{{ obj.handle }}.html">{{ obj.name }}</a></p>
 </div>
 {% endfor %}
 </div>
diff --git a/_site/_templates/performance.md.tmpl b/_site/_templates/performance.md.tmpl
@@ -155,7 +155,7 @@ Here's a ranking of all sites from best to worst:
   <tr>
     <td>{{ obj.performance_rank }}</td>
     <td>
-        <a href="./sites/{{ obj.handle.lower() }}.html">
+        <a href="./sites/{{ obj.handle }}.html">
             {{ obj.name }}
         </a>
     </td>

diff --git a/_site/_templates/sites.opml.tmpl b/_site/_templates/sites.opml.tmpl
@@ -6,7 +6,7 @@
         <outline title="Sites" text="Sites">
             <outline text="All sites" title="All sites" type="rss" xmlUrl="https://palewi.re/docs/news-homepages/rss/sites/all.xml"/>
             {%- for site in site_list -%}
-            <outline text="{{ site.name }}" title="{{ site.name }}" type="rss" xmlUrl="https://palewi.re/docs/news-homepages/rss/sites/{{ site.handle.lower() }}.xml"/>
+            <outline text="{{ site.name }}" title="{{ site.name }}" type="rss" xmlUrl="https://palewi.re/docs/news-homepages/rss/sites/{{ site.handle }}.xml"/>
             {% endfor %}
         </outline>
     </body>

diff --git a/_site/_templates/sources.md.tmpl b/_site/_templates/sources.md.tmpl
@@ -10,5 +10,5 @@ A full RSS feed is at [all.xml](https://palewi.re/docs/news-homepages/rss/sites/
 | Site  | Latest screenshot | RSS |
 | :---- | :---------------: | :-- |
 {% for obj in site_list -%}
-|[{{ obj.name }}](https://palewi.re/docs/news-homepages/sites/{{ obj.handle.lower() }}.html)|[🔗](https://raw.githubusercontent.com/palewire/news-homepages/main/latest-screenshots/{{ obj.handle|lower }}.jpg)|[🔗](https://palewi.re/docs/news-homepages/rss/sites/{{ obj.handle.lower() }}.xml)|
+|[{{ obj.name }}](https://palewi.re/docs/news-homepages/sites/{{ obj.handle }}.html)|[🔗](https://raw.githubusercontent.com/palewire/news-homepages/main/latest-screenshots/{{ obj.handle }}.jpg)|[🔗](https://palewi.re/docs/news-homepages/rss/sites/{{ obj.handle }}.xml)|
 {% endfor %}
diff --git a/_site/extracts.md b/_site/extracts.md
@@ -18,7 +18,7 @@ URL: [archive.org/download/news-homepages-extracts/sites.csv](https://archive.or
 
 Field | Description
 :---- | :----------
-`handle` | The Twitter handle of the outlet. A unique identifier
+`handle` | The unique handle of the outlet. A unique identifier
 `name` | The name of the outlet
 `url` | The URL of the homepage
 `location` | The city where the site is based
@@ -59,7 +59,7 @@ URL: [archive.org/download/news-homepages-extracts/items.csv](https://archive.or
 Field | Description
 :---- | :----------
 `identifier` | The unique identifier created by Internet Archive
-`handle` | The Twitter handle of the outlet. Can be used to merge with other files
+`handle` | The unique handle of the outlet. Can be used to merge with other files
 `file_name` | The name of the file on [GitHub](https://github.com/palewire/news-homepages/tree/main/extracts/json) where the Internet Archive metadata is stored.
 `url` | The URL on archive.org where you can find the item
 `title` | The title of the item on Internet Archive
@@ -76,7 +76,7 @@ URL: [archive.org/download/news-homepages-extracts/screenshot-files.csv](https:/
 Field | Description
 :---- | :----------
 `identifier` | The unique identifier created by Internet Archive
-`handle` | The Twitter handle of the outlet. Can be used to merge with other files
+`handle` | The unique handle of the outlet. Can be used to merge with other files
 `file_name` | The name of the file in the Internet Archive
 `url` | The URL of the file
 `mtime` | The time the file was last modified by the Internet Archive in UTC time
@@ -94,7 +94,7 @@ URL: [archive.org/download/news-homepages-extracts/accessibility-files.csv](http
 Field | Description
 :---- | :----------
 `identifier` | The unique identifier created by Internet Archive
-`handle` | The Twitter handle of the outlet. Can be used to merge with other files
+`handle` | The unique handle of the outlet. Can be used to merge with other files
 `file_name` | The name of the file in the Internet Archive
 `url` | The URL of the file
 `mtime` | The time the file was last modified by the Internet Archive in UTC time
@@ -111,7 +111,7 @@ URL: [archive.org/download/news-homepages-extracts/hyperlink-files.csv](https://
 Field | Description
 :---- | :----------
 `identifier` | The unique identifier created by Internet Archive
-`handle` | The Twitter handle of the outlet. Can be used to merge with other files
+`handle` | The unique handle of the outlet. Can be used to merge with other files
 `file_name` | The name of the file in the Internet Archive
 `url` | The URL of the file
 `mtime` | The time the file was last modified by the Internet Archive in UTC time
@@ -128,7 +128,7 @@ URL: [archive.org/download/news-homepages-extracts/lighthouse-files.csv](https:/
 Field | Description
 :---- | :----------
 `identifier` | The unique identifier created by Internet Archive
-`handle` | The Twitter handle of the outlet. Can be used to merge with other files
+`handle` | The unique handle of the outlet. Can be used to merge with other files
 `file_name` | The name of the file in the Internet Archive
 `url` | The URL of the file
 `mtime` | The time the file was last modified by the Internet Archive in UTC time
@@ -145,7 +145,7 @@ URL: [archive.org/download/news-homepages-extracts/lighthouse-sample.csv](https:
 Field | Description
 :---- | :----------
 `identifier` | The unique identifier created by Internet Archive
-`handle` | The Twitter handle of the outlet. Can be used to merge with other files
+`handle` | The unique handle of the outlet. Can be used to merge with other files
 `file_name` | The name of the file in the Internet Archive
 `date` | The datetime when the audit was captured
 `performance` | Lighthouse's [performance](https://developer.chrome.com/docs/lighthouse/performance/) metric score
@@ -162,7 +162,7 @@ URL: [archive.org/download/news-homepages-extracts/lighthouse-analysis.csv](http
 
 Field | Description
 :---- | :----------
-`handle` | The Twitter handle of the outlet. Can be used to merge with other files
+`handle` | The unique handle of the outlet. Can be used to merge with other files
 `performance_count` | The number of Lighthouse [performance](https://developer.chrome.com/docs/lighthouse/performance/) metric observations
 `performance_median` | The median performance metric score
 `performance_mean` | The average performance metric score
@@ -205,7 +205,7 @@ URL: [archive.org/download/news-homepages-extracts/robotstxt-files.csv](https://
 Field | Description
 :---- | :----------
 `identifier` |The unique identifier for the Internet Archive item
-`handle` | The Twitter handle of the outlet. Can be used to merge with other files
+`handle` | The unique handle of the outlet. Can be used to merge with other files
 `file_name` | The name of the file in the Internet Archive item
 `url` | The URL of the file
 `mtime` | The time the file was last modified by the Internet Archive in UTC time
@@ -222,7 +222,7 @@ URL: [archive.org/download/news-homepages-extracts/robotstxt-sample.csv](https:/
 Field | Description
 :---- | :----------
 `identifier` | The unique identifier for the Internet Archive item
-`handle` | The Twitter handle of the outlet. Can be used to merge with other files
+`handle` | The unique handle of the outlet. Can be used to merge with other files
 `date` | The date when the file was captured
 `url` | The URL to archived file on archive.org
 `user_agent` | A user agent declared in the robots.txt file
@@ -237,7 +237,7 @@ URL: [archive.org/download/news-homepages-extracts/wayback-files.csv](https://ar
 Field | Description
 :---- | :----------
 `identifier` | The unique identifier created by Internet Archive
-`handle` | The Twitter handle of the outlet. Can be used to merge with other files
+`handle` | The unique handle of the outlet. Can be used to merge with other files
 `file_name` | The name of the file in the Internet Archive
 `url` | The URL of the file
 `mtime` | The time the file was last modified by the Internet Archive in UTC time
@@ -254,7 +254,7 @@ URL: [archive.org/download/news-homepages-extracts/drudge-hyperlinks-sample.csv]
 Field | Description
 :---- | :----------
 `identifier` | The unique identifier created by Internet Archive
-`handle` | The Twitter handle of the outlet. Can be used to merge with other files
+`handle` | The unique handle of the outlet. Can be used to merge with other files
 `file_name` | The name of the file in the Internet Archive
 `date` | The datetime when the hyperlink was captured
 `text` | The text of the hyperlink

diff --git a/_site/gettingstarted.md b/_site/gettingstarted.md
@@ -28,7 +28,7 @@ Install Chrome for our web scraper.
 pipenv run playwright install --with-deps chromium
 ```
 
-You're ready to work. Try a screenshot with the `screenshot.py` command. As with other commands, it expects you pass in the Twitter handle of the target site. The supported sites are listed in [`newshomepages/sources/sites.csv`](https://github.com/palewire/news-homepages/blob/main/newshomepages/sources/sites.csv). We use them as a unique identifier across the project.
+You're ready to work. Try a screenshot with the `screenshot.py` command. As with other commands, it expects you pass in the unique handle of the target site. The supported sites are listed in [`newshomepages/sources/sites.csv`](https://github.com/palewire/news-homepages/blob/main/newshomepages/sources/sites.csv). We use them as a unique identifier across the project.
 
 ```bash
 pipenv run python -m newshomepages.screenshot latimes

diff --git a/_site/slack.md b/_site/slack.md
@@ -63,7 +63,7 @@ jobs:
           webhook-url: ${{ secrets.SLACK_WEBHOOK_URL_YOUR_SITE_HANDLE_IN_CAPS }}
 ```
 
-You should tailor the `name` setting to match your site. There are two `handle` inputs in the jobs section that should be edited to match your site's Twitter handle.
+You should tailor the `name` setting to match your site. There are two `handle` inputs in the jobs section that should be edited to match your site's unique handle.
 
 You'll also want to configure the `cron` setting to match whatever schedule you'd like the archiver to run on. If you're unfamiliar with the syntax, [GitHub's official documentation](https://futurestud.io/tutorials/github-actions-trigger-builds-on-schedule-cron) offers some guidance.
 

diff --git a/newshomepages/accessibility.py b/newshomepages/accessibility.py
@@ -23,7 +23,7 @@ def cli(handle, output_dir, verbose=False):
     site = utils.get_site(handle)
 
     # Set the output path
-    output_path = Path(output_dir) / f"{site['handle'].lower()}.accessibility.json"
+    output_path = Path(output_dir) / f"{site['handle']}.accessibility.json"
     output_path.parent.mkdir(parents=True, exist_ok=True)
 
     # Do the thing

diff --git a/newshomepages/adstxt.py b/newshomepages/adstxt.py
@@ -26,11 +26,11 @@ def cli(handle: str, output_dir: str, timeout: str = "5", verbose: bool = False)
 
     if adstxt is None:
         # If there is no ads.txt, we drop out now
-        print(f":robot: No ads.txt for {handle}")
+        print(f":robot: No ads.txt for {site['handle']}")
         adstxt = "404: No file found"
 
     # Set the output path
-    output_path = Path(output_dir) / f"{utils.safe_ia_handle(handle)}.ads.txt"
+    output_path = Path(output_dir) / f"{site['handle']}.ads.txt"
     output_path.parent.mkdir(parents=True, exist_ok=True)
 
     # Write it out

diff --git a/newshomepages/archive.py b/newshomepages/archive.py
@@ -67,7 +67,7 @@ def cli(
     # Upload each file into an "item" keyed to the site's handle and year
     handle = data["handle"]
     local_now = utils.get_local_time(data)
-    site_identifier = f"{utils.safe_ia_handle(handle)}-{local_now.strftime('%Y')}"
+    site_identifier = f"{handle}-{local_now.strftime('%Y')}"
     site_metadata = _get_item_metadata(data)
     print(
         f"📚 Saving timestamped `{handle}` assets to archive.org `{IA_COLLECTION}` collection's `{site_identifier}`"
@@ -90,7 +90,7 @@ def cli(
     if not latest:
         return
 
-    image_path = input_path / f"{utils.safe_ia_handle(handle)}.jpg"
+    image_path = input_path / f"{handle}.jpg"
     if not image_path.exists():
         return
 
@@ -105,7 +105,7 @@ def cli(
         publisher="https://homepages.news",
         contributor="https://homepages.news",
     )
-    latest_dict = {f"{utils.safe_ia_handle(handle)}.jpg": image_path}
+    latest_dict = {f"{handle}.jpg": image_path}
     _upload(
         data,
         latest_identifier,
@@ -136,7 +136,7 @@ def _get_item_metadata(data: dict) -> dict:
 def _get_file_dict(data: dict, input_dir: Path) -> dict:
     """Get a dictionary of timestamped files to upload to our archive.org collection."""
     # Set the input paths
-    handle = utils.safe_ia_handle(data["handle"])
+    handle = data["handle"]
     image_path = input_dir / f"{handle}.jpg"
     image_fullpage_path = input_dir / f"{handle}.fullpage.jpg"
     a11y_path = input_dir / f"{handle}.accessibility.json"

diff --git a/newshomepages/batch.py b/newshomepages/batch.py
@@ -42,7 +42,7 @@ def sites_by_country(country: str):
 
 def _dump(site_list: typing.List):
     """Print out the provided site list as JSON."""
-    handle_list = [s["handle"].lower() for s in site_list]
+    handle_list = [s["handle"] for s in site_list]
     data = json.dumps(handle_list, indent=2)
     click.echo(data)
 

diff --git a/newshomepages/extract/accessibility.py b/newshomepages/extract/accessibility.py
@@ -25,13 +25,11 @@ def accessibility(handle):
     accessibility_df = utils.get_accessibility_df()
 
     # Filter it down to files for the provided site
-    site_df = accessibility_df[
-        accessibility_df.handle.str.lower() == site["handle"].lower()
-    ]
+    site_df = accessibility_df[accessibility_df.handle == site["handle"]]
     print(f"{len(site_df)} accessibility files found")
 
     # Read in the output file
-    output_path = utils.THIS_DIR / f"{handle.lower()}-accessibility.csv"
+    output_path = utils.THIS_DIR / f"{site['handle']}-accessibility.csv"
     try:
         output_df = pd.read_csv(output_path)
         downloaded_files = set(output_df.file_url.unique())

diff --git a/newshomepages/extract/consolidate.py b/newshomepages/extract/consolidate.py
@@ -92,7 +92,7 @@ def consolidate(
             p
             for p in item_data["files"]
             if (
-                handle.lower() in p["name"].lower()
+                handle in p["name"].lower()
                 and (
                     p["format"] in ["JSON", "JPEG", "HTML"]
                     or p["name"].lower().endswith("ads.txt")

diff --git a/newshomepages/extract/hyperlinks.py b/newshomepages/extract/hyperlinks.py
@@ -48,8 +48,8 @@ def hyperlinks(
         site_list = utils.get_sites_in_bundle(bundle)
         slug = bundle.lower()
 
-    handle_list = [s["handle"].lower() for s in site_list]
-    filtered_df = hyperlink_df[hyperlink_df.handle.str.lower().isin(handle_list)].copy()
+    handle_list = [s["handle"] for s in site_list]
+    filtered_df = hyperlink_df[hyperlink_df.handle.isin(handle_list)].copy()
 
     if days:
         cutoff_date = filtered_df["date"].max() - pd.Timedelta(days=int(days))