Simple Twitter Archive Scraper (Wayback Machine)
import requests
import json
from bs4 import BeautifulSoup

# To run, just do: python3 twitter_wayback_scrape.py
# Shout out @JordanWildon for the idea https://archive.ph/xEGPF

# Main function
if __name__ == "__main__":
    # Twitter username (case sensitive, no @ symbol)
    user_name = 'realDonaldTrump'

    # Base URL of the Wayback Machine timemap API call (yes, it's ugly)
    base_url = f'https://web.archive.org/web/timemap/?url=https%3A%2F%2Ftwitter.com%2F{user_name}%2F&matchType=prefix&collapse=urlkey&output=json&fl=original%2Cmimetype%2Ctimestamp%2Cendtimestamp%2Cgroupcount%2Cuniqcount&filter=!statuscode%3A%5B45%5D..&limit=100000'

    # Fetch JSON results from the Wayback Machine timemap
    page = requests.get(base_url)
    soup = BeautifulSoup(page.text, 'html.parser')
    archive_json = json.loads(soup.text)

    # The first row of the timemap JSON output is the list of field names, so skip it
    res = []
    total = len(archive_json) - 1
    print(f'Processing {total} urls from the Wayback Machine')
    for archive in archive_json[1:]:
        archive_url = f'https://web.archive.org/web/0/{archive[0]}'
        res.append(archive_url)

    # Convert results to JSON
    print("Saving the tweet urls as JSON")
    with open('tweets.json', 'w', encoding='utf-8') as f:
        json.dump(res, f, indent=8, ensure_ascii=False)

    # Write to a flat text file
    print("Saving tweet urls as a flat text file.")
    with open('tweets.txt', 'w', encoding='utf-8') as text_file:
        for archive_url in res:
            text_file.write(archive_url + '\n')

    print(f'Finished processing {total} urls from the Wayback Machine.')
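
Once the script finishes, tweets.json holds the collected archive URLs as a JSON array and tweets.txt holds one URL per line. A minimal sketch for spot-checking the JSON output (it assumes the file names used above and that the script has already been run):

import json

# Load the URLs the scraper wrote to tweets.json
with open('tweets.json', 'r', encoding='utf-8') as f:
    tweet_urls = json.load(f)

print(f'{len(tweet_urls)} archived tweet urls collected')

# Each entry is a Wayback Machine redirect URL of the form
# https://web.archive.org/web/0/<original twitter.com URL>
for url in tweet_urls[:5]:
    print(url)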