Skip to content

Instantly share code, notes, and snippets.

Last active April 13, 2024 19:16
Show Gist options
  • Save thebristolsound/67de91edeb8da71d564a77ff91bb9d60 to your computer and use it in GitHub Desktop.
Save thebristolsound/67de91edeb8da71d564a77ff91bb9d60 to your computer and use it in GitHub Desktop.
Simple Twitter Archive Scraper (Wayback Machine)
import requests
import json
from bs4 import BeautifulSoup
# To run, just do python3
# Shout out @JordanWildon for the idea
# Main function
# NOTE(review): the endpoint prefix was missing from the published script
# (the URL began directly at the user name). It is reconstructed here as the
# Wayback Machine timemap endpoint queried for twitter.com -- confirm.
WAYBACK_TIMEMAP_PREFIX = (
    'https://web.archive.org/web/timemap/'
    '?url=https%3A%2F%2Ftwitter.com%2F'
)


def build_timemap_url(user_name):
    """Return the Wayback Machine timemap query URL for a Twitter user.

    Args:
        user_name: Twitter user name (case sensitive, no @ symbol).

    Returns:
        The full query URL, requesting JSON output with up to 100000 rows.
    """
    # Base URL of wayback API call (yes it's ugly)
    return (
        f'{WAYBACK_TIMEMAP_PREFIX}{user_name}%2F'
        '&matchType=prefix&collapse=urlkey&output=json'
        '&fl=original%2Cmimetype%2Ctimestamp%2Cendtimestamp'
        '%2Cgroupcount%2Cuniqcount'
        '&filter=!statuscode%3A%5B45%5D..&limit=100000'
    )


def extract_archive_urls(archive_json):
    """Return the first column (the original URL) of every timemap row.

    Args:
        archive_json: Decoded timemap response, a list of rows where each
            row is a list whose first element is the archived URL.

    Returns:
        A list of URL strings, in the order they appear in the response.
    """
    rows = list(archive_json)
    # With output=json the first row is expected to hold the field names
    # requested via `fl` (starting with "original"); drop it so it is not
    # saved as if it were a tweet URL.
    if rows and rows[0] and rows[0][0] == 'original':
        rows = rows[1:]
    # Skip empty rows instead of failing on row[0].
    return [f'{row[0]}' for row in rows if row]


if __name__ == "__main__":
    # Twitter username (case sensitive, no @ symbol)
    user_name = 'realDonaldTrump'
    base_url = build_timemap_url(user_name)

    # Fetch JSON results from Wayback Machine timemap
    page = requests.get(base_url)
    # Fail loudly on an HTTP error rather than trying to decode an error page.
    page.raise_for_status()
    # The response body is already JSON; decoding it directly avoids an HTML
    # parser rewriting entities or tag-like text inside the URLs.
    archive_json = json.loads(page.text)

    res = extract_archive_urls(archive_json)
    total = len(res)
    print(f'Processing {total} urls from the Wayback Machine')

    # Convert results to JSON
    print("Saving the tweet urls as JSON")
    # utf-8 because ensure_ascii=False writes non-ASCII characters verbatim.
    with open('tweets.json', 'w', encoding='utf-8') as f:
        json.dump(res, f, indent=8, ensure_ascii=False)

    # write to a flat text file
    print("Saving tweet urls as a flat text file.")
    with open('tweets.txt', 'w', encoding='utf-8') as text_file:
        text_file.writelines(f'{archive_url}\n' for archive_url in res)

    print(f'Finished processing {total} urls from the Wayback Machine.')
Copy link

J url)

# Convert results to JSOM
print("Saving the tweet urls as JSON")
with open('tweets.json', 'w', encoding='latin-1') as f:
    json.dump(res, f, indent=8, ensure_ascii=False)
# write to a flat text file
print("Saving tweet urls as a flat text file."")
text_file = open('tweets

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment