From b4e88b2c0e934507f0eb23c6e7bd86d246189150 Mon Sep 17 00:00:00 2001 From: Moh Tur Date: Sat, 18 Jul 2020 00:28:48 -0400 Subject: [PATCH] Change User-Agent if it's default for IA & IS --- archivenow/__init__.py | 2 +- archivenow/archivenow.py | 2 +- archivenow/handlers/ia_handler.py | 6 ++++-- archivenow/handlers/is_handler.py | 19 ++++++++++++++----- archivenow/handlers/mg_handler.py | 4 +++- archivenow/handlers/st_handler.py | 2 +- 6 files changed, 24 insertions(+), 11 deletions(-) diff --git a/archivenow/__init__.py b/archivenow/__init__.py index 49afa2e..044c68b 100755 --- a/archivenow/__init__.py +++ b/archivenow/__init__.py @@ -1 +1 @@ -__version__ = '2020.4.1.10.34.36' \ No newline at end of file +__version__ = '2020.7.18.12.19.44' \ No newline at end of file diff --git a/archivenow/archivenow.py b/archivenow/archivenow.py index b188481..6437bb1 100755 --- a/archivenow/archivenow.py +++ b/archivenow/archivenow.py @@ -15,7 +15,7 @@ #from __init__ import __version__ as archiveNowVersion -archiveNowVersion = '2020.4.1.10.34.36' +archiveNowVersion = '2020.7.18.12.19.44' # archive handlers path PATH = Path(os.path.dirname(os.path.abspath(__file__))) diff --git a/archivenow/handlers/ia_handler.py b/archivenow/handlers/ia_handler.py index e2b282f..5880bb8 100644 --- a/archivenow/handlers/ia_handler.py +++ b/archivenow/handlers/ia_handler.py @@ -11,11 +11,13 @@ def push(self, uri_org, p_args=[], session=requests.Session()): msg = '' try: uri = 'https://web.archive.org/save/' + uri_org - archiveTodayUserAgent = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64)"} + archiveTodayUserAgent = { + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36" + } # push into the archive # r = session.get(uri, timeout=120, allow_redirects=True, headers=archiveTodayUserAgent) - if 'user-agent' in session.headers: + if ('user-agent' in session.headers) and (not session.headers['User-Agent'].lower().startswith('python-requests/')): r = session.get(uri, timeout=120, allow_redirects=True) else: r = session.get(uri, timeout=120, allow_redirects=True, headers=archiveTodayUserAgent) diff --git a/archivenow/handlers/is_handler.py b/archivenow/handlers/is_handler.py index d48e781..7acdd9f 100644 --- a/archivenow/handlers/is_handler.py +++ b/archivenow/handlers/is_handler.py @@ -39,9 +39,11 @@ def push(self, uri_org, p_args=[], session=requests.Session()): msg = '' - archiveTodayUserAgent = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64)" , "host": host} - - if 'user-agent' in session.headers: + archiveTodayUserAgent = { + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36", + "host": host} + + if ('user-agent' in session.headers) and (not session.headers['User-Agent'].lower().startswith('python-requests/')): rid = session.get('https://'+host+'/',timeout=120, allow_redirects=True) else: rid = session.get('https://'+host+'/',timeout=120, allow_redirects=True, headers=archiveTodayUserAgent) @@ -55,10 +57,17 @@ def push(self, uri_org, p_args=[], session=requests.Session()): msg = "IndexError (" + self.name+ "): unable to extract 'submitid' " raise - # push to the archive - r = session.post('https://'+host+'/submit/', timeout=120, + # push to the archive + if ('user-agent' in session.headers) and (not session.headers['User-Agent'].lower().startswith('python-requests/')): + r = session.post('https://'+host+'/submit/', timeout=120, data={"anyway":"1" , "url":uri_org, "submitid":archiveTodaySubmitId}, allow_redirects=True) + else: + r = session.post('https://'+host+'/submit/', timeout=120, + data={"anyway":"1" , "url":uri_org, "submitid":archiveTodaySubmitId}, + allow_redirects=True, + headers=archiveTodayUserAgent) + r.raise_for_status() # extract the link to the archived copy diff --git a/archivenow/handlers/mg_handler.py b/archivenow/handlers/mg_handler.py index c6a444b..66c1eba 100644 --- a/archivenow/handlers/mg_handler.py +++ b/archivenow/handlers/mg_handler.py @@ -4,7 +4,9 @@ -new_header = 'Mozilla/5.0 (X11; Linux x86_64; rv:62.0) Gecko/20100101 Firefox/62.0' +new_header = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36' + + class MG_handler(object): diff --git a/archivenow/handlers/st_handler.py b/archivenow/handlers/st_handler.py index 30cfb5b..41fee1f 100644 --- a/archivenow/handlers/st_handler.py +++ b/archivenow/handlers/st_handler.py @@ -2,7 +2,7 @@ import requests import re -new_header = 'Mozilla/5.0 (X11; Linux x86_64; rv:62.0) Gecko/20100101 Firefox/62.0' +new_header = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36' headers = {'User-Agent': new_header}