forked from floodsung/Deep-Learning-Papers-Reading-Roadmap
-
Notifications
You must be signed in to change notification settings - Fork 2
/
download.py
111 lines (93 loc) · 3.89 KB
/
download.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import argparse
import os
import re
import shutil
import socket
import urllib2

import bs4 as BeautifulSoup
import mistune
def download_pdf(link, location, name):
    """Download ``link`` and store it as ``name`` inside ``location``.

    Args:
        link: URL of the document to fetch.
        location: Directory in which the file is created.
        name: File name (including its extension) to write.

    Raises:
        urllib2.HTTPError: Re-raised after a message has been printed, so
            the caller can record the failure.

    A ``socket.timeout`` is reported on stdout and then swallowed (best
    effort); every other exception propagates unchanged.
    """
    try:
        response = urllib2.urlopen(link, timeout=500)
        try:
            # Fetch the complete body before touching the disk: a failed or
            # timed-out transfer must not leave an empty file behind.
            payload = response.read()
        finally:
            response.close()
        # Binary mode, because the payload is raw bytes (typically a PDF)
        # and must be written without any newline translation.
        with open(os.path.join(location, name), 'wb') as output:
            output.write(payload)
    except urllib2.HTTPError:
        print('>>> Error 404: cannot be downloaded!\n')
        raise
    except socket.timeout:
        print(" ".join(("can't download", link, "due to connection timeout!")))
def clean_pdf_link(link):
    """Rewrite a link so that it points at a downloadable document.

    * Links containing ``arxiv``: the ``/abs/`` path segment becomes
      ``/pdf/`` and a ``.pdf`` suffix is appended when missing.
    * Links containing ``github``: an ``.html`` suffix is appended unless
      the link already ends with it.

    Any other link is returned unchanged.

    Args:
        link: The URL as a string.

    Returns:
        The rewritten URL.
    """
    if 'arxiv' in link:
        # Rewrite only the path segment; replacing every 'abs' would also
        # corrupt hosts or paths that merely contain those letters.
        link = link.replace('/abs/', '/pdf/')
        if not link.endswith('.pdf'):
            link = '.'.join((link, 'pdf'))
    # Skip links that already carry the suffix to avoid '.html.html'.
    if 'github' in link and not link.endswith('.html'):
        link = '.'.join((link, 'html'))
    return link
def clean_text(text, replacements=None):
    """Return ``text`` with every key of ``replacements`` substituted.

    Args:
        text: The string to clean.
        replacements: Mapping of substring -> replacement. When omitted
            (``None``), spaces and slashes become underscores while dots
            and double quotes are removed.

    Returns:
        The cleaned string.
    """
    if replacements is None:
        # Built per call so no mutable default is shared between calls.
        replacements = {' ': '_', '/': '_', '.': '', '"': ''}
    for key, rep in replacements.items():
        text = text.replace(key, rep)
    return text
def print_title(title, pattern = "-"):
    """Print ``title`` after an empty line, underlined with ``pattern``.

    ``pattern`` is repeated once per character of ``title``.
    """
    underline = pattern * len(title)
    banner_lines = ["", title, underline]
    print('\n'.join(banner_lines))
def get_extension(link):
    """Return the extension (``'pdf'`` or ``'html'``) to save ``link`` under.

    A link is treated as HTML when its extension is ``html`` or ``htm``
    (case-insensitive), either on the raw link or once a query string or
    fragment has been stripped. Everything else falls back to ``'pdf'``.

    Args:
        link: The URL as a string.

    Returns:
        ``'html'`` or ``'pdf'``.
    """
    # Second candidate: the link without '#fragment' / '?query', so that
    # 'page.html#section' is still recognised as HTML.
    without_suffix = link.split('#', 1)[0].split('?', 1)[0]
    for candidate in (link, without_suffix):
        extension = os.path.splitext(candidate)[1][1:].lower()
        if extension in ('html', 'htm'):
            return 'html'
    return 'pdf'
def shorten_title(title):
    """Build a short, printable label from a README entry.

    The label is the first bracketed number (e.g. ``[12]``), when present,
    followed by the double-quoted part of the entry, when present. It is
    cut to 50 characters and ``' [...]'`` is always appended.

    Args:
        title: Full text of the README entry.

    Returns:
        The shortened label.
    """
    # The brackets are escaped so they match literally; unescaped,
    # '[[0-9]*]' is a character class that also matches a bare ']'.
    m1 = re.search(r'\[[0-9]*\]', title)
    m2 = re.search('".*"', title)
    if m1:
        title = m1.group(0)
    if m2:
        title = ' '.join((title, m2.group(0)))
    return title[:50] + ' [...]'
if __name__ == '__main__':
    # Command-line options: target directory, HTML filtering, clean start.
    parser = argparse.ArgumentParser(description = 'Download all the PDF/HTML links into README.md')
    parser.add_argument('-d', action="store", dest="directory")
    parser.add_argument('--no-html', action="store_true", dest="nohtml", default = False)
    parser.add_argument('--overwrite', action="store_true", default = False)
    results = parser.parse_args()

    output_directory = 'pdfs' if results.directory is None else results.directory
    # With --no-html, links classified with these extensions are skipped.
    forbidden_extensions = ['html', 'htm'] if results.nohtml else []

    # --overwrite wipes earlier downloads so everything is fetched again.
    if results.overwrite and os.path.exists(output_directory):
        shutil.rmtree(output_directory)

    # Render the README to HTML and parse it so headings and paragraphs
    # can be walked as sibling nodes.
    with open('README.md') as readme:
        readme_html = mistune.markdown(readme.read())
    readme_soup = BeautifulSoup.BeautifulSoup(readme_html, "html.parser")

    # The walk starts at the second <h1>; fail with a clear message rather
    # than an IndexError when the README does not have one.
    top_headings = readme_soup.find_all('h1')
    if len(top_headings) < 2:
        raise SystemExit('README.md must contain at least two top-level (h1) headings')
    point = top_headings[1]

    failures = []
    while point is not None:
        # Nodes without a tag name are ignored (presumably the text nodes
        # between block elements -- confirm against bs4).
        if point.name:
            if point.name in ('h1', 'h2'):
                # <h1> opens a top-level directory, <h2> a sub-directory
                # of the most recent <h1>.
                if point.name == 'h1':
                    h1_directory = os.path.join(output_directory, clean_text(point.text))
                    current_directory = h1_directory
                else:
                    current_directory = os.path.join(h1_directory, clean_text(point.text))
                if not os.path.exists(current_directory):
                    os.makedirs(current_directory)
                print_title(point.text)

            if point.name == 'p':
                anchor = point.find('a')
                # An <a> without href has nothing to download; skip it
                # instead of raising KeyError.
                href = anchor.get('href') if anchor is not None else None
                if href is not None:
                    link = clean_pdf_link(href)
                    ext = get_extension(link)
                    if ext not in forbidden_extensions:
                        print(shorten_title(point.text) + ' (' + link + ')')
                        try:
                            # File name: the entry text up to its '[ext]' marker.
                            name = clean_text(point.text.split('[' + ext + ']')[0])
                            fullname = '.'.join((name, ext))
                            # Files already on disk are not downloaded again.
                            if not os.path.exists(os.path.join(current_directory, fullname)):
                                download_pdf(link, current_directory, fullname)
                        except Exception:
                            # Record the failure and carry on. Ctrl-C and
                            # SystemExit are not caught, so the run can
                            # still be aborted.
                            failures.append(point.text)
        point = point.next_sibling

    print('Done!')
    if failures:
        print('Some downloads have failed:')
        for fail in failures:
            print('> ' + fail)