# download.py (forked from floodsung/Deep-Learning-Papers-Reading-Roadmap)
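"""Download every paper linked from README.md into a section-based folder tree.

README.md is rendered to HTML with mistune and parsed with BeautifulSoup;
each h1/h2 header becomes a directory, and every linked PDF/HTML file is
saved into the directory of its section.
"""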
from __future__ import print_function
import os
import re
from six.moves.urllib.error import HTTPError
import shutil
import argparse
import mistune
import bs4 as BeautifulSoup
import socket
import time
import requests

# encoding=utf8
# Python 2 shim: reload() and sys.setdefaultencoding() do not exist in Python 3,
# so both calls are wrapped and silently skipped there.
import sys
try:
    reload(sys)
except NameError:
    pass
try:
    sys.setdefaultencoding('utf8')
except AttributeError:
    pass

def download_pdf(link, location, name):
    """Fetch a single link and write it to location/name."""
    try:
        response = requests.get(link)
        with open(os.path.join(location, name), 'wb') as f:
            f.write(response.content)
    except HTTPError:
        print('>>> Error 404: cannot be downloaded!\n')
        raise
    except socket.timeout:
        print(" ".join(("can't download", link, "due to connection timeout!")))
        raise
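
# Illustrative call (hypothetical arguments):
#   download_pdf('https://arxiv.org/pdf/1409.1556.pdf', 'pdfs/CNN', 'VGG.pdf')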

def clean_pdf_link(link):
    # arXiv abstract pages are rewritten to their direct PDF URLs.
    if 'arxiv' in link:
        link = link.replace('abs', 'pdf')
        if not link.endswith('.pdf'):
            link = '.'.join((link, 'pdf'))

    print(link)
    return link
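
# Illustrative rewrite (hypothetical paper id):
#   https://arxiv.org/abs/1409.1556  ->  https://arxiv.org/pdf/1409.1556.pdf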

def clean_text(text, replacements={':': '_', ' ': '_', '/': '_', '.': '', '"': ''}):
    # Strip characters that are unsafe in file and directory names.
    for key, rep in replacements.items():
        text = text.replace(key, rep)
    return text

def print_title(title, pattern="-"):
    print('\n'.join(("", title, pattern * len(title))))

def get_extension(link):
    extension = os.path.splitext(link)[1][1:]
    if extension in ['pdf', 'html']:
        return extension
    # Everything else (including suffixes that merely contain 'pdf') falls back to 'pdf'.
    return 'pdf'

def shorten_title(title):
    # Keep the "[n]" index and the quoted paper title, then truncate to 50 characters.
    m1 = re.search(r'\[[0-9]*\]', title)
    m2 = re.search('".*"', title)
    if m1:
        title = m1.group(0)
    if m2:
        title = ' '.join((title, m2.group(0)))
    return title[:50] + ' [...]'
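
# Illustrative result (hypothetical README entry):
#   shorten_title('[1] "Deep learning." (2015)') -> '[1] "Deep learning." [...]'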

if __name__ == '__main__':

    parser = argparse.ArgumentParser(description='Download all the PDF/HTML links in README.md')
    parser.add_argument('-d', action="store", dest="directory")
    parser.add_argument('--no-html', action="store_true", dest="nohtml", default=False)
    parser.add_argument('--overwrite', action="store_true", default=False)
    results = parser.parse_args()
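
    # Example invocation (illustrative; only flags defined above are used):
    #   python download.py -d papers --no-html --overwrite
    # saves everything under ./papers, skips HTML links, and deletes the output
    # directory from any previous run before starting.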

    output_directory = 'pdfs' if results.directory is None else results.directory
    forbidden_extensions = ['html', 'htm'] if results.nohtml else []

    if results.overwrite and os.path.exists(output_directory):
        shutil.rmtree(output_directory)

    # Render README.md to HTML so headers and links can be walked with BeautifulSoup.
    with open('README.md', encoding='utf8') as readme:
        readme_html = mistune.markdown(readme.read())
        readme_soup = BeautifulSoup.BeautifulSoup(readme_html, "html.parser")

    # Skip the README title and start at the first paper section (the second <h1>).
    point = readme_soup.find_all('h1')[1]

    failures = []
    while point is not None:
        if point.name:
            # <h1>/<h2> headers map to (nested) output directories.
            if re.search('h[1-2]', point.name):
                if point.name == 'h1':
                    h1_directory = os.path.join(output_directory, clean_text(point.text))
                    current_directory = h1_directory
                elif point.name == 'h2':
                    current_directory = os.path.join(h1_directory, clean_text(point.text))
                if not os.path.exists(current_directory):
                    os.makedirs(current_directory)
                print_title(point.text)

            # <p> entries hold the actual paper links.
            if point.name == 'p':
                link = point.find('a')
                if link is not None:
                    link = clean_pdf_link(link.attrs['href'])
                    ext = get_extension(link)
                    print(ext)
                    if ext not in forbidden_extensions:
                        print(shorten_title(point.text) + ' (' + link + ')')
                        try:
                            name = clean_text(point.text.split('[' + ext + ']')[0])
                            fullname = '.'.join((name, ext))
                            if not os.path.exists(os.path.join(current_directory, fullname)):
                                download_pdf(link, current_directory, fullname)
                        except KeyboardInterrupt:
                            # First Ctrl-C skips the current paper; a second one within a second quits.
                            try:
                                print("Press Ctrl-C in 1 second to quit")
                                time.sleep(1)
                            except KeyboardInterrupt:
                                print("Cancelling..")
                                break
                        except Exception:
                            failures.append(point.text)

        point = point.next_sibling

    print('Done!')
    if failures:
        print('Some downloads have failed:')
        for fail in failures:
            print('> ' + fail)