# arxiv_scraper.py (forked from ZachisGit/ipfs-arxiv)
import requests
from xml.etree import ElementTree
import os
import json
# Retrieve one page of raw search results from the arXiv API
def get_arxiv(start, length=100, query='all:machine%20learning'):
    """Fetch one page of search results from the arXiv export API.

    Args:
        start: Zero-based offset of the first result to return.
        length: Maximum number of results in the page (``max_results``).
        query: Value for ``search_query``. It is inserted into the URL
            verbatim, so it must already be URL-encoded.

    Returns:
        The raw response body, or ``None`` when the request fails or the
        server answers with an HTTP error status.
    """
    # The query is concatenated rather than passed via ``params=`` because
    # it is already percent-encoded; encoding it again would corrupt it.
    url = ('http://export.arxiv.org/api/query?search_query=' + query
           + '&start=' + str(start) + '&max_results=' + str(length))
    try:
        # Without a timeout a stalled connection would block forever.
        resp = requests.get(url, timeout=60)
        # An HTTP error page is not a usable feed; treat it as a failure.
        resp.raise_for_status()
    except requests.exceptions.RequestException:
        print('error')
        return None
    return resp.content
# Create Xml document from raw string
def xml(content):
    """Parse a raw XML string and return the root element of the document."""
    return ElementTree.fromstring(content)
# Returns a list of dicts with the keys 'id', 'title' and 'summary',
# where http://arxiv.org/pdf/[id] is the pdf download link
def get_entries(xml_root):
    """Collect id, title and summary of every Atom entry under *xml_root*.

    Returns a list of dicts with the keys 'id', 'title' and 'summary'.
    The 'http://arxiv.org/abs/' prefix is stripped from the id, so the
    PDF download link is http://arxiv.org/pdf/[id].
    """
    atom = '{http://www.w3.org/2005/Atom}'

    def field(node, tag):
        # Text of the first direct child <tag> in the Atom namespace.
        return node.find(atom + tag).text

    return [
        {
            'id': field(node, 'id').replace('http://arxiv.org/abs/', ''),
            'title': field(node, 'title'),
            'summary': field(node, 'summary'),
        }
        for node in xml_root.findall(atom + 'entry')
    ]
# Writes the entries to the json index file + Removes the duplicates from the new entries
def write_entries_to_index(entries, index_file='index.json'):
    """Merge *entries* into the JSON index file, skipping ids already known.

    Args:
        entries: Iterable of dicts, each with an 'id' key.
        index_file: Path of the JSON index. It is read if it exists and is
            rewritten in full on every call.

    Returns:
        The list of entries that were actually added: those whose id was
        neither in the index already nor seen earlier in *entries*.
    """
    json_entries = []
    if os.path.isfile(index_file):
        with open(index_file, 'r') as json_file:
            json_entries = json.load(json_file)

    # A set gives constant-time lookups, and recording each accepted id in
    # it also filters duplicates that occur inside the incoming batch.
    seen_ids = set(entry['id'] for entry in json_entries)
    new_entries = []
    for e in entries:
        if e['id'] in seen_ids:
            continue
        seen_ids.add(e['id'])
        json_entries.append(e)
        new_entries.append(e)

    with open(index_file, 'w') as json_file:
        json_file.write(json.dumps(json_entries))
    return new_entries
# Downloads the pdfs from the entry ids to a folder
def download_pdfs(entries, folder='pdfs'):
    """Download the PDF of every entry into *folder*.

    Args:
        entries: Iterable of dicts with an 'id' key; the PDF is requested
            from https://arxiv.org/pdf/[id].pdf.
        folder: Target directory, created (with parents) if missing.

    Network and file-system errors are reported on stdout and the affected
    entry is skipped. Nothing is returned.
    """
    if not os.path.isdir(folder):
        os.makedirs(folder)
    for e in entries:
        print('[Downloading... ' + e['id'] + ']')
        try:
            # Without a timeout a stalled connection would block forever.
            resp = requests.get('https://arxiv.org/pdf/' + e['id'] + '.pdf',
                                timeout=60)
            # Compare only the media type: the header may be absent, differ
            # in case, or carry parameters after 'application/pdf'.
            content_type = resp.headers.get('Content-Type', '')
            if not content_type.lower().startswith('application/pdf'):
                print('Not PDF')
                continue
            # Ids such as 'cs/0101001v1' contain a slash, which would point
            # into a sub-directory that does not exist; flatten the name.
            target = os.path.join(folder, e['id'].replace('/', '_') + '.pdf')
            with open(target, 'wb') as pdf_file:
                pdf_file.write(resp.content)
        except (requests.exceptions.RequestException, IOError, OSError):
            print('[PDF-Download-Error: id=' + e['id'] + ' could not be downloaded!]')
            continue
# Driver: walk the first 1000 search results in pages of 100. For each page,
# fetch the feed, extract its entries, add the unseen ones to index.json and
# download the PDFs of just those newly added entries.
for i in range(0, 1000, 100):
    print('Retriving entries [ %d / %d ]' % (i, i + 100))
    raw = get_arxiv(i, length=100)
    if raw is None:
        # get_arxiv has already reported the failure; parsing None would
        # abort the whole run, so skip only this page.
        continue
    print('Extracting xml entries...')
    xml_root = xml(raw)
    entries = get_entries(xml_root)
    print('Writing to json index...')
    # From here on `entries` holds only the entries new to the index.
    entries = write_entries_to_index(entries, index_file='index.json')
    print('Downloading PDFs...')
    download_pdfs(entries, folder='pdfs')