forked from prashanti/concept-drift-analysis
-
Notifications
You must be signed in to change notification settings - Fork 0
/
GetBHLCorpus.py
122 lines (101 loc) · 3.49 KB
/
GetBHLCorpus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
def getTitleIDs():
    """Populate the module-level ``titleids`` set from title_parsed.xls.

    Reads the tab-separated file, skips any header line containing
    "TitleID", and stores the stripped first column (the title id) of
    every data row.
    """
    global titleids
    # ``with`` guarantees the handle is closed (the original leaked it).
    with open('title_parsed.xls', 'r') as inp:
        for line in inp:
            if "TitleID" not in line:  # skip header row
                data = line.split("\t")
                # Column 0 is the TitleID; column 1 (language) was read but
                # never used in the original, so it is dropped here.
                titleids.add(data[0].strip())
def getItemIDs():
    """Populate ``itemids_dict`` (TitleID -> set of ItemIDs) from item_parsed.xls.

    Reads the tab-separated file, skips any header line containing
    "TitleID", and groups column 0 (ItemID) under column 1 (TitleID).
    """
    # The original also declared ``global titleids`` but never used it;
    # that dead declaration is removed.
    global itemids_dict
    # ``with`` guarantees the handle is closed (the original leaked it).
    with open('item_parsed.xls', 'r') as inp:
        for line in inp:
            if "TitleID" not in line:  # skip header row
                data = line.split("\t")
                itemid = data[0].strip()
                titleid = data[1].strip()
                # setdefault replaces the original membership-test-then-insert.
                itemids_dict.setdefault(titleid, set()).add(itemid)
def getVertebrateTitleIDs():
    """Collect into ``vertebratetitleids`` every TitleID whose Subject
    column contains "Vertebrate", read from subject.txt.

    Columns (tab-separated): TitleID  Subject  CreationDate.
    """
    global vertebratetitleids
    # ``with`` guarantees the handle is closed (the original leaked it).
    with open('subject.txt', 'r') as inp:
        for line in inp:
            if "TitleID" not in line:  # skip header row
                data = line.split("\t")
                if "Vertebrate" in data[1]:
                    vertebratetitleids.add(data[0].strip())
def getVertebratePartIDs():
    """Collect into ``vertebratepartids`` every PartID whose third column
    contains "Vertebrate", read from part_parsed.xls.
    """
    global vertebratepartids
    # File header (per the original comment): PartID ItemID ContributorName
    # SequenceOrder SegmentType Title ContainerTitle ...
    # NOTE(review): the code matches against column 2, which per that header
    # is ContributorName, not Title, although the original named it ``title``.
    # Behavior is preserved here -- confirm against the real file layout
    # before changing the index.
    # ``with`` guarantees the handle is closed (the original leaked it).
    with open('part_parsed.xls', 'r') as inp:
        for line in inp:
            if "PartID" not in line:  # skip header row
                data = line.split("\t")
                partid = data[0].strip()
                if "Vertebrate" in data[2].strip():
                    vertebratepartids.add(partid)
def getPageIDsforPartId(apikey="8c118b05-3e6e-4ef2-92c5-78610a868a14"):
    """For each id in ``vertebratepartids``, query the BHL GetPartMetadata
    endpoint and record every returned PageID in
    ``partid2pageids_dict[partid]``.

    :param apikey: BHL API key. Defaults to the key the original hard-coded,
        so existing callers are unaffected.
    """
    global partid2pageids_dict
    # SECURITY NOTE(review): the default API key is committed to source
    # control; consider loading it from the environment instead.
    base = ("http://www.biodiversitylibrary.org/api2/httpquery.ashx"
            "?op=GetPartMetadata&partid=")
    for partid in vertebratepartids:
        url = base + partid + "&pages=t&ocr=t&apikey=" + apikey + "&format=json"
        content = urllib2.urlopen(url).read()
        # clean_html strips markup wrapped around the JSON payload.
        # NOTE(review): nltk.clean_html was removed in NLTK 3.x -- this
        # script requires the Python 2 / NLTK 2.x stack it was written for.
        raw = nltk.clean_html(content)
        decoded_data = json.loads(raw)
        # setdefault replaces the original in-loop membership check; the
        # bucket is still only created when at least one page exists,
        # matching the original behavior for empty 'Pages' lists.
        for page in decoded_data['Result']['Pages']:
            partid2pageids_dict.setdefault(partid, set()).add(page['PageID'])
def main():
    """Build BHLCorpus.txt: OCR text for vertebrate titles, then for
    vertebrate parts, fetched from the BHL API.

    Orchestration: loads title/item/subject/part lookup tables via the
    helpers above, then streams OCR text into the output file.
    """
    # NOTE(review): subjectlimit starts at 0, so the "TitleID:" header below
    # is never written; partlimit only gates the "Part ID:" header, not the
    # page downloads themselves. Both look suspicious -- confirm intent.
    subjectlimit = 0
    partlimit = 10
    # (The original's ``global titleids`` / ``global itemids_dict`` were
    # unnecessary -- both are only read here -- and are removed.)
    getTitleIDs()
    getItemIDs()
    getVertebrateTitleIDs()
    getVertebratePartIDs()
    getPageIDsforPartId()
    # ``with`` guarantees the corpus file is flushed and closed; the original
    # never closed it, risking lost buffered output.
    with open('BHLCorpus.txt', 'w') as f:
        for titleid in vertebratetitleids:
            for item in itemids_dict[titleid]:
                if subjectlimit > 0:
                    f.write("TitleID: " + str(titleid) + "\n")
                    subjectlimit = subjectlimit - 1
                f.write("ItemID: " + str(item) + "\n")
                url = "http://www.biodiversitylibrary.org/api2/httpquery.ashx?op=GetItemMetadata&itemid=" + item.strip() + "&pages=t&ocr=t&parts=f&apikey=8c118b05-3e6e-4ef2-92c5-78610a868a14&format=json"
                content = urllib2.urlopen(url).read()
                # NOTE(review): nltk.clean_html requires NLTK 2.x (removed
                # in NLTK 3) -- this is Python 2 era code.
                raw = nltk.clean_html(content)
                decoded_data = json.loads(raw)
                for page in decoded_data['Result']['Pages']:
                    f.write(page['OcrText'].encode('utf-8').strip() + "\n")
        f.write("Getting Parts which match Vertebrate in the title")
        for partid in partid2pageids_dict:
            if partlimit > 0:
                partlimit = partlimit - 1
                f.write("Part ID: " + str(partid) + "\n")
            for pageid in partid2pageids_dict[partid]:
                url = "http://www.biodiversitylibrary.org/api2/httpquery.ashx?op=GetPageOcrText&pageid=" + str(pageid) + "&apikey=8c118b05-3e6e-4ef2-92c5-78610a868a14&format=json"
                content = urllib2.urlopen(url).read()
                raw = nltk.clean_html(content)
                decoded_data = json.loads(raw)
                f.write(decoded_data['Result'].encode('utf-8'))
if __name__ == "__main__":
    # Imports are deferred to script entry (Python 2 era dependencies:
    # urllib2, BeautifulSoup, and NLTK 2.x's clean_html).
    import json
    import urllib2
    import nltk
    from BeautifulSoup import BeautifulSoup

    # Module-level accumulators shared by the helper functions above.
    titleids = set()
    vertebratetitleids = set()
    vertebratepartids = set()
    itemids_dict = {}
    partid2pageids_dict = {}

    main()