Wikipedia.py
60 lines (49 loc) · 2.03 KB
import lxml.etree
import SearchGoogle


class Wikipedia(object):
    def __init__(self, query, results_per_page=1, search_location="any"):
        self.site = 'en.wikipedia.org'
        self.results = []
        self.child_result = []
        self.parser = lxml.etree.HTMLParser()
        self.query = query
        # Restrict the Google search to the Wikipedia domain.
        self.sg = SearchGoogle.SearchGoogle()
        self.sg.results_per_page = results_per_page
        self.sg.search_location = search_location
        self.sg.site = self.site

    def extract(self):
        urls = self.sg.get_results(self.query)['url']
        for url in urls:
            intermed_results = []
            opurl = self.sg.open_url(url)
            doc_tree = lxml.etree.parse(opurl, self.parser)
            # The article text of a Wikipedia page sits in <div id="bodyContent">.
            div = doc_tree.xpath("//div[@id='bodyContent']")[0]
            divchildren = div.getchildren()
            self.child_result = []
            current = []
            for elm in divchildren:
                if elm.tag == "h2":
                    # An <h2> starts a new section: store the section collected
                    # so far and begin a new one with the heading element.
                    self.child_result.append(current)
                    current = []
                    current.append(elm.getchildren()[1])
                if elm.tag == "p" or elm.tag == "ul":
                    current.append(elm)
                if elm.tag == "h3":
                    current.append(elm.getchildren()[1])
            for li in self.child_result:
                abstract = ""
                for text in li:
                    if text.tag == "span":
                        # Section headings are <span> elements; mark them in the text.
                        abstract += "<%s>: " % text.text
                    else:
                        # Concatenate every text node of the paragraph or list.
                        textit = text.itertext()
                        try:
                            while True:
                                abstract += next(textit)
                        except StopIteration:
                            pass
                intermed_results.append(abstract)
            # FIXME: The intermediate results should be filtered,
            # only return introduction and sections related to symptoms?
            self.results.append(intermed_results)
        return {self.site: self.results}
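
A minimal usage sketch (not part of the original file), assuming the local SearchGoogle module behaves the way the calls above imply: get_results(query) returns a dict with a 'url' list restricted to self.site, and open_url(url) returns a file-like object that lxml can parse. The query string and the result handling below are illustrative assumptions inferred only from the constructor signature and extract()'s return value.

from Wikipedia import Wikipedia

# Hypothetical example: requires the SearchGoogle module and network access.
wiki = Wikipedia("influenza symptoms", results_per_page=3)
by_site = wiki.extract()                  # {'en.wikipedia.org': [...]}
for site, pages in by_site.items():
    for page_sections in pages:           # one list of section texts per URL
        for section_text in page_sections:
            print(section_text[:120])     # preview each extracted section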