Ch2_save_to_csv.py
# author = [email protected]
# -*- coding: cp936 -*-
import re
import csv
import lxml.html

import Ch2_link_crawler


class ScrapeCallback:
    """Scrape country data from each crawled page and append it to countries.csv."""

    def __init__(self):
        # Open the output file and write the header row once up front
        self.writer = csv.writer(open('countries.csv', 'w'))
        self.fields = ('area', 'population', 'country', 'capital',
                       'continent', 'tld', 'currency_code', 'currency_name',
                       'phone', 'postal_code_format', 'postal_code_regex',
                       'languages', 'neighbours')
        self.writer.writerow(self.fields)

    def __call__(self, url, html):
        # Only country detail pages (URLs containing /view/) hold the data table
        if re.search('/view/', url):
            tree = lxml.html.fromstring(html)
            row = []
            for field in self.fields:
                # Each value sits in the table row with id "places_<field>__row",
                # inside the cell with class "w2p_fw"
                row.append(tree.cssselect('table > tr#places_%s__row > td.w2p_fw' % field)[0].text_content())
            self.writer.writerow(row)


if __name__ == '__main__':
    # Crawl index and view pages with no depth limit, saving one CSV row per country
    Ch2_link_crawler.link_crawler('http://example.webscraping.com', '/(index|view)',
                                  max_depth=-1, scrape_callback=ScrapeCallback())
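
The script delegates the actual crawling to Ch2_link_crawler, which is not included in this file. For context, the sketch below shows the interface this script appears to assume: a link_crawler(seed_url, link_regex, max_depth, scrape_callback) function that downloads each page, invokes the callback with (url, html), and follows links matching link_regex. The function body, the regex-based link extraction, and the depth handling are illustrative assumptions, not the module's actual implementation.

# Assumed sketch of Ch2_link_crawler.link_crawler (Python 2, matching this script's style).
# Names, defaults, and internals here are assumptions made for illustration only.
import re
import urllib2
import urlparse


def link_crawler(seed_url, link_regex, max_depth=-1, scrape_callback=None):
    """Crawl from seed_url, following relative links that match link_regex."""
    crawl_queue = [seed_url]
    seen = {seed_url: 0}  # URL -> depth, used to enforce max_depth and avoid revisits
    while crawl_queue:
        url = crawl_queue.pop()
        depth = seen[url]
        try:
            html = urllib2.urlopen(url).read()
        except urllib2.URLError:
            continue  # skip pages that fail to download
        if scrape_callback:
            scrape_callback(url, html)  # e.g. ScrapeCallback writes a CSV row
        # max_depth < 0 is treated as "no limit", consistent with the call above
        if max_depth < 0 or depth < max_depth:
            # Crude href extraction; matching links are resolved against the seed URL
            for link in re.findall(r'<a[^>]+href=["\'](.*?)["\']', html):
                if re.match(link_regex, link):
                    link = urlparse.urljoin(seed_url, link)
                    if link not in seen:
                        seen[link] = depth + 1
                        crawl_queue.append(link)

With an interface along these lines, the call at the bottom of Ch2_save_to_csv.py would visit every /index and /view page reachable from the home page and invoke ScrapeCallback on each, producing one countries.csv row per country page.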