forked from awesome-archive/lianjia-scrawler
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathjianwei.py
106 lines (96 loc) · 3.19 KB
/
jianwei.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#! usr/bin/python #coding=utf-8
import os
import requests
from bs4 import BeautifulSoup
from peewee import *
# --- Scraper configuration ---------------------------------------------------
# Base URL of the Beijing housing-transaction portal being scraped.
BASE_URL = 'http://210.75.213.188/shh/portal/bjjs2016/'
# Highest listing-index page number to fetch (inclusive).
PAGE = 2940

# Database credentials: read from the environment when set, so secrets are not
# hard-coded in source. Defaults preserve the original local-dev values, so
# existing setups keep working unchanged.
username = os.environ.get('JIANWEI_DB_USER', 'root')
password = os.environ.get('JIANWEI_DB_PASSWORD', '')
dbname = os.environ.get('JIANWEI_DB_NAME', 'jianwei')
host = os.environ.get('JIANWEI_DB_HOST', '127.0.0.1')

# Shared peewee connection used by every model via BaseModel.Meta.
database = MySQLDatabase(
    dbname,
    host=host,
    port=3306,
    user=username,
    passwd=password,
    charset='utf8',
    use_unicode=True,
)
class BaseModel(Model):
    """Abstract base for all models in this module.

    Binds every subclass to the module-level MySQL ``database`` connection
    via the peewee ``Meta`` convention.
    """
    class Meta:
        # peewee reads model configuration from this inner class;
        # `database` on the right is the module-level MySQLDatabase instance.
        database = database
class House(BaseModel):
    """One second-hand housing transaction row scraped from the portal.

    Values are inserted exactly as scraped (strings; MySQL coerces them into
    the declared column types). ``id`` is the site's own row identifier taken
    from the first <td> of each listing row, and the main loop upserts on it,
    so re-scraping the same listing updates rather than duplicates.
    """
    id = PrimaryKeyField()      # site-provided row id (first <td> of the listing row)
    district = CharField()      # presumably the administrative district -- inferred from name
    name = CharField()          # community / estate name column
    type = CharField()          # layout type cell; shadows builtin `type` but is kept as the DB column name
    square = FloatField()       # floor area; unit not shown in SOURCE -- TODO confirm (likely m^2)
    price = FloatField()        # price cell with its 2-char unit suffix stripped by the caller
    agency = CharField()        # brokering agency cell
    time = DateField()          # transaction date cell
    url = CharField()           # absolute URL of the detail page
    direction = CharField()     # facing direction, parsed from the detail page
    floor = CharField()         # unit's floor, parsed from the detail page
    total_floor = CharField()   # building's total floors, parsed from the detail page
    year = IntegerField()       # construction year, parsed from the detail page
    decoration = CharField()    # decoration/fit-out state, parsed from the detail page
def database_init():
    """Create the House table on the configured MySQL database if absent.

    Opens a connection, creates missing tables, and always closes the
    connection again -- the original leaked the connection when
    ``create_tables`` raised.
    """
    database.connect()
    try:
        # safe=True -> CREATE TABLE IF NOT EXISTS, so repeated runs are harmless.
        database.create_tables([House], safe=True)
    finally:
        database.close()
def get_source_code(url):
    """Fetch *url* over HTTP and return the raw response body as bytes.

    Returns ``None`` when the request fails at the network level; callers
    must handle that case. The bare ``except Exception`` of the original is
    narrowed to ``requests.RequestException`` so genuine programming errors
    are no longer silently swallowed.
    """
    try:
        # NOTE(review): the original had no timeout, so one hung server could
        # stall the whole scrape; 30s keeps the best-effort behaviour bounded.
        result = requests.get(url, timeout=30)
    except requests.RequestException as e:
        print(e)
        return None
    return result.content
def parse_house(url, info_dict):
    """Scrape the detail page at *url*, merge its fields into *info_dict*,
    and upsert the combined record into the House table.

    Mutates *info_dict* in place. Detail-page cells that are missing are
    skipped (best-effort, as in the original), but the except clause is
    narrowed to ``IndexError`` so unrelated bugs are not hidden, and a failed
    download no longer crashes ``BeautifulSoup(None)`` -- the partial
    list-page record is stored instead.
    """
    source_code = get_source_code(url)
    if source_code is None:
        # Network failure: persist whatever list-page fields we already have.
        House.insert(**info_dict).upsert().execute()
        return
    soup = BeautifulSoup(source_code, 'lxml')
    # Collect every <td> row from the info table(s) on the detail page.
    tds = []
    for box in soup.find_all('div', class_="infolist_box"):
        for body in box.find_all("tbody"):
            for tr in body.find_all("tr"):
                tds.append(tr.findAll('td'))
    try:
        # Fixed cell positions observed on the portal's detail-page layout.
        info_dict.update({'direction': tds[1][1].get_text().strip()})
        info_dict.update({'floor': tds[3][0].get_text().strip()})
        info_dict.update({'total_floor': tds[3][1].get_text().strip()})
        info_dict.update({'year': tds[4][0].get_text().strip()})
        info_dict.update({'decoration': tds[5][0].get_text().strip()})
    except IndexError:
        # Some detail pages lack rows; keep the partial record (best-effort).
        pass
    House.insert(**info_dict).upsert().execute()
# ---------------------------------------------------------------------------
# Main scraping loop: walk every listing-index page, extract the per-row
# summary fields from the index table, then visit each row's detail page
# for the remaining fields and store the record.
# ---------------------------------------------------------------------------
database_init()
for i in range(0, PAGE + 1):
    source_code = get_source_code(BASE_URL + 'list.aspx?pagenumber=' + str(i))
    if source_code is None:
        # Skip pages that failed to download instead of crashing the run.
        continue
    soup = BeautifulSoup(source_code, 'lxml')
    # Collect every <td> row from the listing table(s) on this index page.
    tds = []
    for box in soup.find_all('div', class_="infolist_box"):
        for body in box.find_all("tbody"):
            for tr in body.find_all("tr"):
                tds.append(tr.findAll('td'))
    for td in tds:
        # Build the absolute detail-page URL once (the original built it twice).
        detail_url = BASE_URL + td[8].a.get('href')
        info_dict = {
            'id': td[0].get_text().strip(),
            'district': td[1].get_text().strip(),
            'name': td[2].get_text().strip(),
            'type': td[3].get_text().strip(),
            'square': td[4].get_text().strip(),
            # The last two characters of the price cell are its unit suffix.
            'price': td[5].get_text().strip()[:-2],
            'agency': td[6].get_text().strip(),
            'time': td[7].get_text().strip(),
            'url': detail_url,
        }
        parse_house(detail_url, info_dict)
    # Python-2-only `print` statement replaced with the function form so the
    # script also runs under Python 3 (message kept byte-identical); the rest
    # of the file already used the parenthesized form.
    print('Page%d Finish' % i)