-
Notifications
You must be signed in to change notification settings - Fork 19
/
Copy pathprepare_geo_features.py
69 lines (55 loc) · 1.85 KB
/
prepare_geo_features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import pandas as pd
import codecs
import json
import os, sys
from tqdm import tqdm
from time import sleep
from zipfile import ZipFile
from urllib2 import urlopen
# ---------------------------------------------------------------------------
# Setup: load item coordinates, find out which items were already written to
# the result file by a previous run, and open that file for writing/appending.
# ---------------------------------------------------------------------------
print('reading data...')

# The first command-line argument names the dataset; it selects both the
# input archive ../input/ItemInfo_<name>.csv.zip and the output geo-<name>.txt.
if len(sys.argv) < 2:
    sys.exit('usage: %s <dataset>' % sys.argv[0])
data = sys.argv[1]

# Only the id and the coordinates are needed.  The archive and the member
# stream are closed as soon as pandas has finished reading the CSV.
with ZipFile('../input/ItemInfo_%s.csv.zip' % data) as archive:
    with archive.open('ItemInfo_%s.csv' % data) as zip_file:
        df_train_info = pd.read_csv(zip_file, usecols=['itemID', 'lat', 'lon'])

result_file = 'geo-%s.txt' % data

# Ids already present in the result file; these are skipped on this run.
processed = set()

if os.path.exists(result_file):
    print('result file %s aready exists, appending...' % result_file)
    with codecs.open(result_file, 'r', 'utf8') as in_f:
        for line in in_f:
            # The item id is the first tab-separated field of every record.
            ad_id = line.split('\t')[0]
            try:
                processed.add(int(ad_id))
            except ValueError:
                # Blank or damaged line (for example one left behind by an
                # interrupted run): ignore it instead of aborting, so the
                # affected item is simply processed again.
                continue
    print('already processed %d records, picking up where we stopped...' % len(processed))
    result = codecs.open(result_file, 'a', 'utf8')
else:
    result = codecs.open(result_file, 'w', 'utf8')

# Keep only the rows that still need to be looked up.
df = df_train_info[~df_train_info.itemID.isin(processed)]
geo_triples = zip(df.itemID, df.lat, df.lon)

print('processing...')

# URL template of the reverse-lookup service; filled with (lat, lon).
nominatim = 'http://172.17.0.2:8080/reverse?format=json&lat=%0.5f&lon=%0.5f'
def reverse(lat, lon):
    """Look up the address components for a latitude/longitude pair.

    Requests the URL built from the module-level ``nominatim`` template,
    parses the response body as JSON and reads its ``'address'`` object.

    Args:
        lat: latitude, formatted into the URL with five decimal places.
        lon: longitude, formatted into the URL with five decimal places.

    Returns:
        A ``(state, city, postcode)`` tuple.  ``state`` and ``postcode`` are
        ``None`` when absent from the address; ``city`` falls back to
        ``state`` when the address has no ``'city'`` entry.

    Raises:
        KeyError: if the decoded response has no ``'address'`` key.
        Errors raised by ``urlopen`` or ``json.loads`` are not caught here.
    """
    response = urlopen(nominatim % (lat, lon))
    try:
        payload = response.read()
    finally:
        # Release the connection explicitly; this function is called once
        # per item, so handles must not be left for the garbage collector.
        response.close()

    address = json.loads(payload)['address']
    state = address.get('state')
    # Not every address has a city; use the state name in that case.
    city = address.get('city', state)
    return state, city, address.get('postcode')
def try_process(id, lat, lon):
    """Reverse-geocode one item and append its record to the result file.

    Calls ``reverse(lat, lon)`` and writes one line of the form
    ``id<TAB>state<TAB>city<TAB>postcode`` to the module-level ``result``
    stream, then flushes it.  Missing values are written as ``None``.

    Args:
        id: item identifier written as the first field (the name shadows the
            builtin but is kept because it is part of the signature).
        lat: latitude passed to ``reverse``.
        lon: longitude passed to ``reverse``.

    Raises:
        Whatever ``reverse`` or the write to ``result`` raises; nothing is
        caught here.
    """
    state, city, postcode = reverse(lat, lon)
    # Format the whole record as unicode and emit it with a single write.
    # Converting the postcode with str() failed on non-ASCII values after the
    # first fields had already been written, leaving a partial line without a
    # newline that the next record was then appended to.
    record = u'%s\t%s\t%s\t%s\n' % (id, state, city, postcode)
    result.write(record)
    # Flush per record so an interrupted run can resume from the file.
    result.flush()
# Process every pending (itemID, lat, lon) triple.  A failure on one item is
# reported and skipped so the rest of the run continues; the item is not
# written to the result file.
try:
    for item_id, lat, lon in tqdm(geo_triples):
        try:
            try_process(item_id, lat, lon)
        except Exception:
            # Catch Exception rather than everything: a bare ``except`` also
            # swallowed KeyboardInterrupt/SystemExit, making the script
            # impossible to stop with Ctrl-C.
            print('error processing %d, lat=%0.5f&lon=%0.5f' % (item_id, lat, lon))
    print('done')
finally:
    # Close the result file even when the run is interrupted.
    result.close()