-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcountByMonthParking.py
94 lines (79 loc) · 3.64 KB
/
countByMonthParking.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!usr/bin/python
import pandas as pd
import re
import pdb
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
addressByTicketCounts = pd.read_csv('./address_by_ticket_counts_2007_2008.csv')
parkingData = pd.read_csv('./parking2007_2008.csv')
# Get geolocation for the highest ranked 100 addresses by ticket number
numOfTopAddress = 100
topAddresses = addressByTicketCounts.address[:numOfTopAddress]
topCounts = addressByTicketCounts['count'][:numOfTopAddress]
topAddressMax = addressByTicketCounts['max'][:numOfTopAddress]
for ind,address in topAddresses.iteritems():
datesThisAddress = parkingData.loc[parkingData.address==address][['Date','Amount Due']]
datesThisAddress['Date'] = pd.to_datetime(datesThisAddress['Date'])
datesThisAddress.set_index('Date', inplace=True)
amountDueEachMonth = datesThisAddress.groupby(pd.TimeGrouper(freq='M')).agg(['sum','count']) #group by month and aggregate on sum of ticket amount due and number of tickets
#pdb.set_trace()
# Convert each address's stats into columns
statsThisAddress = amountDueEachMonth.T.unstack(level=-1).rename(index={'Amount Due': address}) #The resulting dataframe has this address as index and multi-level column first level being date (by month), second level being 'sum' and 'count'
statsThisAddress.columns = ['_'.join([col[0].strftime('%b'),col[1]]) for col in statsThisAddress.columns] #convert multi-level columns into single level
statsThisAddress = statsThisAddress.reset_index().rename(columns={'index': 'address'}) #put the address in index into a column called 'address'
if ind == 0:
outputDf = statsThisAddress
else:
outputDf = pd.concat([outputDf, statsThisAddress], ignore_index=True)
#pdb.set_trace()
#countByDate = datesThisAddress.value_counts().sort_index(ascending=True)
#datesToPlot = countByDate.index
#countsToPlot = countByDate.values
#datesToPlot = matplotlib.dates.date2num(datetimes)
#plt.clf()
#print 'plotting figure for {}'.format(address)
#trendLinePlot = plt.plot(datesToPlot, countsToPlot, 'k.-')
#plt.ylabel('Tickets per day')
#plt.savefig('./figures/{}.png'.format(address))
outputDf.to_csv('./ticket_counts_n_total_fine_by_month_2007_2008.csv')
'''
batchSize = 200
numBatch = int(np.ceil(len(ticketAmountByLocation)/float(batchSize)))
outFile = './parking_aggregate_2007_2008_geocoded.csv'
for batch in range(numBatch):
thisBatch = ticketAmountByLocation[batch*batchSize: (batch+1)*batchSize]
addressCol = pd.Series(index=thisBatch.index, dtype=object)
latitudeCol = pd.Series(index=thisBatch.index, dtype=float)
longitudeCol = pd.Series(index=thisBatch.index, dtype=float)
print 'processing batch {}'.format(batch)
for ind, location in thisBatch.Location.iteritems():
streetPattern = re.compile('[A-Z0-9]+(?=\sAVE|\sST)')
streetNames = re.findall(streetPattern, location)
if len(streetNames) >= 2:
address = ' & '.join(sorted(streetNames[:2]))
else:
address = location
addressCol[ind] = address
fullAddress = address + ', Eugene, OR'
print 'Geocoding {}'.format(fullAddress)
results = geolocator.geocode(fullAddress, exactly_one=True)
if (results != None):
address, (latitude, longitude) = results
latitudeCol[ind] = latitude
longitudeCol[ind] = longitude
else:
latitudeCol[ind] = np.NaN
longitudeCol[ind] = np.NaN
#pdb.set_trace()
thisBatch['address'] = addressCol.values
thisBatch['latitude'] = latitudeCol.values
thisBatch['longitude'] = longitudeCol.values
if batch == 0:
with open(outFile,'w') as f:
thisBatch.to_csv(f)
else:
with open(outFile,'a') as f:
thisBatch.to_csv(f, header=False)
time.sleep(60)
'''