forked from nytimes/covid-19-data
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcovid-19-counties.py
265 lines (219 loc) · 9.25 KB
/
covid-19-counties.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
""" covid-19-co unties.py
workon covid
get latest data by running
git pull
from https://github.com/nytimes/covid-19-data.git
which updates csv files from the New York Times
us-counties.csv:
date,county,state,fips,cases,deaths
New York City does not use a fips
us-states.csv:
date,state,fips,cases,deaths
us.csv:
date,cases,deaths
pandas quickstart from
https://www.fullstackpython.com/blog/learn-pandas-basic-commands-explore-covid-19-data.html
"""
import os
from datetime import datetime
from math import ceil
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
if __name__ == '__main__':
    # docopt is only needed when this file is run as a script (it parses the
    # command line), so the import is skipped when the module is imported.
    try:
        import docopt
    except ModuleNotFoundError:
        # Stop right away with a readable message.  Carrying on would only
        # fail later with a NameError when the command line gets parsed.
        raise SystemExit(
            f'docopt import failed for {__name__}. '
            'Use an environment with docopt installed, please.') from None
# Command-line interface, parsed by docopt.  docopt is layout sensitive: the
# patterns under "Usage:" must be indented, and every option must be separated
# from its description by at least two spaces, otherwise the description words
# and the [default: ...] values are not recognised.
USAGE_TEXT = """
Usage:
  covid-19-counties.py [--counties=<C>] [--debug=<D>] [--get] [--lines] [--log] [--multi=<M>] [--norm] [--state=<S>]
  covid-19-counties.py -h | --help
  covid-19-counties.py -v | --version

Options:
  -c --counties <C>  Choose counties, comma sep, use quotes around whole thing [default: Santa Clara, Alameda, San Mateo, San Francisco, Contra Costa, Marin, Sonoma, Napa]
  -d --debug <D>     print opts,
  -g --get           Get current NY Times data.
  -h --help          Show this screen.
  -l --lines         Show the data in tabular form.
  -o --log           Use log Y scale
  -m --multi <M>     Multi all on one plot. c=cases, d=deaths. lower case for new, upper for totals.
  -n --norm          Plot normalized to population
  -s --state <S>     State [default: California]
  -v --version       show the version.
"""
def get_data():
    """Placeholder for refreshing the data files.

    Nothing is downloaded here; the function only prints a reminder to
    update the local checkout with git instead.
    """
    reminder = ('Try', 'git fetch upstream', 'instead, Bob', '')
    print(*reminder, sep='\n')
def show_county_stats(opts, df, county):
    """Print the most recent cumulative case and death figures for one county.

    opts   -- parsed command-line options; only '--norm' is read.  When it is
              set the figures are printed with two decimals and a % sign,
              otherwise in an 8-wide field with thousands separators.
    df     -- frame with 'County', 'total cases' and 'total deaths' columns.
    county -- county name; surrounding whitespace is ignored.

    The figures come from the last row of df that belongs to the county, so
    the caller is expected to pass rows in date order.  When the county has
    no rows at all a short notice is printed instead of raising IndexError.
    """
    name = county.strip()
    rows = df[df['County'] == name]
    if rows.empty:
        # A mistyped county, or one whose rows were all filtered out.
        print(f'no data for {name}')
        return
    latest = rows.iloc[-1]
    if opts['--norm']:
        print(f'total cases: {latest["total cases"]:0.2f} %')
        print(f'total deaths: {latest["total deaths"]:0.2f} %')
    else:
        print(f'total cases: {latest["total cases"]:8,}')
        print(f'total deaths: {latest["total deaths"]:8,}')
def display_one_county(opts, df, county):
    """Print a short report for a single county.

    opts   -- parsed command-line options; '--lines' is read here and the
              whole mapping is passed on to show_county_stats().
    df     -- frame with at least 'County' and 'fips' columns.
    county -- county name, compared exactly against the 'County' column.

    Prints the county name and the fips of its last row, followed by the
    case/death stats when '--lines' is set.  Nothing is plotted here.  When
    the county has no rows a notice is printed instead of raising IndexError.
    """
    # select a single county
    rows = df[df['County'] == county]
    if rows.empty:
        print(f'\n{county}: no data')
        return
    print(f'\n{county} {rows.iloc[-1]["fips"]:8,}')
    if opts['--lines']:
        show_county_stats(opts, rows, county)
def plot_multi_counties(opts, df, counties, pops):
    """Print stats for every selected county, then plot them all on one graph.

    opts     -- parsed command-line options; '--norm' and '--log' are read
                here, '--multi' is read through choose_series().
    df       -- frame already filtered to the requested state, with 'Date',
                'County', 'fips' and the daily/total cases/deaths columns.
    counties -- list of county names to show.
    pops     -- accepted for the caller's convenience; not used here.

    One line is drawn per county.  Each legend entry starts with the value
    of the plotted series in that county's last row.  Blocks in plt.show().
    """
    for county in counties:  # show the stats
        print(f'\n{county}')
        show_county_stats(opts, df, county)

    # we've already filtered out all but the requested State
    selected = df[df['County'].isin(counties)]  # keep only the counties I care about
    selected = selected[['Date', 'County', 'daily cases', 'daily deaths',
                         'total cases', 'total deaths', 'fips']]
    y_col = choose_series(opts)
    norm_str = ' (% of pop)' if opts['--norm'] else ' (count)'
    title = f'{y_col}{norm_str}'

    fig, ax = plt.subplots(figsize=(8, 5))
    # Group by the column name rather than a one-element list: with a list,
    # pandas 2.x hands back 1-tuples as keys and the label concatenation
    # below would raise TypeError.
    for key, grp in selected.groupby('County'):
        last_value = grp.iloc[-1][y_col]
        if opts['--norm']:
            y_str = f'{last_value:0.3f} '
        else:
            y_str = f'{int(last_value):8,} '
        ax.plot(grp['Date'], grp[y_col], label=y_str+key, linewidth=7)
    ax.legend()  # show the legend

    if opts['--log']:
        ax.set_yscale('log')
        plt.ylim(bottom=5)
    else:
        plt.autoscale()
    ax.set_title(title)
    # Weekly minor ticks only.  Month-based major ticks give nicer x labels
    # but lose the cursor resolution, so they are not set.
    ax.xaxis.set_minor_locator(mdates.WeekdayLocator(byweekday=mdates.MO))
    plt.grid(which='major', axis='both')  # show both major axis
    plt.grid(which='minor', axis='y', ls='dotted')  # show y minor as dotted
    ax.set_xlabel('https://github.com/BobBaylor/covid-19-data')
    fig.autofmt_xdate()  # rotate, right align, and leave room for date labels
    plt.show()
def choose_series(opts):
    """Map the '--multi' option to the name of the column to plot.

    'c' and 'd' pick the daily cases/deaths columns, 'C' and 'D' the totals.
    Any other value, including None, falls back to 'total cases'.
    """
    series_by_flag = {
        'c': 'daily cases',
        'd': 'daily deaths',
        'C': 'total cases',
        'D': 'total deaths',
    }
    flag = opts['--multi']
    if flag in series_by_flag:
        return series_by_flag[flag]
    return 'total cases'
def get_county_list(opts, df, column):
    """Turn the '--counties' option into a list of county names.

    When the option parses as a non-zero integer N, the rows of df are ranked
    by `column` (highest first), reduced to one row per county, and the first
    N county names are returned.  Otherwise the option is treated as a
    comma-separated list of names, each stripped of surrounding whitespace.
    """
    requested = opts['--counties']
    try:
        how_many = int(requested)
    except ValueError:
        how_many = None

    if not how_many:
        return [name.strip() for name in requested.split(',')]

    # Highest value first; drop_duplicates keeps each county's top row only.
    ranked = df.sort_values(column, ascending=False).drop_duplicates('County')
    return ranked['County'].tolist()[:how_many]
def get_populations():
    """Load county populations from the census csv in the working directory.

    Reads 'co-est2019-alldata.csv' and returns a two-column frame:
    'fips' (STATE * 1000 + COUNTY, built so it can be matched against the
    fips in the covid data) and 'population' (the POPESTIMATE2019 column).
    """
    # SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010, ...
    # 050,4,9,06,085,California,Santa Clara County,1781642,1781686,1786040,1812054,...
    census = pd.read_csv('co-est2019-alldata.csv', encoding='ISO-8859-1')
    census = census.assign(
        population=census['POPESTIMATE2019'],
        state=census['STNAME'],
        County=census['CTYNAME'],
        fips=census['STATE'] * 1000 + census['COUNTY'],
    )
    # only the join key and the population are handed back
    return census[['fips', 'population']]
def test(opts):
    """Load the county data, derive the daily series, then print/plot it.

    opts -- parsed command-line options.  Read here: '--debug', '--get',
            '--state', '--norm' and '--multi'; the mapping is also passed on
            to the helpers, which read '--counties', '--lines' and '--log'.

    Reads 'us-counties.csv' from the working directory and the census file
    through get_populations().  Daily cases/deaths are the day-to-day change
    of the cumulative columns, computed separately for every (state, county)
    and smoothed with a 7-day rolling average that likewise never mixes rows
    from different counties.
    """
    print('*'*60, '\n'+'*'*60)
    if opts['--debug']:
        print(opts)
    DATA_FILE = 'us-counties.csv'
    if opts['--get']:
        get_data()
    df = pd.read_csv(DATA_FILE, encoding='ISO-8859-1')
    f_date = datetime.fromtimestamp(os.path.getmtime(DATA_FILE))
    print(f'Data were retrieved {f_date:%B %d, %Y at %H:%M %p}')
    # date,county,state,fips,cases,deaths
    # 2020-01-21,Snohomish,Washington,53061,1,0
    # except some entries have an empty fips field

    # make plotable date and rename the cases and deaths for plotting
    # I do it before selecting by county to avoid the SettingWithCopy warning
    # which is pandas saying 'you're modifying a copy, not the actual dataset'
    df['Date'] = pd.to_datetime(df.date, format='%Y-%m-%d')
    df['total deaths'] = df['deaths']
    df['total cases'] = df['cases']
    df['County'] = df['county']
    max_date = df['Date'].max()
    df.sort_values(by=['state', 'county', 'Date'], inplace=True)

    # Difference within each county.  A plain diff() over the sorted frame
    # would subtract the last row of the previous county from the first row
    # of the next one.
    per_county = df.groupby(['state', 'county'], sort=False)
    df['daily deaths'] = per_county['deaths'].diff().fillna(0).astype(int)
    df['daily cases'] = per_county['cases'].diff().fillna(0).astype(int)

    # NOTE(review): every row without a fips gets 36047 - presumably so the
    # New York City rows (which carry no fips) find a population in the
    # merge below.  Any other row with an empty fips gets it too - confirm.
    df['fips'] = df.fips.fillna(36047.0).astype(np.float64)
    print(f'Most recent date point is {max_date:%B %d, %Y at %H:%M %p}')
    # todo: pull in test counts (both pos and neg) From where?

    # only keep what I care about; copy so the assignments below modify this
    # frame rather than a view of the full one
    df = df[['state',
             'County',
             'Date',
             'total cases',
             'total deaths',
             'daily cases',
             'daily deaths',
             'fips',
             ]].copy()

    # smooth the daily data to a 7 day rolling average, one county at a time
    # so the window never spans two counties.  The first six days of every
    # county have no full window and come out as NaN.
    for col in ('daily cases', 'daily deaths'):
        df[col] = df.groupby(['state', 'County'], sort=False)[col].transform(
            lambda series: series.rolling(window=7).sum().divide(7.0))

    # keep only the requested state
    df = df[df['state'] == opts['--state']]
    pops = get_populations()
    df = pd.merge(df, pops, on='fips')

    # normalize by population, if requested (values become % of population)
    if opts['--norm']:
        for col in ['total cases', 'total deaths', 'daily cases', 'daily deaths']:
            df[col] = df[col].div(df['population']).mul(100.0)

    y_col = choose_series(opts)
    counties = get_county_list(opts, df, y_col)

    # filter out small numbers of cases in a day; rows whose average is NaN
    # fail either comparison and are dropped as well
    if opts['--norm']:
        df = df[df['daily cases'] >= 0]
    else:
        df = df[df['daily cases'] >= 2]

    plt.close('all')
    if opts['--multi']:
        plot_multi_counties(opts, df, counties, pops)
    else:
        for county in counties:
            display_one_county(opts, df, county)
if __name__ == '__main__':
    # Script entry point: parse the command line against USAGE_TEXT and hand
    # the resulting options mapping to test().
    opts = docopt.docopt(USAGE_TEXT, version='0.0.5')
    test(opts)