-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathio.py
79 lines (58 loc) · 2.08 KB
/
io.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import pandas as pd
import numpy as np
##
##
##
def load_data(data_folder='/gh/data/flightdelay/', N_flights=None):
    """Load all airline, airport, and flight data.

    Parameters
    ----------
    data_folder : str
        Prefix that is prepended verbatim to each CSV file name
        ('airlines.csv', 'airports.csv', 'flights.csv'), so it should
        end with a path separator.
    N_flights : int or None
        Maximum number of rows to read from 'flights.csv'.
        None (default) reads the whole file.

    Returns
    -------
    df_al : pandas DataFrame
        Airlines data, indexed by the first column of the CSV.
    df_ap : pandas DataFrame
        Airports data, indexed by the first column of the CSV.
    df_fl : pandas DataFrame
        Flights data.
    """
    # `DataFrame.from_csv` was removed in pandas 1.0. `read_csv` with
    # `index_col=0, parse_dates=True` reproduces its defaults.
    df_al = pd.read_csv(data_folder + 'airlines.csv',
                        index_col=0, parse_dates=True)
    df_ap = pd.read_csv(data_folder + 'airports.csv',
                        index_col=0, parse_dates=True)

    # `nrows=None` means "read every row", so no branching is needed.
    df_fl = pd.read_csv(data_folder + 'flights.csv', nrows=N_flights)
    return df_al, df_ap, df_fl
def load_data_lines_and_ports(data_folder='/gh/data/flightdelay/'):
    """Load the airline and airport tables (no flight data).

    Parameters
    ----------
    data_folder : str
        Prefix that is prepended verbatim to 'airlines.csv' and
        'airports.csv', so it should end with a path separator.

    Returns
    -------
    df_al : pandas DataFrame
        Airlines data, indexed by the first column of the CSV.
    df_ap : pandas DataFrame
        Airports data, indexed by the first column of the CSV.
    """
    # `DataFrame.from_csv` was removed in pandas 1.0. `read_csv` with
    # `index_col=0, parse_dates=True` reproduces its defaults.
    df_al = pd.read_csv(data_folder + 'airlines.csv',
                        index_col=0, parse_dates=True)
    df_ap = pd.read_csv(data_folder + 'airports.csv',
                        index_col=0, parse_dates=True)
    return df_al, df_ap
def load_data_SAN(data_folder = '/gh/data/flightdelay/',
                  old_data = False,
                  drop_cancelled = True):
    """Return the flights whose origin airport is SAN.

    Parameters
    ----------
    data_folder : str
        Forwarded to `load_data` to locate the CSV files.
    old_data : bool
        If True, also match the 5-digit airport code (as the string
        '14679' and as the integer 14679) in addition to 'SAN'.
    drop_cancelled : bool
        If True, discard rows whose 'DEPARTURE_DELAY' is not finite;
        these are treated as cancelled flights.

    Returns
    -------
    pandas DataFrame
        The selected rows of the flights table.
    """
    # Only the flights table (third element) is needed here
    flights = load_data(data_folder = data_folder)[2]

    # Older data identifies the airport by its 5-digit code
    if old_data:
        origin_codes = ['SAN','14679',14679]
    else:
        origin_codes = ['SAN']
    san_flights = restrict_df(flights, {'ORIGIN_AIRPORT': origin_codes})

    if not drop_cancelled:
        return san_flights

    # A non-finite departure delay marks a cancelled flight
    has_delay = np.isfinite(san_flights['DEPARTURE_DELAY'])
    return san_flights[has_delay]
def restrict_df(df, restriction):
    """Restrict `df` to the rows allowed by every entry of `restriction`.

    Parameters
    ----------
    df : pandas DataFrame
        Table to filter. It is not modified.
    restriction : dict
        Maps a column name of `df` to a list of allowed values. A row is
        kept only if, for every key, its value in that column equals one
        of the listed values.

    Returns
    -------
    pandas DataFrame
        The matching rows in their original order. Each row appears at
        most once, even if a value is listed several times. An empty
        value list yields an empty frame, and an empty `restriction`
        returns `df` unchanged.
    """
    for column, allowed in restriction.items():
        # Start from "keep nothing" and OR in one equality test per value.
        # Plain `==` is used on purpose so that matching stays elementwise
        # equality, including for mixed-type value lists.
        keep = pd.Series(False, index=df.index, dtype=bool)
        for value in allowed:
            keep = keep | (df[column] == value)
        # A single boolean selection preserves row order and avoids the
        # per-value copies (and the empty-list failure) of `pd.concat`.
        df = df[keep]
    return df