-
Notifications
You must be signed in to change notification settings - Fork 1
/
deduplicate_csv.py
127 lines (106 loc) · 4.33 KB
/
deduplicate_csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
"""
Command-line tool to deduplicate nearby latitude/longitude points in csv files
:usage:
$ python -m meter_util.deduplicate_csv -i <infile> -o <outfile>
"""
import argparse
import json
import sys
import numpy as np
import pandas as pd
import geopandas as gpd
from tqdm import tqdm
import networkx as nx
from shapely.geometry import Point
# Maximum haversine distance, in km, between two points for them to be
# treated as duplicates of each other
HAVERSINE_THRESHOLD = 0.01
def haversine(lat1, lon1, lat2, lon2):
    """Obtain the haversine distance for arrays of lat and lon.

    All four arguments are given in degrees and may be scalars, lists or
    numpy arrays; they are combined with the usual numpy broadcasting rules.

    Args:
        lat1 (arr): latitudes of first array
        lon1 (arr): longitudes of first array
        lat2 (arr): latitudes of second array
        lon2 (arr): longitudes of second array
    Returns:
        An array of haversine distances in km between each pair of lat&lon
        in the input arrays
    """
    earth_radius_km = 6371

    # np.deg2rad accepts any array-like, so plain lists work for every
    # argument (a bare `lat1 * np.pi` would fail on a list).
    lat1 = np.deg2rad(lat1)
    lon1 = np.deg2rad(lon1)
    lat2 = np.deg2rad(lat2)
    lon2 = np.deg2rad(lon2)

    half_dlat = (lat2 - lat1) / 2
    half_dlon = (lon2 - lon1) / 2
    d = np.sin(half_dlat)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(half_dlon)**2

    # Floating-point rounding can leave d marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would make arcsin return NaN.
    d = np.clip(d, 0.0, 1.0)
    return 2 * earth_radius_km * np.arcsin(np.sqrt(d))
def deduplicate_df(df):
    """Deduplicate dataframe of lat&lon points that are near each other.

    Rows are visited in order. Every row that has not yet been marked as a
    duplicate absorbs all rows lying within HAVERSINE_THRESHOLD km of it: its
    'Latitude' and 'Longitude' become the mean of the group, its 'Source'
    becomes the '-'-joined unique sources of the group, and the other rows of
    the group are removed from the result.

    Args:
        df: dataframe to deduplicate; must have 'Latitude', 'Longitude' and
            'Source' columns. It is left unmodified.
    Returns:
        A new deduplicated dataframe
    """
    # Work on snapshots so that the caller's frame is untouched and the
    # distance computations never see coordinates that were already averaged.
    lats = df['Latitude'].to_numpy(copy=True)
    lons = df['Longitude'].to_numpy(copy=True)
    result = df.copy()

    # Everything below is positional, so the function behaves the same
    # whatever index labels the dataframe carries.
    lat_col = result.columns.get_loc('Latitude')
    lon_col = result.columns.get_loc('Longitude')
    source_col = result.columns.get_loc('Source')

    dropped = set()
    for pos in range(len(df)):
        if pos in dropped:
            continue
        lat, lon = lats[pos], lons[pos]

        # Computing haversine distance over all boxes is slow
        # Only consider boxes within 0.1 lat/lon
        nearby = (np.isclose(lats, lat, rtol=0, atol=0.1)
                  & np.isclose(lons, lon, rtol=0, atol=0.1))
        candidates = np.flatnonzero(nearby)
        distances = haversine(lats[candidates], lons[candidates], [lat], [lon])

        # Only take points within HAVERSINE_THRESHOLD
        # NOTE: rows already marked as duplicates are not excluded here, so
        # such a row can contribute to more than one merged row.
        neighbors = candidates[distances < HAVERSINE_THRESHOLD]

        # Drop every neighbor other than the row being modified
        dropped.update(int(n) for n in neighbors if n != pos)

        if len(neighbors) > 1:
            result.iloc[pos, lat_col] = df['Latitude'].iloc[neighbors].mean()
            result.iloc[pos, lon_col] = df['Longitude'].iloc[neighbors].mean()
            result.iloc[pos, source_col] = '-'.join(
                df['Source'].iloc[neighbors].unique())
            # TODO: Modify Date Column ?

    print("Removed ", len(dropped), " from ", df.shape[0], " datapoints ")

    keep = np.ones(len(df), dtype=bool)
    keep[np.fromiter(dropped, dtype=np.intp, count=len(dropped))] = False
    return result[keep]
def main():
    """Run the command-line interface.

    Reads the csv given by -i/--infile, deduplicates its rows separately for
    each distinct value of the 'Type' column, and writes the concatenated
    result to the csv given by -o/--outfile. If either option is missing the
    help text is printed and the program exits.

    Raises:
        SystemExit: if a ValueError occurs while reading, deduplicating or
            writing the data.
    """
    prog = 'python -m meter_util.deduplicate_csv -i <infile> -o <outfile>'
    description = ('A simple command line interface for csv files '
                   'to deduplicate longtitude and latitude points within a range.')
    parser = argparse.ArgumentParser(prog=prog, description=description)
    parser.add_argument('-i', '--infile', nargs='?',
                        help='input file',
                        default=None)
    parser.add_argument('-o', '--outfile', nargs='?',
                        help='output file',
                        default=None)
    options = parser.parse_args()
    if not options.infile or not options.outfile:
        parser.print_help()
        sys.exit()

    try:
        df = pd.read_csv(options.infile)
        processed_df_list = []
        # NOTE: rows whose 'Type' is NaN never compare equal to any value
        # below, so they do not appear in the output.
        for source_type in df['Type'].unique():
            print("Deduplicating ", source_type)
            # Deduplicate df by specific source; drop=True renumbers the rows
            # from 0 without adding a helper 'index' column that would have
            # to be removed again afterwards.
            temp_df = df[df.Type == source_type].reset_index(drop=True)
            processed_df_list.append(deduplicate_df(temp_df))

        # Concat everything together and drop the index column left over
        # from a previously written csv. errors='ignore' keeps inputs that
        # never had that column from failing with a KeyError.
        main_df = pd.concat(processed_df_list).drop(
            columns=['Unnamed: 0'], errors='ignore')

        # Output csv
        main_df.to_csv(options.outfile)
    except ValueError as e:
        raise SystemExit(e)
# Script entry point: run the CLI and, if the output pipe is closed by the
# reader, exit with the error's errno instead of printing a traceback.
if __name__ == '__main__':
    try:
        main()
    except BrokenPipeError as pipe_error:
        raise SystemExit(pipe_error.errno)