Skip to content

Commit

Permalink
Adding experiment results
Browse files Browse the repository at this point in the history
  • Loading branch information
cscully-allison committed Dec 8, 2019
1 parent 5df1adc commit f9fa5cd
Show file tree
Hide file tree
Showing 6 changed files with 223 additions and 29 deletions.
11 changes: 11 additions & 0 deletions EDSL/experiments_edsl.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
Missing Value Test, Repeat Value Test, Outlier, Spatial Inconsistency, Logical Inconsistency, Spike Test
0.04310178756713867, 0.9675548076629639, 0.5453481674194336, 2.464052200317383, 0.9644520282745361, 1.6780734062194824
0.045937538146972656, 0.6281876564025879, 0.5570821762084961, 2.3783037662506104, 0.6530992984771729, 1.685499668121338
0.029288053512573242, 0.6271235942840576, 0.5480241775512695, 2.353997230529785, 0.6515278816223145, 1.6660075187683105
0.030195236206054688, 0.626065731048584, 0.5490946769714355, 2.4371140003204346, 0.6704967021942139, 1.6959545612335205
0.029053211212158203, 0.6258211135864258, 0.5637242794036865, 2.3955459594726562, 0.6635918617248535, 1.6819257736206055
0.02913188934326172, 0.623931884765625, 0.5477449893951416, 2.3734071254730225, 0.6677625179290771, 1.699209213256836
0.03390693664550781, 0.6345832347869873, 0.5562515258789062, 2.4408748149871826, 0.6964719295501709, 1.7642099857330322
0.02899956703186035, 0.6489384174346924, 0.6134247779846191, 2.4096388816833496, 0.6748590469360352, 1.7084054946899414
0.03548884391784668, 0.6452851295471191, 0.5539658069610596, 2.4438226222991943, 0.6763548851013184, 1.7239494323730469
0.028683900833129883, 0.6429176330566406, 0.5655772686004639, 2.450761318206787, 0.6836907863616943, 1.7262461185455322
20 changes: 12 additions & 8 deletions EDSL/mockup.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@

DS.flagcodes().are({"None":"OK", "Repeat Value":"Repeat Value", "Missing Value": "Missing", "Outlier": "Exceeds Range", "Spatial Inconsistency": "Incosistent (Spatial)", "Logical Inconsistency": "Inconsistent (Logical)", "Spike": "Spike"})



series_max = DS['Air temperature (2-meter) monitor_Maximum']
series_min = DS['Air temperature (2-meter) monitor_Minimum']
series_max_10 = DS['Air temperature (10-meter) monitor_Maximum']
Expand Down Expand Up @@ -87,18 +85,24 @@ def slope_test(value, i):


# Flag rules for series_max (the 'Air temperature (2-meter) monitor_Maximum' column).
# Each statement names a flag and attaches the check that raises it; the check
# functions (rv_test, range_test, ...) are defined earlier in this file.
# NOTE(review): -9999 is presumably the dataset's missing-reading sentinel -- confirm.
series_max.datapoint().flag('Missing Value').missingValueTest(-9999)

series_max.datapoint().flag("Repeat Value").when(rv_test)

series_max.datapoint().flag("Outlier").when(range_test)

series_max.datapoint().flag("Spatial Inconsistency").when(spatial_inconsistency)

series_max.datapoint().flag("Logical Inconsistency").when(logical_inconsistency)

series_max.datapoint().flag("Spike").when(slope_test)

series_min.datapoint().flag('Missing Value').missingValueTest(-9999)
series_min.datapoint().flag("Repeat Value").when(rv_test)
series_min.datapoint().flag("Outlier").when(range_test)
series_min.datapoint().flag("Spatial Inconsistency").when(spatial_inconsistency)
series_min.datapoint().flag("Logical Inconsistency").when(logical_inconsistency_min)
series_min.datapoint().flag("Spike").when(slope_test)

# series_min.datapoint().flag('Missing Value').missingValueTest(-9999)
# series_min.datapoint().flag("Repeat Value").when(rv_test)
# series_min.datapoint().flag("Outlier").when(range_test)
# series_min.datapoint().flag("Spatial Inconsistency").when(spatial_inconsistency)
# series_min.datapoint().flag("Logical Inconsistency").when(logical_inconsistency_min)
# series_min.datapoint().flag("Spike").when(slope_test)



Expand Down
32 changes: 19 additions & 13 deletions EDSL/pandas_imp/Testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,23 +84,29 @@ def __init__ (self, testId = 1, **kwargs):
self.precision = parameter['Value']

self.timestep = np.timedelta64(self.time, self.precision)
self.timestep = pd.Timedelta(self.timestep)

def runTest(self, dataframe):
    """Return ``dataframe`` re-indexed onto a regular timestamp grid.

    The column named by ``self.column`` becomes the index, and the frame is
    re-indexed over every step of ``self.timestep`` between its first and
    last timestamp. Rows for timestamps absent from the input are inserted
    with NaN in every remaining column.

    Parameters:
        dataframe: pandas DataFrame containing the ``self.column`` column.
            NOTE(review): assumes that column holds sorted, unique
            timestamps -- ``reindex`` raises on duplicate index labels.

    Returns:
        A new DataFrame indexed by the full timestamp range. An input with
        no rows is returned (indexed by ``self.column``) without building a
        range.
    """
    indexed = dataframe.set_index(self.column)

    # With no rows there is no first/last timestamp to span; indexing
    # index[0] below would raise IndexError.
    if indexed.empty:
        return indexed

    full_range = pd.date_range(
        start=indexed.index[0],
        end=indexed.index[-1],
        freq=self.timestep,
    )

    # TODO(review): the previous revision left a commented-out step that
    # applied self.flag.flag(..., self.testName) to the result; no flags
    # are produced here yet.
    return indexed.reindex(full_range)

class SpikeTest(Test):
Expand Down
5 changes: 1 addition & 4 deletions EDSL/pandas_imp/driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
df['Time Stamp ((UTC-08:00) Pacific Time (US & Canada))'] = pd.to_datetime(df['Time Stamp ((UTC-08:00) Pacific Time (US & Canada))'])


for i in range(0,8):
for i in range(0,3):

timedeltas = []

Expand Down Expand Up @@ -59,7 +59,6 @@
testspec['Parameters'].append({'Name':'Min', 'Data Type': 'Float', 'Value': -20})
testspec['Column'] = 'TC_C_2_Avg degC Average'


rt = RangeTest(1, **testspec)


Expand All @@ -84,8 +83,6 @@
sit = SpatialInconsistencyTest(2, **testspec)
end = time.time()



print("\n Spatial Inconsistency Test ---------------------")

start = time.time()
Expand Down
13 changes: 13 additions & 0 deletions EDSL/pandas_imp/experiments.csv
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,16 @@ Spike Test, Basic Outlier Test, Spatial Inconsistency Test, Logical Inconsistenc
0.039907217025757,0.117334842681885,0.132239580154419,0.149530649185181,0.303349256515503,39.4851911067963
0.049628973007202,0.153321981430054,0.161677598953247,0.163922309875488,0.312518835067749,39.2585682868958
0.040935039520264,0.117730379104614,0.129293918609619,0.148436307907105,0.291978120803833,42.3061537742615

0.021251440048217773, 0.057301998138427734, 0.06179404258728027, 0.07192039489746094, 0.14961671829223633, 0.018226146697998047
0.02369070053100586, 0.06433486938476562, 0.06124520301818848, 0.08005118370056152, 0.14654231071472168, 0.01706838607788086
0.024653911590576172, 0.0613398551940918, 0.06501054763793945, 0.08289575576782227, 0.1437990665435791, 0.021692991256713867
0.03632616996765137, 0.06933832168579102, 0.06191253662109375, 0.07978487014770508, 0.1490628719329834, 0.02353668212890625
0.025202512741088867, 0.06309866905212402, 0.06527137756347656, 0.08081793785095215, 0.15391206741333008, 0.030036211013793945
0.019873857498168945, 0.0765230655670166, 0.05884146690368652, 0.0701761245727539, 0.1713271141052246, 0.01934194564819336
0.02305316925048828, 0.057373046875, 0.06122612953186035, 0.07098889350891113, 0.1558516025543213, 0.019112586975097656
0.019095182418823242, 0.058248043060302734, 0.06231188774108887, 0.07080888748168945, 0.13739943504333496, 0.01940321922302246

0.021418094635009766, 0.05525803565979004, 0.059787750244140625, 0.07110595703125, 0.14096474647521973, 0.019846200942993164
0.020044803619384766, 0.05419206619262695, 0.0565643310546875, 0.07096314430236816, 0.14044833183288574, 0.017404556274414062
0.019895315170288086, 0.05482983589172363, 0.06525945663452148, 0.07382369041442871, 0.13193225860595703, 0.019867897033691406
171 changes: 167 additions & 4 deletions EDSL/test.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,172 @@
from Materia import *
import time

# DS = Dataset("data/rockland.csv", numHeaderLines=9)
DS = Dataset("data/rockland.csv", numHeaderLines=9)
DS2 = Dataset("data/evergreen.csv")
DS3 = Dataset("data/bigelow_soilMTP_2017.csv", numHeaderLines=2)

# print(DS)
print(DS2)
print(DS3)
# NOTE(review): presumably builds the column headers from metadata rows 1-6
# of the file's header block -- confirm against Materia.
DS.genHeadersFromMetadataRows((1,6))

# Flag name -> code pairs registered on the dataset.
# NOTE(review): "Incosistent (Spatial)" is misspelled, but it is a runtime value
# that may already appear in produced data, so it is left unchanged here.
DS.flagcodes().are({"None":"OK", "Repeat Value":"Repeat Value", "Missing Value": "Missing", "Outlier": "Exceeds Range", "Spatial Inconsistency": "Incosistent (Spatial)", "Logical Inconsistency": "Inconsistent (Logical)", "Spike": "Spike", "Hardware Range": "Exceeds Hardware Range"})

# Series under test, selected by column header.
series_max = DS['Air temperature (2-meter) monitor_Maximum']
series_min = DS['Air temperature (2-meter) monitor_Minimum']
series_max_10 = DS['Air temperature (10-meter) monitor_Maximum']
series_avg_2 = DS['Air temperature (2-meter) monitor_Average']
series_avg_10 = DS['Air temperature (10-meter) monitor_Average']

# Each series' timestep is the difference between its own beginning(+1) and
# beginning(). series_avg_2 previously derived its timestep from series_min
# (copy-paste slip); it now uses its own values.
series_max.timestep(series_max.beginning(+1) - series_max.beginning())
series_min.timestep(series_min.beginning(+1) - series_min.beginning())
series_avg_2.timestep(series_avg_2.beginning(+1) - series_avg_2.beginning())

def rv_test(value):
    """Repeat-value check for a single datapoint.

    Returns the result of comparing ``value`` with ``value.prior(3)``, or
    None when ``value.isnan()`` is true.
    NOTE(review): what ``prior(3)`` yields and how ``==`` behaves on it is
    defined by the datapoint type, not here -- confirm against Materia.
    """
    lookback = 3
    if value.isnan():
        return None
    return value == value.prior(lookback)

def range_test(value):
    """Outlier check: truthy when ``value`` is below -20 or above 20.

    Both bounds are exclusive, so exactly -20 and 20 are in range.
    """
    below_floor = value < -20
    if below_floor:
        return below_floor
    return value > 20

def hardware_range_test(value):
    """Hardware-range check: truthy when ``value`` is below -100 or above 100.

    Both bounds are exclusive, so exactly -100 and 100 are in range.
    """
    below_floor = value < -100
    if below_floor:
        return below_floor
    return value > 100

def spatial_inconsistency(value, i):
    """Spatial check of ``value`` against the 10-meter maximum series.

    Looks up a comparison reading with ``series_max_10.value().at(value)``
    and returns True when it lies outside ``value +/- abs(value * 2)``,
    otherwise False. ``i`` is accepted but not used.
    NOTE(review): ``.at(value)`` presumably returns the 10-meter reading
    aligned with ``value`` -- confirm against Materia.
    """
    reference = series_max_10.value().at(value)
    tolerance = abs(value * 2)

    if reference > (value + tolerance):
        return True
    if reference < (value - tolerance):
        return True
    return False

def avg_spatial_inconsistency(value, i):
    """Spatial check of ``value`` against the 10-meter average series.

    Looks up a comparison reading with ``series_avg_10.value().at(value)``
    and returns True when the two readings differ by more than 2 percent
    of their mean, otherwise False. ``i`` is accepted but not used.

    The earlier version overwrote ``threshold`` with the boolean result of
    ``percent_difference < threshold`` and then used that boolean as a
    numeric band, so the percent difference never decided the outcome.

    NOTE(review): ``.at(value)`` presumably returns the 10-meter reading
    aligned with ``value`` -- confirm against Materia.
    """
    threshold = 2  # maximum tolerated difference, in percent of the mean

    comp_val = series_avg_10.value().at(value)
    diff = value - comp_val
    avg = (value + comp_val) / 2

    # A zero mean makes the percent difference undefined (division by zero):
    # identical readings agree, anything else is treated as inconsistent.
    if avg == 0:
        return bool(diff != 0)

    percent_difference = abs(diff / avg) * 100.0
    return bool(percent_difference > threshold)

def logical_inconsistency_min(min_value):
    """Logical check for a minimum reading.

    Looks up a reading with ``series_max.value().at(min_value)`` and
    returns True when ``min_value`` is greater than it, otherwise False.
    NOTE(review): ``.at(...)`` presumably returns the maximum reading
    aligned with ``min_value`` -- confirm against Materia.
    """
    paired_max = series_max.value().at(min_value)
    return True if min_value > paired_max else False

# Logical-consistency check for a maximum reading,
# e.g. the recorded maximum is lower than the recorded minimum.
def logical_inconsistency(max_value):
    """Return True when the paired minimum exceeds ``max_value``, else False.

    The minimum is looked up with ``series_min.value().at(max_value)``.
    NOTE(review): ``.at(...)`` presumably returns the minimum reading
    aligned with ``max_value`` -- confirm against Materia.
    """
    paired_min = series_min.value().at(max_value)
    return True if paired_min > max_value else False

# compares slopes between values
# Spike check for one datapoint: computes the slope between the two datapoints
# returned by value.prior(2) and the slope from the second of those to `value`,
# then returns True when the newer slope passes the magnitude checks below and
# False otherwise. `i` is accepted but not used in the body.
# NOTE(review): indentation is missing in this view, so where the
# `if p_a.isScalar() ...` body ends cannot be determined here. If `nextslp`
# is assigned only inside that body, the final condition needs the same guard.
def slope_test(value, i):
p_a = value.prior(2)[0]
p_b = value.prior(2)[1]
p_c = value

# stays 0.0 unless both prior points are scalar
priorslp = 0.0

if p_a.isScalar() and p_b.isScalar():
# x values: the datapoints themselves
x1 = p_a
x2 = p_b

# y values: integer positions from intIndex()
y1 = p_a.intIndex()
y2 = p_b.intIndex()

# slope between the two prior points (change in index per change in value)
priorslp = (y2-y1)/(x2-x1)

# x and y values for the current point
x3 = p_c
y3 = p_c.intIndex()

# slope from the second prior point to the current point
nextslp = (y3-y2)/(x3-x2)

if (abs(nextslp) < .1 #slope is index-over-value, so a small magnitude means a large value change per step
and abs(nextslp) < abs(priorslp*.01) #newer slope is under 1% of the prior slope's magnitude
and abs(nextslp) != float("inf") #infinite slope excluded; NOTE(review): presumably arises when consecutive values are equal
and abs(priorslp) != float("inf")): #same exclusion for the prior slope
return True

return False

# Benchmark driver: runs the six flagging tests 10 times. Each run times every
# test with time.time(), prints the elapsed seconds, and appends one
# comma-separated row of the six timings to experiments_edsl.csv.
for i in range(10):

    # Elapsed seconds per test for this run, in execution order:
    # missing value, repeat value, outlier, spatial, logical, spike.
    timedeltas = []

    print("\n Missing Value Test ---------------------")
    start = time.time()
    series_avg_2.datapoint().flag('Missing Value').missingValueTest(-9999)
    end = time.time()

    print(end-start)

    timedeltas.append(end-start)

    print("\n Repeat Value Test ---------------------")
    start = time.time()
    series_avg_2.datapoint().flag("Repeat Value").when(rv_test)
    end = time.time()

    print(end-start)

    timedeltas.append(end-start)

    print("\n Outlier Inconsistency ---------------------")
    start = time.time()
    series_avg_2.datapoint().flag("Outlier").when(range_test)
    end = time.time()

    print(end-start)

    timedeltas.append(end-start)

    print("\n Spatial Inconsistency ---------------------")
    start = time.time()
    series_avg_2.datapoint().flag("Spatial Inconsistency").when(avg_spatial_inconsistency)
    end = time.time()

    print(end-start)

    timedeltas.append(end-start)

    # The logical check compares max against min, so it runs on series_max
    # rather than on the average series used by the other tests.
    print("\n Logical Inconsistency ---------------------")
    start = time.time()
    series_max.datapoint().flag("Logical Inconsistency").when(logical_inconsistency)
    end = time.time()

    print(end-start)

    timedeltas.append(end-start)

    print("\n Spike Test ---------------------")
    start = time.time()
    series_avg_2.datapoint().flag("Spike").when(slope_test)
    end = time.time()

    print(end-start)

    timedeltas.append(end-start)

    # The row starts with a newline (the file is opened in append mode) and
    # carries no trailing separator. str.join replaces the previous
    # `i is len(...)-1` last-element test, which relied on integer identity
    # and reused the outer loop variable.
    with open('experiments_edsl.csv', 'a') as f:
        f.write('\n' + ', '.join(str(d) for d in timedeltas))

0 comments on commit f9fa5cd

Please sign in to comment.