diff --git a/EDSL/experiments_edsl.csv b/EDSL/experiments_edsl.csv new file mode 100644 index 0000000..67eda9a --- /dev/null +++ b/EDSL/experiments_edsl.csv @@ -0,0 +1,11 @@ +Missing Value Test, Repeat Value Test, Outlier, Spatial Inconsistency, Logical Inconsistency, Spike Test +0.04310178756713867, 0.9675548076629639, 0.5453481674194336, 2.464052200317383, 0.9644520282745361, 1.6780734062194824 +0.045937538146972656, 0.6281876564025879, 0.5570821762084961, 2.3783037662506104, 0.6530992984771729, 1.685499668121338 +0.029288053512573242, 0.6271235942840576, 0.5480241775512695, 2.353997230529785, 0.6515278816223145, 1.6660075187683105 +0.030195236206054688, 0.626065731048584, 0.5490946769714355, 2.4371140003204346, 0.6704967021942139, 1.6959545612335205 +0.029053211212158203, 0.6258211135864258, 0.5637242794036865, 2.3955459594726562, 0.6635918617248535, 1.6819257736206055 +0.02913188934326172, 0.623931884765625, 0.5477449893951416, 2.3734071254730225, 0.6677625179290771, 1.699209213256836 +0.03390693664550781, 0.6345832347869873, 0.5562515258789062, 2.4408748149871826, 0.6964719295501709, 1.7642099857330322 +0.02899956703186035, 0.6489384174346924, 0.6134247779846191, 2.4096388816833496, 0.6748590469360352, 1.7084054946899414 +0.03548884391784668, 0.6452851295471191, 0.5539658069610596, 2.4438226222991943, 0.6763548851013184, 1.7239494323730469 +0.028683900833129883, 0.6429176330566406, 0.5655772686004639, 2.450761318206787, 0.6836907863616943, 1.7262461185455322 diff --git a/EDSL/mockup.py b/EDSL/mockup.py index 7d51dfd..cce77be 100644 --- a/EDSL/mockup.py +++ b/EDSL/mockup.py @@ -8,8 +8,6 @@ DS.flagcodes().are({"None":"OK", "Repeat Value":"Repeat Value", "Missing Value": "Missing", "Outlier": "Exceeds Range", "Spatial Inconsistency": "Incosistent (Spatial)", "Logical Inconsistency": "Inconsistent (Logical)", "Spike": "Spike"}) - - series_max = DS['Air temperature (2-meter) monitor_Maximum'] series_min = DS['Air temperature (2-meter) monitor_Minimum']
series_max_10 = DS['Air temperature (10-meter) monitor_Maximum'] @@ -87,18 +85,24 @@ def slope_test(value, i): series_max.datapoint().flag('Missing Value').missingValueTest(-9999) + series_max.datapoint().flag("Repeat Value").when(rv_test) + series_max.datapoint().flag("Outlier").when(range_test) + series_max.datapoint().flag("Spatial Inconsistency").when(spatial_inconsistency) + series_max.datapoint().flag("Logical Inconsistency").when(logical_inconsistency) + series_max.datapoint().flag("Spike").when(slope_test) -series_min.datapoint().flag('Missing Value').missingValueTest(-9999) -series_min.datapoint().flag("Repeat Value").when(rv_test) -series_min.datapoint().flag("Outlier").when(range_test) -series_min.datapoint().flag("Spatial Inconsistency").when(spatial_inconsistency) -series_min.datapoint().flag("Logical Inconsistency").when(logical_inconsistency_min) -series_min.datapoint().flag("Spike").when(slope_test) + +# series_min.datapoint().flag('Missing Value').missingValueTest(-9999) +# series_min.datapoint().flag("Repeat Value").when(rv_test) +# series_min.datapoint().flag("Outlier").when(range_test) +# series_min.datapoint().flag("Spatial Inconsistency").when(spatial_inconsistency) +# series_min.datapoint().flag("Logical Inconsistency").when(logical_inconsistency_min) +# series_min.datapoint().flag("Spike").when(slope_test) diff --git a/EDSL/pandas_imp/Testing.py b/EDSL/pandas_imp/Testing.py index 9f2bf3c..7b17397 100644 --- a/EDSL/pandas_imp/Testing.py +++ b/EDSL/pandas_imp/Testing.py @@ -84,23 +84,29 @@ def __init__ (self, testId = 1, **kwargs): self.precision = parameter['Value'] self.timestep = np.timedelta64(self.time, self.precision) + self.timestep = pd.Timedelta(self.timestep) # data must be a float list # returns a set of boolean flags def runTest (self, dataframe): - newts = [] - - for i, val in enumerate(dataframe[self.column]): - if i < len(dataframe[self.column])-1: - first = dataframe[self.column][i] - next = dataframe[self.column][i+1] - if 
(np.timedelta64(next-first) > self.timestep): - tempts = first + self.timestep - while not( tempts == next): - newts.append(tempts) - tempts = tempts + self.timestep - - return newts + # newts = [] + # + # for i, val in enumerate(dataframe[self.column]): + # if i < len(dataframe[self.column])-1: + # first = dataframe[self.column][i] + # next = dataframe[self.column][i+1] + # if (np.timedelta64(next-first) > self.timestep): + # tempts = first + self.timestep + # while not( tempts == next): + # newts.append(tempts) + # tempts = tempts + self.timestep + # + # return newts + + tmp = dataframe.set_index(self.column) + tmp = tmp.reindex(pd.date_range(start=tmp.index[0], end=tmp.index[-1], freq=self.timestep)) + return tmp + # return outdf.apply(lambda x: self.flag.flag(x, self.testName)) class SpikeTest(Test): diff --git a/EDSL/pandas_imp/driver.py b/EDSL/pandas_imp/driver.py index 179c51d..867c320 100644 --- a/EDSL/pandas_imp/driver.py +++ b/EDSL/pandas_imp/driver.py @@ -25,7 +25,7 @@ df['Time Stamp ((UTC-08:00) Pacific Time (US & Canada))'] = pd.to_datetime(df['Time Stamp ((UTC-08:00) Pacific Time (US & Canada))']) -for i in range(0,8): +for i in range(0,3): timedeltas = [] @@ -59,7 +59,6 @@ testspec['Parameters'].append({'Name':'Min', 'Data Type': 'Float', 'Value': -20}) testspec['Column'] = 'TC_C_2_Avg degC Average' - rt = RangeTest(1, **testspec) @@ -84,8 +83,6 @@ sit = SpatialInconsistencyTest(2, **testspec) end = time.time() - - print("\n Spatial Inconsistency Test ---------------------") start = time.time() diff --git a/EDSL/pandas_imp/experiments.csv b/EDSL/pandas_imp/experiments.csv index 85f86d7..c536570 100644 --- a/EDSL/pandas_imp/experiments.csv +++ b/EDSL/pandas_imp/experiments.csv @@ -9,3 +9,16 @@ Spike Test, Basic Outlier Test, Spatial Inconsistency Test, Logical Inconsistenc 0.039907217025757,0.117334842681885,0.132239580154419,0.149530649185181,0.303349256515503,39.4851911067963 
0.049628973007202,0.153321981430054,0.161677598953247,0.163922309875488,0.312518835067749,39.2585682868958 0.040935039520264,0.117730379104614,0.129293918609619,0.148436307907105,0.291978120803833,42.3061537742615 + +0.021251440048217773, 0.057301998138427734, 0.06179404258728027, 0.07192039489746094, 0.14961671829223633, 0.018226146697998047 +0.02369070053100586, 0.06433486938476562, 0.06124520301818848, 0.08005118370056152, 0.14654231071472168, 0.01706838607788086 +0.024653911590576172, 0.0613398551940918, 0.06501054763793945, 0.08289575576782227, 0.1437990665435791, 0.021692991256713867 +0.03632616996765137, 0.06933832168579102, 0.06191253662109375, 0.07978487014770508, 0.1490628719329834, 0.02353668212890625 +0.025202512741088867, 0.06309866905212402, 0.06527137756347656, 0.08081793785095215, 0.15391206741333008, 0.030036211013793945 +0.019873857498168945, 0.0765230655670166, 0.05884146690368652, 0.0701761245727539, 0.1713271141052246, 0.01934194564819336 +0.02305316925048828, 0.057373046875, 0.06122612953186035, 0.07098889350891113, 0.1558516025543213, 0.019112586975097656 +0.019095182418823242, 0.058248043060302734, 0.06231188774108887, 0.07080888748168945, 0.13739943504333496, 0.01940321922302246 + +0.021418094635009766, 0.05525803565979004, 0.059787750244140625, 0.07110595703125, 0.14096474647521973, 0.019846200942993164 +0.020044803619384766, 0.05419206619262695, 0.0565643310546875, 0.07096314430236816, 0.14044833183288574, 0.017404556274414062 +0.019895315170288086, 0.05482983589172363, 0.06525945663452148, 0.07382369041442871, 0.13193225860595703, 0.019867897033691406 \ No newline at end of file diff --git a/EDSL/test.py b/EDSL/test.py index 61322f2..bd7ff1c 100644 --- a/EDSL/test.py +++ b/EDSL/test.py @@ -1,9 +1,172 @@ from Materia import * +import time -# DS = Dataset("data/rockland.csv", numHeaderLines=9) +DS = Dataset("data/rockland.csv", numHeaderLines=9) DS2 = Dataset("data/evergreen.csv") DS3 = Dataset("data/bigelow_soilMTP_2017.csv", 
numHeaderLines=2) -# print(DS) -print(DS2) -print(DS3) +DS.genHeadersFromMetadataRows((1,6)) + +DS.flagcodes().are({"None":"OK", "Repeat Value":"Repeat Value", "Missing Value": "Missing", "Outlier": "Exceeds Range", "Spatial Inconsistency": "Incosistent (Spatial)", "Logical Inconsistency": "Inconsistent (Logical)", "Spike": "Spike", "Hardware Range": "Exceeds Hardware Range"}) + +series_max = DS['Air temperature (2-meter) monitor_Maximum'] +series_min = DS['Air temperature (2-meter) monitor_Minimum'] +series_max_10 = DS['Air temperature (10-meter) monitor_Maximum'] +series_avg_2 = DS['Air temperature (2-meter) monitor_Average'] +series_avg_10 = DS['Air temperature (10-meter) monitor_Average'] + +series_max.timestep((series_max.beginning(+1)) - series_max.beginning()) +series_min.timestep((series_min.beginning(+1)) - series_min.beginning()) +series_avg_2.timestep((series_avg_2.beginning(+1)) - series_avg_2.beginning()) # each series derives its step from its own first two points + +def rv_test(value): + n = 3 + if not value.isnan(): + return value == value.prior(n) + +def range_test(value): + return (value < -20 or value > 20) + +def hardware_range_test(value): + return (value < -100 or value > 100) + +def spatial_inconsistency(value, i): + comp_val = series_max_10.value().at(value) + threshold = abs(value * 2) + + if comp_val > (value + threshold) or comp_val < (value - threshold): + return True + + return False + +def avg_spatial_inconsistency(value, i): + comp_val = series_avg_10.value().at(value) + diff = value - comp_val + avg = (value + comp_val) / 2 + threshold = 2 + + threshold = abs(diff / avg) * 100.0 < threshold # NOTE(review): rebinds threshold to the result of a comparison, which the range check below then adds/subtracts as a tolerance -- confirm the intended rule + + if comp_val > (value + threshold) or comp_val < (value - threshold): + return True + + return False + +def logical_inconsistency_min(min_value): + max_value = series_max.value().at(min_value) + + if min_value > max_value: + return True + + return False + +# checks for logical inconsistency in DataSet +# eg.
max value is less than min value +def logical_inconsistency(max_value): + min_val = series_min.value().at(max_value) + + if min_val > max_value: + return True + + return False + +# compares slopes between values +def slope_test(value, i): + p_a = value.prior(2)[0] + p_b = value.prior(2)[1] + p_c = value + + priorslp = 0.0 + + if p_a.isScalar() and p_b.isScalar(): + # x values + x1 = p_a + x2 = p_b + + # y values + y1 = p_a.intIndex() + y2 = p_b.intIndex() + + # current slope + priorslp = (y2-y1)/(x2-x1) + + # x ad y values for next point + x3 = p_c + y3 = p_c.intIndex() + + # next slope + nextslp = (y3-y2)/(x3-x2) + + if (abs(nextslp) < .1 #very sharp slope + and abs(nextslp) < abs(priorslp*.01) #big difference between the two slopes + and abs(nextslp) != float("inf") #slope is not a flat line + and abs(priorslp) != float("inf")): #slope is not a flat line + return True + + return False + +for i in range(10): + + timedeltas = [] + + print("\n Missing Value Test ---------------------") + start = time.time() + series_avg_2.datapoint().flag('Missing Value').missingValueTest(-9999) + end = time.time() + + print(end-start) + + timedeltas.append(end-start) + + print("\n Repeat Value Test ---------------------") + start = time.time() + series_avg_2.datapoint().flag("Repeat Value").when(rv_test) + end = time.time() + + print(end-start) + + timedeltas.append(end-start) + + print("\n Outlier Inconsistency ---------------------") + start = time.time() + series_avg_2.datapoint().flag("Outlier").when(range_test) + end = time.time() + + print(end-start) + + timedeltas.append(end-start) + + print("\n Spatial Inconsistency ---------------------") + start = time.time() + series_avg_2.datapoint().flag("Spatial Inconsistency").when(avg_spatial_inconsistency) + end = time.time() + + print(end-start) + + timedeltas.append(end-start) + + print("\n Logical Inconsistency ---------------------") + start = time.time() + series_max.datapoint().flag("Logical 
Inconsistency").when(logical_inconsistency) + end = time.time() + + print(end-start) + + timedeltas.append(end-start) + + print("\n Spike Test ---------------------") + start = time.time() + series_avg_2.datapoint().flag("Spike").when(slope_test) + end = time.time() + + print(end-start) + + timedeltas.append(end-start) + + with open('experiments_edsl.csv', 'a') as f: + f.write('\n') + for idx, d in enumerate(timedeltas): + if idx == len(timedeltas)-1: # last value of the row: no trailing separator + f.write(str(d)) + else: + f.write(str(d)+', ')