add thresholdPool and comparing control to Learn Storage
yangjufo committed May 18, 2018
1 parent 7bc2a5c commit 86e7b90
Showing 2 changed files with 43 additions and 27 deletions.
68 changes: 42 additions & 26 deletions Learned_Storage.py
@@ -1,8 +1,8 @@
 import pandas as pd
-from Trained_NN import TrainedNN, ParameterPool, set_data_type
+from Trained_NN import TrainedNN, ParameterPool, set_data_type, AbstractNN
 from btree import BTree
 from data.create_data import create_data_hash, Distribution
-import time, json, math, getopt, sys
+import time, json, math, getopt, sys, gc
 
 STORE_NUMBER = 100001
 BLOCK_SIZE = 100
@@ -34,8 +34,18 @@
     Distribution.LOGNORMAL: "Lognormal"
 }
 
+thresholdPool = {
+    Distribution.RANDOM: [1, 1],
+    Distribution.EXPONENTIAL: [25, 10000]
+}
+
-def hybrid_training(threshold, stage_nums, core_nums, train_step_nums, batch_size_nums, learning_rate_nums,
+
+useThresholdPool = {
+    Distribution.RANDOM: [True, True],
+    Distribution.EXPONENTIAL: [True, False],
+}
+
+def hybrid_training(threshold, use_threshold, stage_nums, core_nums, train_step_nums, batch_size_nums, learning_rate_nums,
                     keep_ratio_nums, train_data_x, train_data_y, test_data_x, test_data_y):
     stage_length = len(stage_nums)
     col_num = stage_nums[1]
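
For orientation: both new pools are keyed by the Distribution enum and indexed by stage, so thresholdPool[d][i] is the error budget for stage i, and useThresholdPool[d][i] appears to control whether training at that stage may stop early once the budget is met. A self-contained sketch of how the pair is read (the local Distribution enum is a stand-in for data.create_data.Distribution):

    from enum import Enum

    class Distribution(Enum):  # stand-in for data.create_data.Distribution
        RANDOM = 0
        EXPONENTIAL = 1

    # Mirrors the pools added by this commit.
    thresholdPool = {Distribution.RANDOM: [1, 1], Distribution.EXPONENTIAL: [25, 10000]}
    useThresholdPool = {Distribution.RANDOM: [True, True], Distribution.EXPONENTIAL: [True, False]}

    d = Distribution.EXPONENTIAL
    for stage, (budget, early_stop) in enumerate(zip(thresholdPool[d], useThresholdPool[d])):
        # Stage 1 of the exponential model gets a loose budget and never stops early.
        print("stage %d: threshold=%d, use_threshold=%s" % (stage, budget, early_stop))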
@@ -60,16 +70,14 @@ def hybrid_training(threshold, stage_nums, core_nums, train_step_nums, batch_siz
                     test_labels.append(int(k * divisor))
             else:
                 labels = tmp_labels[i][j]
-                test_labels = test_data_y
-            if i == 0:
-                index[i][j] = TrainedNN(1, core_nums[i], train_step_nums[i], batch_size_nums[i], learning_rate_nums[i],
-                                        keep_ratio_nums[i], inputs, labels, test_inputs, test_labels)
-            else:
-                index[i][j] = TrainedNN(threshold, core_nums[i], train_step_nums[i], batch_size_nums[i],
-                                        learning_rate_nums[i],
-                                        keep_ratio_nums[i], inputs, labels, test_inputs, test_labels)
-            index[i][j].train()
-
+            test_labels = test_data_y
+            tmp_index = TrainedNN(threshold[i], use_threshold[i], core_nums[i], train_step_nums[i], batch_size_nums[i],
+                                  learning_rate_nums[i],
+                                  keep_ratio_nums[i], inputs, labels, test_inputs, test_labels)
+            tmp_index.train()
+            index[i][j] = AbstractNN(tmp_index.get_weights(), tmp_index.get_bias(), core_nums[i], tmp_index.cal_err())
+            del tmp_index
+            gc.collect()
             if i < stage_length - 1:
                 for ind in range(len(tmp_inputs[i][j])):
                     p = index[i][j].predict(tmp_inputs[i][j][ind])
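
Each submodel is now trained as a full TrainedNN and then immediately flattened into an AbstractNN that keeps only the learned weights, biases, layer sizes, and cached error, so the TensorFlow-backed object can be released before the next submodel trains. A rough sketch of that train, snapshot, release pattern, with Heavy and Light as hypothetical stand-ins for TrainedNN and AbstractNN:

    import gc

    class Heavy:  # stand-in for TrainedNN: owns expensive training state
        def __init__(self):
            self.weights, self.bias = [[0.1]], [[0.0]]
        def train(self):
            pass  # expensive fitting would happen here
        def get_weights(self):
            return self.weights
        def get_bias(self):
            return self.bias

    class Light:  # stand-in for AbstractNN: keeps only the parameters
        def __init__(self, weights, bias):
            self.weights, self.bias = weights, bias
        def predict(self, x):
            return self.weights[0][0] * x + self.bias[0][0]

    model = Heavy()
    model.train()
    index = Light(model.get_weights(), model.get_bias())  # snapshot the parameters
    del model                                             # drop the heavy object...
    gc.collect()                                          # ...and reclaim memory before the next submodel
    print(index.predict(10.0))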
@@ -81,15 +89,15 @@ def hybrid_training(threshold, stage_nums, core_nums, train_step_nums, batch_siz
     for i in range(stage_nums[stage_length - 1]):
         if index[stage_length - 1][i] is None:
             continue
-        mean_abs_err = index[stage_length - 1][i].cal_err()
-        if mean_abs_err > threshold:
+        mean_abs_err = index[stage_length - 1][i].mean_err
+        if mean_abs_err > threshold[stage_length - 1]:
             print("Using BTree")
             index[stage_length - 1][i] = BTree(2)
             index[stage_length - 1][i].build(tmp_inputs[stage_length - 1][i], tmp_labels[stage_length - 1][i])
     return index
 
 
-def learn_density(threshold, distribution, train_set_x, train_set_y, test_set_x, test_set_y):
+def learn_density(threshold, use_threshold, distribution, train_set_x, train_set_y, test_set_x, test_set_y):
     set_data_type(distribution)
     if distribution == Distribution.RANDOM:
         parameter = ParameterPool.RANDOM.value
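
The last-stage check changes in two ways: the error now comes from the cached mean_err attribute instead of recomputing cal_err(), and the budget is the per-stage value threshold[stage_length - 1]. Any submodel over budget is replaced by a B-Tree built over just its slice of the keys. A self-contained sketch of that fallback decision (Model and SimpleBTree are hypothetical stand-ins for AbstractNN and btree.BTree):

    class Model:
        def __init__(self, mean_err):
            self.mean_err = mean_err  # cached during training
        def predict(self, key):
            return key  # placeholder position estimate

    class SimpleBTree:
        def __init__(self, degree):  # degree ignored in this toy shim
            self.data = {}
        def build(self, keys, positions):
            self.data = dict(zip(keys, positions))
        def predict(self, key):
            return self.data[key]

    threshold = [1, 25]                     # per-stage error budgets
    last = [Model(3.0), Model(40.0), None]  # last-stage submodels
    keys, positions = [1, 2, 3], [0, 1, 2]
    for i, m in enumerate(last):
        if m is None:
            continue
        if m.mean_err > threshold[-1]:      # error too high: fall back to a B-Tree
            last[i] = SimpleBTree(2)
            last[i].build(keys, positions)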
@@ -112,7 +120,7 @@ def learn_density(threshold, distribution, train_set_x, train_set_y, test_set_x,
     print("*************start Learned NN************")
     print("Start Train")
     start_time = time.time()
-    trained_index = hybrid_training(threshold, stage_set, core_set, train_step_set, batch_size_set, learning_rate_set,
+    trained_index = hybrid_training(threshold, use_threshold, stage_set, core_set, train_step_set, batch_size_set, learning_rate_set,
                                     keep_ratio_set, train_set_x, train_set_y, test_set_x, test_set_y)
     end_time = time.time()
     learn_time = end_time - start_time
@@ -122,7 +130,7 @@ def learn_density(threshold, distribution, train_set_x, train_set_y, test_set_x,
     return trained_index
 
 
-def optimize_storage(threshold, data_part_distance, learning_percent, distribution):
+def optimize_storage(doCompare, threshold, use_threshold, data_part_distance, learning_percent, distribution):
     store_path = storePath[distribution]
     to_store_path = toStorePath[distribution]
 
@@ -142,7 +150,7 @@ def optimize_storage(threshold, data_part_distance, learning_percent, distributi
 
     to_store_data = pd.read_csv(to_store_path, header=None)
     print("************Start Optimization**************")
-    trained_index = learn_density(threshold, distribution, train_set_x, train_set_y, test_set_x, test_set_y)
+    trained_index = learn_density(threshold, use_threshold, distribution, train_set_x, train_set_y, test_set_x, test_set_y)
     stage_size = int(STORE_NUMBER / 10000)
     min_value = train_set_x[0]
     max_value = train_set_x[-1]
@@ -234,14 +242,18 @@ def optimize_storage(threshold, data_part_distance, learning_percent, distributi
     end_time = time.time()
     average_move_steps = (move_steps * 1.0 / to_store_data.shape[0])
     average_move_time = (end_time - start_time) * 1.0 / to_store_data.shape[0]
+    average_insert_time = average_move_time + average_optimize_time
     print("Average Move Steps: %f" % average_move_steps)
     print("Average Move Time: %f" % average_move_time)
+    print("Average Insert Time: %f" % average_insert_time)
     result = [{"Average Moving Steps": average_move_steps, "Average Moving Time": average_move_time,
-               "Average Optimizing Time": average_optimize_time}]
+               "Average Optimizing Time": average_optimize_time, "average_insert_time": average_insert_time}]
     with open("store_performance/" + pathString[distribution] + "/optimization/" + str(data_part_distance) + "_" + str(
             learning_percent) + ".json", "wb") as jsonFile:
         json.dump(result, jsonFile)
 
+    if not doCompare:
+        return
     print("************Without Optimization**************")
     store_data = train_set_x[:]
     move_steps = 0
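
The new metric is simple accounting: the amortized insert cost is the per-record time spent shifting stored data plus the per-record time spent re-learning the index. A trivial worked example with made-up numbers:

    # Hypothetical per-record averages, as measured earlier in optimize_storage.
    average_move_time = 0.0045       # seconds shifting stored records per insert
    average_optimize_time = 0.0009   # seconds of index re-learning amortized per insert
    average_insert_time = average_move_time + average_optimize_time
    print("Average Insert Time: %f" % average_insert_time)  # 0.005400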
@@ -272,11 +284,12 @@ def optimize_storage(threshold, data_part_distance, learning_percent, distributi
 
 
 def show_help_message(msg):
-    help_message = {'command': 'python Learned_BTree.py -d <Distribution> [-p] [Percent] [-s] [Distance] [-h]',
+    help_message = {'command': 'python Learned_BTree.py -d <Distribution> [-p] [Percent] [-s] [Distance] [-c] [Compare] [-h]',
                     'distribution': 'Distribution: random, exponential',
                     'percent': 'Percent: 0.1-1.0, default value = 0.5; train data size = 100,000',
                     'distance': 'Random: 100-100,000, default = 1,000; '
                                 'Exponential: 100,000-100,000,000, default = 1,000,000',
+                    'compare': 'INTEGER, 1 to also compare against storage without optimization; any other value skips the comparison',
                     'noDistributionError': 'Please choose the distribution first.'}
     help_message_key = ['command', 'distribution', 'percent', 'distance']
     if msg == 'all':
@@ -295,7 +308,7 @@ def main(argv):
     is_distribution = False
     distance = 1000
     try:
-        opts, args = getopt.getopt(argv, "hd:s:p:")
+        opts, args = getopt.getopt(argv, "hd:s:p:c:")
     except getopt.GetoptError:
         show_help_message('command')
         sys.exit(2)
@@ -338,6 +351,12 @@
             if not 100000 <= distance <= 100000000:
                 show_help_message('distance')
                 return
+        elif opt == '-c':
+            if not is_distribution:
+                show_help_message('noDistributionError')
+                return
+            doCompare = int(arg) == 1
+
         else:
             print("Unknown parameters, please use -h for instructions.")
             return
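
The -c flag is parsed as int(arg) == 1, so only -c 1 turns the baseline comparison on. One caveat: doCompare is assigned only inside the -c branch, so a run without -c would raise a NameError at the optimize_storage call below. A minimal sketch of the parsing with a safe default (the default is my assumption, not part of the commit):

    import getopt

    def parse(argv):
        doCompare = True  # assumed default; the commit sets this only when -c is given
        opts, _ = getopt.getopt(argv, "hd:s:p:c:")
        for opt, arg in opts:
            if opt == '-c':
                doCompare = int(arg) == 1
        return doCompare

    print(parse(['-c', '0']))  # False: skip the unoptimized baseline
    print(parse(['-c', '1']))  # True: run the comparison
    print(parse([]))           # True, via the assumed default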
@@ -346,10 +365,7 @@
         show_help_message('noDistributionError')
         return
     create_data_hash(distribution, per, num)
-    if distribution == Distribution.RANDOM:
-        optimize_storage(1, distance, per, Distribution.RANDOM)
-    elif distribution == Distribution.EXPONENTIAL:
-        optimize_storage(1000, distance, per, Distribution.EXPONENTIAL)
+    optimize_storage(doCompare, thresholdPool[distribution], useThresholdPool[distribution], distance, per, distribution)
 
 
 if __name__ == "__main__":
2 changes: 1 addition & 1 deletion Trained_NN.py
@@ -37,7 +37,7 @@ def __init__(self, stages, cores, train_steps, batch_sizes, learning_rates, keep
 
 class ParameterPool(Enum):
     RANDOM = Parameter(stages=[1, 10], cores=[[1, 1], [1, 1]], train_steps=[20000, 20000],
-                       batch_sizes=[50, 50], learning_rates=[0.0001, 0.001], keep_ratios=[1.0, 1.0])
+                       batch_sizes=[50, 50], learning_rates=[0.0001, 0.0001], keep_ratios=[1.0, 1.0])
     LOGNORMAL = Parameter(stages=[1, 100], cores=[[1, 16, 16, 1], [1, 8, 1]], train_steps=[2000, 400],
                           batch_sizes=[100, 50], learning_rates=[0.0001, 0.001], keep_ratios=[1.0, 0.9])
     EXPONENTIAL = Parameter(stages=[1, 100], cores=[[1, 8, 1], [1, 8, 1]], train_steps=[30000, 20000],
