add thresholdPool and comparing control to Learn Storage
yangjufo committed May 18, 2018
1 parent 7bc2a5c commit 86e7b90
Showing 2 changed files with 43 additions and 27 deletions.
68 changes: 42 additions & 26 deletions Learned_Storage.py
@@ -1,8 +1,8 @@
 import pandas as pd
-from Trained_NN import TrainedNN, ParameterPool, set_data_type
+from Trained_NN import TrainedNN, ParameterPool, set_data_type, AbstractNN
 from btree import BTree
 from data.create_data import create_data_hash, Distribution
-import time, json, math, getopt, sys
+import time, json, math, getopt, sys, gc
 
 STORE_NUMBER = 100001
 BLOCK_SIZE = 100
@@ -34,8 +34,18 @@
     Distribution.LOGNORMAL: "Lognormal"
 }
 
+thresholdPool = {
+    Distribution.RANDOM: [1, 1],
+    Distribution.EXPONENTIAL: [25, 10000]
+}
+
-def hybrid_training(threshold, stage_nums, core_nums, train_step_nums, batch_size_nums, learning_rate_nums,
+
+useThresholdPool = {
+    Distribution.RANDOM: [True, True],
+    Distribution.EXPONENTIAL: [True, False],
+}
+
+def hybrid_training(threshold, use_threshold, stage_nums, core_nums, train_step_nums, batch_size_nums, learning_rate_nums,
                     keep_ratio_nums, train_data_x, train_data_y, test_data_x, test_data_y):
     stage_length = len(stage_nums)
     col_num = stage_nums[1]
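
For orientation: both new pools are keyed by the Distribution enum and indexed by stage, so thresholdPool[d][i] is the error budget for stage i, and useThresholdPool[d][i] appears to control whether training at that stage may stop early once the budget is met. A self-contained sketch of how the pair is read (the local Distribution enum is a stand-in for data.create_data.Distribution):

    from enum import Enum

    class Distribution(Enum):  # stand-in for data.create_data.Distribution
        RANDOM = 0
        EXPONENTIAL = 1

    # Mirrors the pools added by this commit.
    thresholdPool = {Distribution.RANDOM: [1, 1], Distribution.EXPONENTIAL: [25, 10000]}
    useThresholdPool = {Distribution.RANDOM: [True, True], Distribution.EXPONENTIAL: [True, False]}

    d = Distribution.EXPONENTIAL
    for stage, (budget, early_stop) in enumerate(zip(thresholdPool[d], useThresholdPool[d])):
        # Stage 1 of the exponential model gets a loose budget and never stops early.
        print("stage %d: threshold=%d, use_threshold=%s" % (stage, budget, early_stop))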
@@ -60,16 +70,14 @@ def hybrid_training(threshold, stage_nums, core_nums, train_step_nums, batch_siz
                     test_labels.append(int(k * divisor))
             else:
                 labels = tmp_labels[i][j]
-                test_labels = test_data_y
-            if i == 0:
-                index[i][j] = TrainedNN(1, core_nums[i], train_step_nums[i], batch_size_nums[i], learning_rate_nums[i],
-                                        keep_ratio_nums[i], inputs, labels, test_inputs, test_labels)
-            else:
-                index[i][j] = TrainedNN(threshold, core_nums[i], train_step_nums[i], batch_size_nums[i],
-                                        learning_rate_nums[i],
-                                        keep_ratio_nums[i], inputs, labels, test_inputs, test_labels)
-            index[i][j].train()
-
+            test_labels = test_data_y
+            tmp_index = TrainedNN(threshold[i], use_threshold[i], core_nums[i], train_step_nums[i], batch_size_nums[i],
+                                  learning_rate_nums[i],
+                                  keep_ratio_nums[i], inputs, labels, test_inputs, test_labels)
+            tmp_index.train()
+            index[i][j] = AbstractNN(tmp_index.get_weights(), tmp_index.get_bias(), core_nums[i], tmp_index.cal_err())
+            del tmp_index
+            gc.collect()
             if i < stage_length - 1:
                 for ind in range(len(tmp_inputs[i][j])):
                     p = index[i][j].predict(tmp_inputs[i][j][ind])
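
Each submodel is now trained as a full TrainedNN and then immediately flattened into an AbstractNN that keeps only the learned weights, biases, layer sizes, and cached error, so the TensorFlow-backed object can be released before the next submodel trains. A rough sketch of that train, snapshot, release pattern, with Heavy and Light as hypothetical stand-ins for TrainedNN and AbstractNN:

    import gc

    class Heavy:  # stand-in for TrainedNN: owns expensive training state
        def __init__(self):
            self.weights, self.bias = [[0.1]], [[0.0]]
        def train(self):
            pass  # expensive fitting would happen here
        def get_weights(self):
            return self.weights
        def get_bias(self):
            return self.bias

    class Light:  # stand-in for AbstractNN: keeps only the parameters
        def __init__(self, weights, bias):
            self.weights, self.bias = weights, bias
        def predict(self, x):
            return self.weights[0][0] * x + self.bias[0][0]

    model = Heavy()
    model.train()
    index = Light(model.get_weights(), model.get_bias())  # snapshot the parameters
    del model                                             # drop the heavy object...
    gc.collect()                                          # ...and reclaim memory before the next submodel
    print(index.predict(10.0))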
@@ -81,15 +89,15 @@ def hybrid_training(threshold, stage_nums, core_nums, train_step_nums, batch_siz
     for i in range(stage_nums[stage_length - 1]):
         if index[stage_length - 1][i] is None:
             continue
-        mean_abs_err = index[stage_length - 1][i].cal_err()
-        if mean_abs_err > threshold:
+        mean_abs_err = index[stage_length - 1][i].mean_err
+        if mean_abs_err > threshold[stage_length - 1]:
             print("Using BTree")
             index[stage_length - 1][i] = BTree(2)
             index[stage_length - 1][i].build(tmp_inputs[stage_length - 1][i], tmp_labels[stage_length - 1][i])
     return index
 
 
-def learn_density(threshold, distribution, train_set_x, train_set_y, test_set_x, test_set_y):
+def learn_density(threshold, use_threshold, distribution, train_set_x, train_set_y, test_set_x, test_set_y):
     set_data_type(distribution)
     if distribution == Distribution.RANDOM:
         parameter = ParameterPool.RANDOM.value
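
The last-stage check changes in two ways: the error now comes from the cached mean_err attribute instead of recomputing cal_err(), and the budget is the per-stage value threshold[stage_length - 1]. Any submodel over budget is replaced by a B-Tree built over just its slice of the keys. A self-contained sketch of that fallback decision (Model and SimpleBTree are hypothetical stand-ins for AbstractNN and btree.BTree):

    class Model:
        def __init__(self, mean_err):
            self.mean_err = mean_err  # cached during training
        def predict(self, key):
            return key  # placeholder position estimate

    class SimpleBTree:
        def __init__(self, degree):  # degree ignored in this toy shim
            self.data = {}
        def build(self, keys, positions):
            self.data = dict(zip(keys, positions))
        def predict(self, key):
            return self.data[key]

    threshold = [1, 25]                     # per-stage error budgets
    last = [Model(3.0), Model(40.0), None]  # last-stage submodels
    keys, positions = [1, 2, 3], [0, 1, 2]
    for i, m in enumerate(last):
        if m is None:
            continue
        if m.mean_err > threshold[-1]:      # error too high: fall back to a B-Tree
            last[i] = SimpleBTree(2)
            last[i].build(keys, positions)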
@@ -112,7 +120,7 @@ def learn_density(threshold, distribution, train_set_x, train_set_y, test_set_x,
     print("*************start Learned NN************")
     print("Start Train")
     start_time = time.time()
-    trained_index = hybrid_training(threshold, stage_set, core_set, train_step_set, batch_size_set, learning_rate_set,
+    trained_index = hybrid_training(threshold, use_threshold, stage_set, core_set, train_step_set, batch_size_set, learning_rate_set,
                                     keep_ratio_set, train_set_x, train_set_y, test_set_x, test_set_y)
     end_time = time.time()
     learn_time = end_time - start_time
@@ -122,7 +130,7 @@ def learn_density(threshold, distribution, train_set_x, train_set_y, test_set_x,
     return trained_index
 
 
-def optimize_storage(threshold, data_part_distance, learning_percent, distribution):
+def optimize_storage(doCompare, threshold, use_threshold, data_part_distance, learning_percent, distribution):
     store_path = storePath[distribution]
     to_store_path = toStorePath[distribution]
 
@@ -142,7 +150,7 @@ def optimize_storage(threshold, data_part_distance, learning_percent, distributi
 
     to_store_data = pd.read_csv(to_store_path, header=None)
     print("************Start Optimization**************")
-    trained_index = learn_density(threshold, distribution, train_set_x, train_set_y, test_set_x, test_set_y)
+    trained_index = learn_density(threshold, use_threshold, distribution, train_set_x, train_set_y, test_set_x, test_set_y)
     stage_size = int(STORE_NUMBER / 10000)
     min_value = train_set_x[0]
     max_value = train_set_x[-1]
@@ -234,14 +242,18 @@ def optimize_storage(threshold, data_part_distance, learning_percent, distributi
     end_time = time.time()
     average_move_steps = (move_steps * 1.0 / to_store_data.shape[0])
     average_move_time = (end_time - start_time) * 1.0 / to_store_data.shape[0]
+    average_insert_time = average_move_time + average_optimize_time
     print("Average Move Steps: %f" % average_move_steps)
     print("Average Move Time: %f" % average_move_time)
+    print("Average Insert Time: %f" % average_insert_time)
     result = [{"Average Moving Steps": average_move_steps, "Average Moving Time": average_move_time,
-               "Average Optimizing Time": average_optimize_time}]
+               "Average Optimizing Time": average_optimize_time, "average_insert_time": average_insert_time}]
     with open("store_performance/" + pathString[distribution] + "/optimization/" + str(data_part_distance) + "_" + str(
             learning_percent) + ".json", "wb") as jsonFile:
         json.dump(result, jsonFile)
 
+    if not doCompare:
+        return
     print("************Without Optimization**************")
     store_data = train_set_x[:]
     move_steps = 0
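
The new metric is simple accounting: the amortized insert cost is the per-record time spent shifting stored data plus the per-record time spent re-learning the index. A trivial worked example with made-up numbers:

    # Hypothetical per-record averages, as measured earlier in optimize_storage.
    average_move_time = 0.0045       # seconds shifting stored records per insert
    average_optimize_time = 0.0009   # seconds of index re-learning amortized per insert
    average_insert_time = average_move_time + average_optimize_time
    print("Average Insert Time: %f" % average_insert_time)  # 0.005400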
@@ -272,11 +284,12 @@ def optimize_storage(threshold, data_part_distance, learning_percent, distributi
 
 
 def show_help_message(msg):
-    help_message = {'command': 'python Learned_BTree.py -d <Distribution> [-p] [Percent] [-s] [Distance] [-h]',
+    help_message = {'command': 'python Learned_BTree.py -d <Distribution> [-p] [Percent] [-s] [Distance] [-c] [Compare] [-h]',
                     'distribution': 'Distribution: random, exponential',
                     'percent': 'Percent: 0.1-1.0, default value = 0.5; train data size = 100,000',
                     'distance': 'Random: 100-100,000, default = 1,000; '
                                 'Exponential: 100,000-100,000,000, default = 1,000,000',
+                    'compare': 'INTEGER, 1 to also compare against storage without optimization; any other value skips the comparison',
                     'noDistributionError': 'Please choose the distribution first.'}
     help_message_key = ['command', 'distribution', 'percent', 'distance']
     if msg == 'all':
@@ -295,7 +308,7 @@ def main(argv):
     is_distribution = False
     distance = 1000
     try:
-        opts, args = getopt.getopt(argv, "hd:s:p:")
+        opts, args = getopt.getopt(argv, "hd:s:p:c:")
     except getopt.GetoptError:
         show_help_message('command')
         sys.exit(2)
@@ -338,6 +351,12 @@
             if not 100000 <= distance <= 100000000:
                 show_help_message('distance')
                 return
+        elif opt == '-c':
+            if not is_distribution:
+                show_help_message('noDistributionError')
+                return
+            doCompare = int(arg) == 1
+
         else:
             print("Unknown parameters, please use -h for instructions.")
             return
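
The -c flag is parsed as int(arg) == 1, so only -c 1 turns the baseline comparison on. One caveat: doCompare is assigned only inside the -c branch, so a run without -c would raise a NameError at the optimize_storage call below. A minimal sketch of the parsing with a safe default (the default is my assumption, not part of the commit):

    import getopt

    def parse(argv):
        doCompare = True  # assumed default; the commit sets this only when -c is given
        opts, _ = getopt.getopt(argv, "hd:s:p:c:")
        for opt, arg in opts:
            if opt == '-c':
                doCompare = int(arg) == 1
        return doCompare

    print(parse(['-c', '0']))  # False: skip the unoptimized baseline
    print(parse(['-c', '1']))  # True: run the comparison
    print(parse([]))           # True, via the assumed default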
@@ -346,10 +365,7 @@
         show_help_message('noDistributionError')
         return
     create_data_hash(distribution, per, num)
-    if distribution == Distribution.RANDOM:
-        optimize_storage(1, distance, per, Distribution.RANDOM)
-    elif distribution == Distribution.EXPONENTIAL:
-        optimize_storage(1000, distance, per, Distribution.EXPONENTIAL)
+    optimize_storage(doCompare, thresholdPool[distribution], useThresholdPool[distribution], distance, per, distribution)
 
 
 if __name__ == "__main__":
2 changes: 1 addition & 1 deletion Trained_NN.py
@@ -37,7 +37,7 @@ def __init__(self, stages, cores, train_steps, batch_sizes, learning_rates, keep
 
 class ParameterPool(Enum):
     RANDOM = Parameter(stages=[1, 10], cores=[[1, 1], [1, 1]], train_steps=[20000, 20000],
-                       batch_sizes=[50, 50], learning_rates=[0.0001, 0.001], keep_ratios=[1.0, 1.0])
+                       batch_sizes=[50, 50], learning_rates=[0.0001, 0.0001], keep_ratios=[1.0, 1.0])
     LOGNORMAL = Parameter(stages=[1, 100], cores=[[1, 16, 16, 1], [1, 8, 1]], train_steps=[2000, 400],
                           batch_sizes=[100, 50], learning_rates=[0.0001, 0.001], keep_ratios=[1.0, 0.9])
     EXPONENTIAL = Parameter(stages=[1, 100], cores=[[1, 8, 1], [1, 8, 1]], train_steps=[30000, 20000],
