Merge pull request EpistasisLab#370 from weixuanfu2016/from_string_func
Fix issue for building pipeline from string in unit tests
rhiever authored Mar 16, 2017
2 parents 1a44518 + 8ce8bbb commit 91a102b
Showing 5 changed files with 95 additions and 50 deletions.
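
The common thread in the test changes below is replacing randomly generated individuals (tpot_obj._toolbox.individual()) with deterministic pipelines parsed from strings via creator.Individual.from_string(pipeline_string, tpot_obj._pset). A minimal sketch of that pattern, assuming the TPOT 0.7-era internals exercised in this commit (TPOTClassifier, _pset, _toolbox) and using sklearn's digits dataset as a stand-in for the test fixtures:

# Sketch only: assumes TPOT at roughly this commit; the digits data stands in
# for the training_features/training_classes fixtures used in tests.py.
from deap import creator
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier

X, y = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

tpot_obj = TPOTClassifier()  # at this commit, __init__ already builds the DEAP primitive set (_pset)

# A fixed pipeline in TPOT's string grammar: nested operator calls over
# input_matrix, with hyperparameters spelled as Operator__param=value terminals.
pipeline_string = ('KNeighborsClassifier(input_matrix, '
                   'KNeighborsClassifier__n_neighbors=10, '
                   'KNeighborsClassifier__p=1,'
                   'KNeighborsClassifier__weights=uniform)')

# Parse the string into a DEAP individual, compile it to an sklearn pipeline, fit, and score.
individual = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
sklearn_pipeline = tpot_obj._toolbox.compile(expr=individual)
sklearn_pipeline.fit(X_train, y_train)
print(sklearn_pipeline.score(X_test, y_test))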
117 changes: 76 additions & 41 deletions tests.py
@@ -183,7 +183,7 @@ def test_random_ind_2():
exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)
"""
assert expected_code == export_pipeline(pipeline, tpot_obj.operators)
assert expected_code == export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset)

def test_score():
"""Assert that the TPOT score function raises a ValueError when no optimized pipeline exists"""
@@ -198,17 +198,17 @@ def test_score():


def test_score_2():
"""Assert that the TPOTClassifier score function outputs a known score for a ramdom pipeline"""
"""Assert that the TPOTClassifier score function outputs a known score for a fix pipeline"""

tpot_obj = TPOTClassifier(random_state=43)
tpot_obj._pbar = tqdm(total=1, disable=True)
known_score = 0.96710588996037627 # Assumes use of the TPOT balanced_accuracy function
tpot_obj = TPOTClassifier()
known_score = 0.987691257357 # Assumes use of the TPOT balanced_accuracy function

# Reify pipeline with known score
tpot_obj._optimized_pipeline = tpot_obj._toolbox.individual()
pipeline_string= ('KNeighborsClassifier(input_matrix, KNeighborsClassifier__n_neighbors=10, '
'KNeighborsClassifier__p=1,KNeighborsClassifier__weights=uniform)')
tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
tpot_obj._fitted_pipeline.fit(training_features, training_classes)

# Get score from TPOT
score = tpot_obj.score(testing_features, testing_classes)

@@ -219,14 +219,22 @@ def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
assert isclose(known_score, score)

def test_score_3():
"""Assert that the TPOTRegressor score function outputs a known score for a random pipeline"""
"""Assert that the TPOTRegressor score function outputs a known score for a fix pipeline"""

tpot_obj = TPOTRegressor(scoring='neg_mean_squared_error', random_state=53)
tpot_obj._pbar = tqdm(total=1, disable=True)
known_score = 15.724128278216726 # Assumes use of mse
tpot_obj = TPOTRegressor(scoring='neg_mean_squared_error')
known_score = 11.2010824752 # Assumes use of mse

# Reify pipeline with known score
tpot_obj._optimized_pipeline = tpot_obj._toolbox.individual()

pipeline_string = ("ExtraTreesRegressor("
"GradientBoostingRegressor(input_matrix, GradientBoostingRegressor__alpha=0.8,"
"GradientBoostingRegressor__learning_rate=0.1,GradientBoostingRegressor__loss=huber,"
"GradientBoostingRegressor__max_depth=5, GradientBoostingRegressor__max_features=0.5,"
"GradientBoostingRegressor__min_samples_leaf=5, GradientBoostingRegressor__min_samples_split=5,"
"GradientBoostingRegressor__subsample=0.25),"
"ExtraTreesRegressor__bootstrap=True,ExtraTreesRegressor__max_features=0.5,"
"ExtraTreesRegressor__min_samples_leaf=5,ExtraTreesRegressor__min_samples_split=5)")
tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
tpot_obj._fitted_pipeline.fit(training_features_r, training_classes_r)
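
The hard-coded known_score values make exact float comparison brittle, so the tests use the local isclose helper whose signature is visible in the hunk header above. A plausible implementation of that helper, following the standard PEP 485 formula (a sketch, not a verbatim copy of tests.py):

def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
    # True when a and b agree within a relative or an absolute tolerance.
    return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)

assert isclose(0.987691257357, 0.9876912573570001)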

@@ -256,8 +264,11 @@ def test_predict():
def test_predict_2():
"""Assert that the TPOT predict function returns a numpy matrix of shape (num_testing_rows,)"""

tpot_obj = TPOTClassifier(random_state=49)
tpot_obj._optimized_pipeline = tpot_obj._toolbox.individual()
tpot_obj = TPOTClassifier()
pipeline_string= ('DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini'
', DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,'
'DecisionTreeClassifier__min_samples_split=5)')
tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
tpot_obj._fitted_pipeline.fit(training_features, training_classes)

@@ -269,8 +280,11 @@ def test_predict_2():
def test_predict_proba():
"""Assert that the TPOT predict_proba function returns a numpy matrix of shape (num_testing_rows, num_testing_classes)"""

tpot_obj = TPOTClassifier(random_state=51)
tpot_obj._optimized_pipeline = tpot_obj._toolbox.individual()
tpot_obj = TPOTClassifier()
pipeline_string= ('DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini'
', DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,'
'DecisionTreeClassifier__min_samples_split=5)')
tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
tpot_obj._fitted_pipeline.fit(training_features, training_classes)

@@ -283,8 +297,11 @@ def test_predict_proba():
def test_predict_proba2():
"""Assert that the TPOT predict_proba function returns a numpy matrix filled with probabilities (float)"""

tpot_obj = TPOTClassifier(random_state=53)
tpot_obj._optimized_pipeline = tpot_obj._toolbox.individual()
tpot_obj = TPOTClassifier()
pipeline_string= ('DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini'
', DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,'
'DecisionTreeClassifier__min_samples_split=5)')
tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
tpot_obj._fitted_pipeline.fit(training_features, training_classes)

@@ -456,8 +473,15 @@ def test_generate_import_code():

def test_mutNodeReplacement():
"""Assert that mutNodeReplacement() returns the correct type of mutation node in a fixed pipeline"""
tpot_obj = TPOTClassifier(random_state=42)
pipeline = tpot_obj._toolbox.individual()
tpot_obj = TPOTClassifier()
pipeline_string= ('KNeighborsClassifier(CombineDFs('
'DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini'
', DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,'
'DecisionTreeClassifier__min_samples_split=5),SelectKBest(input_matrix, SelectKBest__k=20)'
'KNeighborsClassifier__n_neighbors=10, '
'KNeighborsClassifier__p=1,KNeighborsClassifier__weights=uniform')
pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
pipeline[0].ret = Output_DF
old_ret_type_list = [node.ret for node in pipeline]
old_prims_list = [node for node in pipeline if node.arity != 0]
mut_ind = mutNodeReplacement(pipeline, pset = tpot_obj._pset)
@@ -474,17 +498,23 @@ def test_mutNodeReplacement():
def test_export_pipeline():
"""Assert that exported_pipeline() generated a compile source file as expected given a fixed complex pipeline"""
tpot_obj = TPOTClassifier()
pipeline = creator.Individual.\
from_string("GaussianNB(CombineDFs(ZeroCount(input_matrix), RobustScaler(input_matrix)))", tpot_obj._pset)
pipeline_string= ('KNeighborsClassifier(CombineDFs('
'DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini'
', DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,'
'DecisionTreeClassifier__min_samples_split=5),SelectKBest(input_matrix, SelectKBest__k=20)'
'KNeighborsClassifier__n_neighbors=10, '
'KNeighborsClassifier__p=1,KNeighborsClassifier__weights=uniform')
pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)

expected_code = """import numpy as np
from sklearn.ensemble import VotingClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer, RobustScaler
from tpot.build_in_operators import ZeroCount
from sklearn.preprocessing import FunctionTransformer
from sklearn.tree import DecisionTreeClassifier
# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64)
@@ -494,53 +524,58 @@ def test_export_pipeline():
exported_pipeline = make_pipeline(
make_union(
ZeroCount(),
RobustScaler()
make_union(VotingClassifier([('branch',
DecisionTreeClassifier(criterion="gini", max_depth=8, min_samples_leaf=5, min_samples_split=5)
)]), FunctionTransformer(lambda X: X)),
SelectKBest(score_func=f_classif, k=20)
),
GaussianNB()
KNeighborsClassifier(n_neighbors=10, p=1, weights="uniform")
)
exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)
"""
assert expected_code == export_pipeline(pipeline,tpot_obj.operators)
assert expected_code == export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset)


def test_export_pipeline_2():
"""Assert that exported_pipeline() generated a compile source file as expected given a fixed simple pipeline (only one classifier)"""
tpot_obj = TPOTClassifier()
pipeline = creator.Individual.\
from_string("GaussianNB(input_matrix)", tpot_obj._pset)
pipeline_string= ('KNeighborsClassifier(input_matrix, KNeighborsClassifier__n_neighbors=10, '
'KNeighborsClassifier__p=1,KNeighborsClassifier__weights=uniform)')
pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
expected_code = """import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1)
training_features, testing_features, training_classes, testing_classes = \\
train_test_split(features, tpot_data['class'], random_state=42)
exported_pipeline = GaussianNB()
exported_pipeline = KNeighborsClassifier(n_neighbors=10, p=1, weights="uniform")
exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)
"""
assert expected_code == export_pipeline(pipeline, tpot_obj.operators)
assert expected_code == export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset)

def test_export_pipeline_3():
"""Assert that exported_pipeline() generated a compile source file as expected given a fixed simple pipeline with a preprocessor"""
tpot_obj = TPOTClassifier()
pipeline = creator.Individual.\
from_string("GaussianNB(MaxAbsScaler(input_matrix))", tpot_obj._pset)
pipeline_string= ('DecisionTreeClassifier(SelectKBest(input_matrix, SelectKBest__k=20),'
'DecisionTreeClassifier__criterion=gini, DecisionTreeClassifier__max_depth=8,'
'DecisionTreeClassifier__min_samples_leaf=5, DecisionTreeClassifier__min_samples_split=5)')
pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)

expected_code = """import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler
from sklearn.tree import DecisionTreeClassifier
# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64)
@@ -549,14 +584,14 @@ def test_export_pipeline_3():
train_test_split(features, tpot_data['class'], random_state=42)
exported_pipeline = make_pipeline(
MaxAbsScaler(),
GaussianNB()
SelectKBest(score_func=f_classif, k=20),
DecisionTreeClassifier(criterion="gini", max_depth=8, min_samples_leaf=5, min_samples_split=5)
)
exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)
"""
assert expected_code == export_pipeline(pipeline, tpot_obj.operators)
assert expected_code == export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset)


def test_operator_export():
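
Across the export tests above, export_pipeline now takes the primitive set as a third argument so it can resolve named hyperparameter terminals back to their values. A sketch of calling the exporter directly on a string-built pipeline, under the same TPOT 0.7-era assumptions as the earlier sketch:

# Sketch only: assumes tpot.export_utils.export_pipeline with the signature introduced here.
from deap import creator
from tpot import TPOTClassifier
from tpot.export_utils import export_pipeline

tpot_obj = TPOTClassifier()
pipeline_string = ('DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini,'
                   'DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,'
                   'DecisionTreeClassifier__min_samples_split=5)')
pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)

# The pset argument lets the exporter translate terminals such as
# DecisionTreeClassifier__criterion=gini back into criterion="gini".
print(export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset))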
11 changes: 7 additions & 4 deletions tpot/base.py
@@ -302,8 +302,10 @@ def _setup_pset(self):

# Terminals
for _type in self.arguments:
for val in _type.values:
self._pset.addTerminal(val, _type)
type_values = list(_type.values) + ['MISSING']
for val in type_values:
terminal_name = _type.__name__ + "=" + str(val)
self._pset.addTerminal(val, _type, name=terminal_name)

if self.verbosity > 2:
print('{} operators are imported.'.format(len(self.operators)))
@@ -571,7 +573,7 @@ def export(self, output_file_name):
raise ValueError('A pipeline has not yet been optimized. Please call fit() first.')

with open(output_file_name, 'w') as output_file:
output_file.write(export_pipeline(self._optimized_pipeline, self.operators))
output_file.write(export_pipeline(self._optimized_pipeline, self.operators, self._pset))

def _compile_to_sklearn(self, expr):
"""Compiles a DEAP pipeline into a sklearn pipeline
@@ -585,7 +587,7 @@ def _compile_to_sklearn(self, expr):
-------
sklearn_pipeline: sklearn.pipeline.Pipeline
"""
sklearn_pipeline = generate_pipeline_code(expr_to_tree(expr), self.operators)
sklearn_pipeline = generate_pipeline_code(expr_to_tree(expr, self._pset), self.operators)
return eval(sklearn_pipeline, self.operators_context)

def _set_param_recursive(self, pipeline_steps, parameter, value):
@@ -792,6 +794,7 @@ def _wrapped_cross_val_score(sklearn_pipeline, features=features, classes=classe
def _mate_operator(self, ind1, ind2):
return gp.cxOnePoint(ind1, ind2)


@_pre_test
def _random_mutation_operator(self, individual):
"""Perform a replacement, insert, or shrink mutation on an individual
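
The _setup_pset change above registers every hyperparameter value as a named terminal of the form <ArgType>=<value> and appends a 'MISSING' sentinel; the explicit names are what allow creator.Individual.from_string to parse pipeline strings, while 'MISSING' marks arguments that the exporter should leave at their sklearn defaults. A simplified, self-contained sketch of the idea, using an illustrative stand-in argument type rather than TPOT's generated classes:

from deap import gp

class KNeighborsClassifier__n_neighbors(object):
    # Illustrative stand-in for a TPOT auto-generated argument type.
    values = [1, 5, 10]

pset = gp.PrimitiveSetTyped('MAIN', [object], object)

_type = KNeighborsClassifier__n_neighbors
type_values = list(_type.values) + ['MISSING']        # 'MISSING' = argument left unset
for val in type_values:
    terminal_name = _type.__name__ + "=" + str(val)   # unique, parseable terminal name
    pset.addTerminal(val, _type, name=terminal_name)

print([t.name for t in pset.terminals[_type]])
# ['KNeighborsClassifier__n_neighbors=1', 'KNeighborsClassifier__n_neighbors=5',
#  'KNeighborsClassifier__n_neighbors=10', 'KNeighborsClassifier__n_neighbors=MISSING']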
4 changes: 3 additions & 1 deletion tpot/decorators.py
@@ -152,7 +152,7 @@ def check_pipeline(self, *args, **kwargs):
expr_tuple = expr if isinstance(expr, tuple) else (expr,)
for expr_test in expr_tuple:
#print(num_test, generate_pipeline_code(expr_to_tree(expr), self.operators)) # debug
sklearn_pipeline = eval(generate_pipeline_code(expr_to_tree(expr_test), self.operators), self.operators_context)
sklearn_pipeline = eval(generate_pipeline_code(expr_to_tree(expr_test, self._pset), self.operators), self.operators_context)
if self.classification:
sklearn_pipeline.fit(pretest_X, pretest_y)
else:
@@ -164,5 +164,7 @@
pass
finally:
num_test += 1

return expr

return check_pipeline
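
The only functional change to the _pre_test machinery is threading self._pset into expr_to_tree, because terminal values must now be resolved through the primitive set. The decorator itself keeps the same shape: each expression produced by a GP operator is compiled to sklearn code and smoke-fitted on a small pretest sample, and the operator is retried if that fails. A generic, self-contained analogue of that retry-until-valid pattern (this is not TPOT's actual _pre_test; names and the retry budget are illustrative):

import functools

def pre_test(smoke_test, max_tries=10):
    # Decorator factory: re-run the wrapped producer until its output passes smoke_test.
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            result = func(*args, **kwargs)
            for _ in range(max_tries):
                try:
                    smoke_test(result)   # e.g. compile the expression and fit it on pretest data
                    break
                except Exception:
                    result = func(*args, **kwargs)
            return result
        return wrapper
    return decorator

In TPOT's version, the wrapped producer is a GP mutation or crossover operator and the smoke test is the eval-plus-fit step shown in the diff above.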
11 changes: 7 additions & 4 deletions tpot/export_utils.py
@@ -45,7 +45,7 @@ def get_by_name(opname, operators):
ret_op_class = ret_op_classes[0]
return ret_op_class

def export_pipeline(exported_pipeline, operators):
def export_pipeline(exported_pipeline, operators, pset):
"""Generates the source code of a TPOT Pipeline
Parameters
@@ -62,7 +62,7 @@ def export_pipeline(exported_pipeline, operators):
"""
# Unroll the nested function calls into serial code
pipeline_tree = expr_to_tree(exported_pipeline)
pipeline_tree = expr_to_tree(exported_pipeline, pset)

# Have the exported code import all of the necessary modules and functions
pipeline_text = generate_import_code(exported_pipeline, operators)
@@ -73,7 +73,7 @@
return pipeline_text


def expr_to_tree(ind):
def expr_to_tree(ind, pset):
"""Convert the unstructured DEAP pipeline into a tree data-structure
Parameters
@@ -95,7 +95,10 @@
"""
def prim_to_list(prim, args):
if isinstance(prim, deap.gp.Terminal):
return prim.value
if prim.name in pset.context:
return pset.context[prim.name]
else:
return prim.value

return [prim.name] + args

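
With hyperparameter terminals now registered under explicit names, expr_to_tree resolves each terminal through pset.context (where DEAP stores the value behind a named terminal) and only falls back to prim.value for unnamed terminals such as input_matrix. A short sketch of the changed API in use, again assuming TPOT at this commit:

from deap import creator
from tpot import TPOTClassifier
from tpot.export_utils import expr_to_tree

tpot_obj = TPOTClassifier()
pipeline_string = ('KNeighborsClassifier(input_matrix, KNeighborsClassifier__n_neighbors=10, '
                   'KNeighborsClassifier__p=1,KNeighborsClassifier__weights=uniform)')
ind = creator.Individual.from_string(pipeline_string, tpot_obj._pset)

# The pset argument is required so named terminals resolve to their values.
print(expr_to_tree(ind, tpot_obj._pset))
# Roughly: ['KNeighborsClassifier', 'input_matrix', 10, 1, 'uniform']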
2 changes: 2 additions & 0 deletions tpot/operator_utils.py
@@ -194,6 +194,8 @@ def export(cls, *args):
if dep_op_list:
dep_op_arguments = {}
for arg_class, arg_value in zip(arg_types, args):
if arg_value == "MISSING":
continue
aname_split = arg_class.__name__.split('__')
if isinstance(arg_value, str):
arg_value = '\"{}\"'.format(arg_value)
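
On the export side, any hyperparameter whose value is the 'MISSING' sentinel is simply dropped from the generated constructor call, so the exported estimator falls back to its sklearn default for that parameter. A toy illustration of the filtering and quoting logic, not TPOT's actual export method:

def format_export_args(arg_values):
    # arg_values: list of (parameter_name, value) pairs; skip 'MISSING', quote strings.
    parts = []
    for name, value in arg_values:
        if value == "MISSING":
            continue                      # unset argument: rely on the sklearn default
        if isinstance(value, str):
            value = '"{}"'.format(value)
        parts.append('{}={}'.format(name, value))
    return ', '.join(parts)

print(format_export_args([('criterion', 'gini'), ('max_depth', 8), ('min_samples_leaf', 'MISSING')]))
# -> criterion="gini", max_depth=8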
