Skip to content

Commit 0e2af52

Browse files
authored
Add 2 examples (aimclub#1024)
* Add example for kc2 dataset * Add experiment script * Add baseline to kc2 example * Add requirements for example reproducibility
1 parent 86d7158 commit 0e2af52

File tree

3 files changed

+242
-0
lines changed

3 files changed

+242
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
import os
2+
from datetime import timedelta
3+
from typing import Sequence, Optional
4+
5+
import numpy as np
6+
from matplotlib import pyplot as plt
7+
from sklearn.metrics import roc_auc_score as roc_auc
8+
9+
from fedot.core.composer.composer_builder import ComposerBuilder
10+
from fedot.core.data.data import InputData
11+
from fedot.core.optimisers.gp_comp.gp_params import GPGraphOptimizerParameters
12+
from fedot.core.optimisers.gp_comp.operators.crossover import CrossoverTypesEnum
13+
from fedot.core.optimisers.gp_comp.operators.inheritance import GeneticSchemeTypesEnum
14+
from fedot.core.optimisers.gp_comp.operators.mutation import MutationTypesEnum
15+
from fedot.core.optimisers.gp_comp.pipeline_composer_requirements import PipelineComposerRequirements
16+
from fedot.core.optimisers.opt_history_objects.opt_history import OptHistory
17+
from fedot.core.repository.operation_types_repository import get_operations_for_task
18+
from fedot.core.repository.quality_metrics_repository import ClassificationMetricsEnum
19+
from fedot.core.repository.tasks import Task, TaskTypesEnum
20+
from fedot.core.utils import fedot_project_root
21+
22+
# Human-readable labels for the mutation operator subsets compared in the
# experiment. Order must match the `mutation_types` list built in
# `run_experiment`, because the two are zipped together.
mutation_labels = [
    'Mutation simple',
    'Mutation growth',
    'Mutation reduce',
    'Mutation all',
]

# Labels for the crossover operator subsets; order must match the
# `crossover_types` list built in `run_experiment`.
crossover_labels = [
    'Crossover one point',
    'Crossover subtree',
    'Crossover all',
]
34+
35+
36+
def run_single(train_data,
               test_data,
               mutation_types,
               crossover_types,
               timeout: Optional[float] = 10,
               num_generations: int = 20,
               visualize: bool = False):
    """Compose and evaluate a classification pipeline with given GP operators.

    :param train_data: InputData used for composing and fitting the pipeline.
    :param test_data: InputData used for the final ROC-AUC evaluation.
    :param mutation_types: sequence of MutationTypesEnum to use in the GP run.
    :param crossover_types: sequence of CrossoverTypesEnum to use in the GP run.
    :param timeout: time limit in minutes; None disables the limit.
    :param num_generations: number of GP generations to evolve.
    :param visualize: if True, show the final pipeline and the fitness line.
    :return: the OptHistory of the composition run.
    """
    task = Task(TaskTypesEnum.classification)
    ops = get_operations_for_task(task)
    requirements = PipelineComposerRequirements(
        primary=ops,
        secondary=ops,
        num_of_generations=num_generations,
        # `timeout` is given in minutes; a falsy value means "no time limit"
        timeout=timedelta(minutes=timeout) if timeout else None,
        early_stopping_iterations=None,
        n_jobs=-1,
    )
    gp_params = GPGraphOptimizerParameters(
        genetic_scheme_type=GeneticSchemeTypesEnum.generational,
        mutation_types=mutation_types,
        crossover_types=crossover_types,
    )

    # Reuse the `task` created above instead of constructing a second,
    # identical Task instance (original code duplicated the construction).
    composer = ComposerBuilder(task=task). \
        with_metrics(ClassificationMetricsEnum.ROCAUC). \
        with_requirements(requirements). \
        with_optimizer_params(gp_params). \
        build()

    pipeline = composer.compose_pipeline(train_data)
    pipeline.fit_from_scratch(train_data)
    predicted = pipeline.predict(test_data)

    roc_auc_metric = roc_auc(y_true=test_data.target, y_score=predicted.predict)
    print('roc_auc=', roc_auc_metric)

    if visualize:
        pipeline.show()
        composer.history.show.fitness_line()

    return composer.history
77+
78+
79+
def load_histories(history_dir, filename_filter=None):
    """Load OptHistory objects from the files in `history_dir`.

    Only regular files are considered; when `filename_filter` is given,
    only files whose name contains that substring are loaded.

    Entries are visited in sorted-name order: `os.listdir` returns names in
    arbitrary order, and callers zip the result with fixed label lists, so a
    deterministic order is required for labels to match their histories.

    :param history_dir: directory to scan for saved history files.
    :param filename_filter: optional substring that file names must contain.
    :return: list of loaded OptHistory objects.
    """
    histories = []
    for name in sorted(os.listdir(history_dir)):
        fullpath = os.path.join(history_dir, name)
        if not os.path.isfile(fullpath):
            continue
        if filename_filter and filename_filter not in name:
            continue
        histories.append(OptHistory.load(fullpath))
    return histories
90+
91+
92+
def visualize_histories(histories: Sequence[OptHistory],
                        labels: Sequence[str],
                        with_confidence_interval: bool = True,
                        ):
    """Plot best-fitness convergence lines for several optimization histories.

    :param histories: histories to compare on a single plot.
    :param labels: one legend label per history (zipped with `histories`).
    :param with_confidence_interval: if True, shade +/- std of the best
        individuals of each population around the best-fitness line.
    """
    best_num = 5  # number of best individuals used for the std band
    for history, label in zip(histories, labels):
        h = history.historical_fitness[1:-1]  # without initial and last pop
        best_fitness = np.abs(np.array([np.min(pop) for pop in h]))

        ys = best_fitness
        xs = np.arange(0, len(best_fitness))
        plt.xticks(xs)
        plt.plot(xs, ys, label=label)

        if with_confidence_interval:
            # Use a per-history local: the original re-assigned `best_num`
            # here, so one short history permanently shrank the band
            # computation for every subsequent history.
            num_best = min(len(xs), best_num)
            std_fitness = np.array([np.std(sorted(pop)[:num_best]) for pop in h])
            plt.fill_between(xs, ys + std_fitness, ys - std_fitness, alpha=0.2)

    plt.xlabel('Поколение')
    plt.ylabel('Метрика')
    plt.legend()
    plt.show()
115+
116+
117+
def run_experiment(train_data_path,
                   test_data_path,
                   save_dir,
                   timeout_per_run: Optional[float] = 10,
                   num_generations: int = 20,
                   ):
    """Compare GP mutation subsets and crossover subsets on a dataset.

    Runs `run_single` once per mutation subset (with all crossovers) and once
    per crossover subset (with all mutations), saves each OptHistory to
    `save_dir` as JSON, and plots the convergence comparison.

    :param train_data_path: path to the training CSV (target column 'target').
    :param test_data_path: path to the test CSV (target column 'target').
    :param save_dir: directory where history JSON files are written.
    :param timeout_per_run: per-run time limit in minutes; None for no limit.
    :param num_generations: number of GP generations per run.
    """
    train_data = InputData.from_csv(train_data_path, target_columns='target')
    test_data = InputData.from_csv(test_data_path, target_columns='target')

    all_mutations = [MutationTypesEnum.simple, MutationTypesEnum.growth, MutationTypesEnum.reduce]
    mutation_types = [
        [MutationTypesEnum.simple],
        [MutationTypesEnum.growth],
        [MutationTypesEnum.reduce],
        all_mutations,
    ]
    all_crossovers = [CrossoverTypesEnum.one_point, CrossoverTypesEnum.subtree]
    crossover_types = [
        [CrossoverTypesEnum.one_point],
        [CrossoverTypesEnum.subtree],
        all_crossovers,
    ]

    # Ensure the output directory exists before `history.save` writes into it.
    os.makedirs(save_dir, exist_ok=True)

    mutation_histories = []
    for label, mutations in zip(mutation_labels, mutation_types):
        label = label.lower().replace(' ', '_')
        history_file_path = f'{save_dir}/{label}.json'

        history = run_single(train_data, test_data,
                             timeout=timeout_per_run,
                             num_generations=num_generations,
                             mutation_types=mutations,
                             crossover_types=all_crossovers)
        mutation_histories.append(history)
        print(f'history is saved to path {history_file_path}')
        history.save(history_file_path)

    crossover_histories = []
    for label, crossover in zip(crossover_labels, crossover_types):
        label = label.lower().replace(' ', '_')
        history_file_path = f'{save_dir}/{label}.json'

        history = run_single(train_data, test_data,
                             timeout=timeout_per_run,
                             # Bug fix: the original omitted this argument, so
                             # crossover runs always used the default of 20
                             # generations regardless of `num_generations`.
                             num_generations=num_generations,
                             mutation_types=all_mutations,
                             crossover_types=crossover)
        crossover_histories.append(history)
        print(f'history is saved to path {history_file_path}')
        history.save(history_file_path)

    visualize_histories(mutation_histories, mutation_labels)
    visualize_histories(crossover_histories, crossover_labels)
169+
170+
171+
def run_experiment_with_saved_histories(save_dir):
    """Re-plot a previously run experiment from histories saved in `save_dir`.

    Loads the 'mutation*' and 'crossover*' history files separately and
    visualizes each group against its label list.
    """
    for name_filter, labels in (('mutation', mutation_labels),
                                ('crossover', crossover_labels)):
        visualize_histories(load_histories(save_dir, name_filter), labels)
177+
178+
179+
if __name__ == '__main__':
    # Scoring dataset shipped with the FEDOT repository.
    scoring_dir = f'{fedot_project_root()}/cases/data/scoring'

    run_experiment(f'{scoring_dir}/scoring_train.csv',
                   f'{scoring_dir}/scoring_test.csv',
                   save_dir='result_histories',
                   timeout_per_run=None,
                   num_generations=20,
                   )
+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
fedot==0.6.1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
from os.path import join
2+
3+
from fedot.api.main import Fedot
4+
from fedot.core.data.data import InputData
5+
from fedot.core.data.data_split import train_test_data_setup
6+
from fedot.core.repository.tasks import TaskTypesEnum, Task
7+
from fedot.core.utils import fedot_project_root
8+
9+
10+
def get_kc2_data():
    """Load the kc2 software-defect dataset and split it into train/test.

    The 'problems' target column contains 'yes'/'no' strings; it is
    binary-encoded ('yes' -> 1, anything else -> 0) before the shuffled
    train/test split.

    :return: tuple of (train InputData, test InputData).
    """
    csv_path = join(str(fedot_project_root()), 'cases/data/kc2/kc2.csv')
    data = InputData.from_csv(csv_path,
                              task=Task(TaskTypesEnum.classification),
                              target_columns='problems')

    # Encode the string target into 0/1 integers.
    data.target = (data.target == 'yes').astype(int)

    return train_test_data_setup(data, shuffle_flag=True)
23+
24+
25+
def run_classification(train_data, test_data,
                       timeout: float = 5, visualize=False):
    """Fit a FEDOT AutoML classifier and report metrics on the test split.

    :param train_data: InputData with training features and target.
    :param test_data: InputData with test features and target.
    :param timeout: AutoML search time limit in minutes.
    :param visualize: if True, show the found pipeline and a prediction plot.
    :return: the model's prediction on the test features.
    """
    model = Fedot(problem='classification',
                  timeout=timeout, n_jobs=8,
                  early_stopping_iterations=None, )
    model.fit(features=train_data.features, target=train_data.target)

    result = model.predict(features=test_data.features)
    print(model.get_metrics(target=test_data.target))

    if visualize:
        model.current_pipeline.show()
        model.plot_prediction()

    return result
38+
39+
40+
def run_classification_baseline(train_data, test_data, timeout: float = 5):
    """Fit a predefined random-forest pipeline as a baseline and report metrics.

    :param train_data: InputData with training features and target.
    :param test_data: InputData with test features and target.
    :param timeout: time limit in minutes passed to the Fedot API.
    :return: the baseline's prediction on the test features (the original
        computed the prediction but discarded it and returned None, which was
        inconsistent with `run_classification`).
    """
    baseline_model = Fedot(problem='classification', timeout=timeout)
    baseline_model.fit(features=train_data.features, target=train_data.target,
                       predefined_model='rf')
    prediction = baseline_model.predict(features=test_data.features)
    metrics = baseline_model.get_metrics(target=test_data.target)
    print(metrics)
    return prediction
47+
48+
49+
if __name__ == '__main__':
    # Compare the fixed random-forest baseline with the AutoML search.
    train, test = get_kc2_data()
    run_classification_baseline(train, test)
    run_classification(train, test,
                       timeout=5.0, visualize=True)

0 commit comments

Comments
 (0)