warning -> info for low cost partial config (microsoft#231)
* warning -> info for low cost partial config
microsoft#195, microsoft#110

* when n_estimators < 0, use trained_estimator's

* log debug info

* test random seed

* remove "objective"; avoid ZeroDivisionError

* hp config to estimator params

* check type of searcher

* default n_jobs

* try import

* Update searchalgo_auto.py

* CLASSIFICATION

* auto_augment flag

* min_sample_size

* make catboost optional
sonichi authored Oct 8, 2021
1 parent a99e939 commit f48ca26
Showing 22 changed files with 1,941 additions and 1,862 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -103,7 +103,7 @@ print(automl.model)

```python
from flaml import AutoML
-from sklearn.datasets import load_boston
+from sklearn.datasets import fetch_california_housing
# Initialize an AutoML instance
automl = AutoML()
# Specify automl goal and constraint
@@ -113,7 +113,7 @@ automl_settings = {
"task": 'regression',
"log_file_name": "test/boston.log",
}
-X_train, y_train = load_boston(return_X_y=True)
+X_train, y_train = fetch_california_housing(return_X_y=True)
# Train with labeled input data
automl.fit(X_train=X_train, y_train=y_train,
**automl_settings)
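
For context (not part of the diff): load_boston was deprecated and later removed from scikit-learn, and fetch_california_housing is the drop-in regression dataset the updated README uses. A quick sanity check of the replacement loader:

```python
from sklearn.datasets import fetch_california_housing

# Returns a (features, target) tuple, just as load_boston did with return_X_y=True.
X, y = fetch_california_housing(return_X_y=True)
print(X.shape, y.shape)  # (20640, 8) (20640,)
```
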
66 changes: 39 additions & 27 deletions flaml/automl.py
@@ -36,7 +36,7 @@
N_SPLITS,
SAMPLE_MULTIPLY_FACTOR,
)
-from .data import concat
+from .data import concat, CLASSIFICATION
from . import tune
from .training_log import training_log_reader, training_log_writer

@@ -619,7 +619,8 @@ def _prepare_data(self, eval_method, split_ratio, n_splits):
if issparse(X_train_all):
X_train_all = X_train_all.tocsr()
if (
-self._state.task in ("binary", "multi")
+self._state.task in CLASSIFICATION
+and self._auto_augment
and self._state.fit_kwargs.get("sample_weight") is None
and self._split_type not in ["time", "group"]
):
@@ -725,7 +726,7 @@ def _prepare_data(self, eval_method, split_ratio, n_splits):
y_train, y_val = y_train_all[train_idx], y_train_all[val_idx]
self._state.groups = self._state.groups_all[train_idx]
self._state.groups_val = self._state.groups_all[val_idx]
-elif self._state.task in ("binary", "multi"):
+elif self._state.task in CLASSIFICATION:
# for classification, make sure the labels are complete in both
# training and validation data
label_set, first = np.unique(y_train_all, return_index=True)
@@ -904,10 +905,11 @@ def retrain_from_log(
n_splits=N_SPLITS,
split_type=None,
groups=None,
-n_jobs=1,
+n_jobs=-1,
train_best=True,
train_full=False,
record_id=-1,
+auto_augment=True,
**fit_kwargs,
):
"""Retrain from log file
@@ -943,7 +945,8 @@ def retrain_from_log(
groups: None or array-like | Group labels (with matching length to
y_train) or groups counts (with sum equal to length of y_train)
for training data.
-n_jobs: An integer of the number of threads for training.
+n_jobs: An integer of the number of threads for training. Use all
+available resources when n_jobs == -1.
train_best: A boolean of whether to train the best config in the
time budget; if false, train the last config in the budget.
train_full: A boolean of whether to train on the full data. If true,
@@ -952,6 +955,8 @@
be retrained. By default `record_id = -1` which means this will be
ignored. `record_id = 0` corresponds to the first trial, and
when `record_id >= 0`, `time_budget` will be ignored.
+auto_augment: boolean, default=True | Whether to automatically
+augment rare classes.
**fit_kwargs: Other key word arguments to pass to fit() function of
the searched learners, such as sample_weight.
"""
@@ -1018,6 +1023,7 @@ def retrain_from_log(
elif eval_method == "auto":
eval_method = self._decide_eval_method(time_budget)
self.modelcount = 0
+self._auto_augment = auto_augment
self._prepare_data(eval_method, split_ratio, n_splits)
self._state.time_budget = None
self._state.n_jobs = n_jobs
@@ -1032,7 +1038,7 @@ def _decide_split_type(self, split_type):
self._state.task = get_classification_objective(
len(np.unique(self._y_train_all))
)
-if self._state.task in ("binary", "multi"):
+if self._state.task in CLASSIFICATION:
assert split_type in [None, "stratified", "uniform", "time", "group"]
self._split_type = (
split_type or self._state.groups is None and "stratified" or "group"
@@ -1191,7 +1197,7 @@ def min_resource(self) -> Optional[float]:
Returns:
A float for the minimal sample size or None
"""
-return MIN_SAMPLE_TRAIN if self._sample else None
+return self._min_sample_size if self._sample else None

@property
def max_resource(self) -> Optional[float]:
@@ -1282,7 +1288,7 @@ def fit(
sample_weight_val=None,
groups_val=None,
groups=None,
-verbose=1,
+verbose=3,
retrain_full=True,
split_type=None,
learner_selector="sample",
@@ -1291,8 +1297,10 @@ def fit(
seed=None,
n_concurrent_trials=1,
keep_search_state=False,
-append_log=False,
early_stop=False,
+append_log=False,
+auto_augment=True,
+min_sample_size=MIN_SAMPLE_TRAIN,
**fit_kwargs,
):
"""Find a model for a given task
@@ -1375,7 +1383,7 @@ def custom_metric(
groups: None or array-like | Group labels (with matching length to
y_train) or groups counts (with sum equal to length of y_train)
for training data.
-verbose: int, default=1 | Controls the verbosity, higher means more
+verbose: int, default=3 | Controls the verbosity, higher means more
messages.
retrain_full: bool or str, default=True | whether to retrain the
selected model on the full training data when using holdout.
@@ -1412,8 +1420,12 @@ def custom_metric(
saving.
early_stop: boolean, default=False | Whether to stop early if the
search is considered to converge.
-append_log: boolean, default=False | whetehr to directly append the log
+append_log: boolean, default=False | Whetehr to directly append the log
records to the input log file if it exists.
+auto_augment: boolean, default=True | Whether to automatically
+augment rare classes.
+min_sample_size: int, default=MIN_SAMPLE_TRAIN | the minimal sample
+size when sample=True.
**fit_kwargs: Other key word arguments to pass to fit() function of
the searched learners, such as sample_weight. Include period as
a key word argument for 'forecast' task.
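A sketch of how the new fit() arguments documented above could be passed, mirroring the README example earlier in this commit. The time_budget value and the dataset are assumptions for illustration and are not part of the diff.

```python
from flaml import AutoML
from sklearn.datasets import fetch_california_housing

X_train, y_train = fetch_california_housing(return_X_y=True)
automl = AutoML()
automl.fit(
    X_train=X_train,
    y_train=y_train,
    task="regression",
    time_budget=60,
    verbose=3,              # new default: INFO-level messages
    auto_augment=True,      # new flag: automatically augment rare classes
    min_sample_size=10000,  # minimal sample size when sample=True (value assumed)
)
```
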
@@ -1435,8 +1447,8 @@ def custom_metric(
self._learner_selector = learner_selector
old_level = logger.getEffectiveLevel()
self.verbose = verbose
-if verbose == 0:
-logger.setLevel(logging.WARNING)
+# if verbose == 0:
+logger.setLevel(50 - verbose * 10)
if (not mlflow or not mlflow.active_run()) and not logger.handlers:
# Add the console handler.
_ch = logging.StreamHandler()
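
The new verbosity handling maps the verbose argument onto standard logging levels via 50 - verbose * 10; a minimal standalone sketch of that mapping (not part of the diff):

```python
import logging

# verbose=0 -> CRITICAL, 1 -> ERROR, 2 -> WARNING, 3 -> INFO (new default), 4 -> DEBUG
for verbose in range(5):
    level = 50 - verbose * 10
    print(verbose, logging.getLevelName(level))
```
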
@@ -1457,12 +1469,14 @@ def custom_metric(
and (eval_method == "holdout" and self._state.X_val is None)
or (eval_method == "cv")
)
+self._auto_augment = auto_augment
+self._min_sample_size = min_sample_size
self._prepare_data(eval_method, split_ratio, n_splits)
self._sample = (
sample
and task != "rank"
and eval_method != "cv"
-and (MIN_SAMPLE_TRAIN * SAMPLE_MULTIPLY_FACTOR < self._state.data_size)
+and (self._min_sample_size * SAMPLE_MULTIPLY_FACTOR < self._state.data_size)
)
if "auto" == metric:
if "binary" in self._state.task:
@@ -1584,8 +1598,8 @@ def custom_metric(
for state in self._search_states.values():
if state.trained_estimator:
del state.trained_estimator
-if verbose == 0:
-logger.setLevel(old_level)
+# if verbose == 0:
+logger.setLevel(old_level)

def _search_parallel(self):
try:
@@ -1631,6 +1645,8 @@ def _search_parallel(self):
points_to_evaluate=points_to_evaluate,
)
else:
+self._state.time_from_start = time.time() - self._start_time_flag
+time_left = self._state.time_budget - self._state.time_from_start
search_alg = SearchAlgo(
metric="val_loss",
space=space,
@@ -1645,13 +1661,9 @@ def _search_parallel(self):
],
metric_constraints=self.metric_constraints,
seed=self._seed,
+time_budget_s=time_left,
)
search_alg = ConcurrencyLimiter(search_alg, self._n_concurrent_trials)
-self._state.time_from_start = time.time() - self._start_time_flag
-time_left = self._state.time_budget - self._state.time_from_start
-search_alg.set_search_properties(
-None, None, config={"time_budget_s": time_left}
-)
resources_per_trial = (
{"cpu": self._state.n_jobs} if self._state.n_jobs > 1 else None
)
@@ -1782,7 +1794,7 @@ def _search_sequential(self):
search_space = search_state.search_space
if self._sample:
prune_attr = "FLAML_sample_size"
-min_resource = MIN_SAMPLE_TRAIN
+min_resource = self._min_sample_size
max_resource = self._state.data_size
else:
prune_attr = min_resource = max_resource = None
@@ -1840,10 +1852,10 @@ def _search_sequential(self):
else:
search_space = None
if self._hpo_method in ("bs", "cfo", "cfocat"):
-search_state.search_alg.set_search_properties(
+search_state.search_alg.searcher.set_search_properties(
metric=None,
mode=None,
-config={
+setting={
"metric_target": self._state.best_loss,
},
)
@@ -1852,7 +1864,7 @@ def _search_sequential(self):
search_state.training_function,
search_alg=search_state.search_alg,
time_budget_s=min(budget_left, self._state.train_time_limit),
-verbose=max(self.verbose - 1, 0),
+verbose=max(self.verbose - 3, 0),
use_ray=False,
)
time_used = time.time() - start_run_time
@@ -2077,7 +2089,7 @@ def _search(self):
logger.info(estimators)
if len(estimators) <= 1:
return
-if self._state.task in ("binary", "multi"):
+if self._state.task in CLASSIFICATION:
from sklearn.ensemble import StackingClassifier as Stacker
else:
from sklearn.ensemble import StackingRegressor as Stacker
@@ -2184,7 +2196,7 @@ def _select_estimator(self, estimator_list):
speed = delta_loss / delta_time
if speed:
estimated_cost = max(2 * gap / speed, estimated_cost)
-estimated_cost == estimated_cost or 1e-10
+estimated_cost == estimated_cost or 1e-9
inv.append(1 / estimated_cost)
else:
estimated_cost = self._eci[i]
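
Per the commit message ("avoid ZeroDivisionError"), the intent of the changed line above is to replace a zero estimated cost with a tiny epsilon before taking its reciprocal; a standalone sketch of that guard, written as a plain assignment for clarity:

```python
estimated_cost = 0.0
estimated_cost = estimated_cost or 1e-9  # fall back to a small positive cost
inv = 1 / estimated_cost                 # a large finite number instead of ZeroDivisionError
print(inv)
```
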
11 changes: 5 additions & 6 deletions flaml/data.py
@@ -1,15 +1,18 @@
"""!
-* Copyright (c) 2020-2021 Microsoft Corporation. All rights reserved.
+* Copyright (c) Microsoft Corporation. All rights reserved.
* Licensed under the MIT License.
"""

import numpy as np
from scipy.sparse import vstack, issparse
import pandas as pd

from .training_log import training_log_reader

from datetime import datetime

+CLASSIFICATION = ("binary", "multi", "classification")


def load_openml_dataset(
dataset_id, data_dir=None, random_state=0, dataset_format="dataframe"
@@ -300,11 +303,7 @@ def fit_transform(self, X, y, task):
)
self._drop = drop

-if task in (
-"binary",
-"multi",
-"classification",
-) or not pd.api.types.is_numeric_dtype(y):
+if task in CLASSIFICATION or not pd.api.types.is_numeric_dtype(y):
from sklearn.preprocessing import LabelEncoder

self.label_transformer = LabelEncoder()
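
A sketch of the pattern this file now supports: task-type checks test membership in the shared CLASSIFICATION tuple instead of repeating string literals at each call site. The is_classification helper is hypothetical and shown only for illustration.

```python
CLASSIFICATION = ("binary", "multi", "classification")

def is_classification(task: str) -> bool:
    return task in CLASSIFICATION

assert is_classification("multi")
assert not is_classification("regression")
```
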
4 changes: 2 additions & 2 deletions flaml/ml.py
@@ -33,7 +33,7 @@
ARIMA,
SARIMAX,
)
-from .data import group_counts
+from .data import CLASSIFICATION, group_counts

import logging

@@ -301,7 +301,7 @@ def evaluate_model_CV(
valid_fold_num = total_fold_num = 0
n = kf.get_n_splits()
X_train_split, y_train_split = X_train_all, y_train_all
-if task in ("binary", "multi"):
+if task in CLASSIFICATION:
labels = np.unique(y_train_all)
else:
labels = None