From 30e200985c171e6a4dd7606a064084aaee19cce6 Mon Sep 17 00:00:00 2001
From: Chi Wang
Date: Sun, 13 Nov 2022 12:47:59 -0800
Subject: [PATCH] Fix issues related to zero-shot automl (#783)

* skip in-search-space check for small max iter
* resolve Pickle Transformer #730
* resolve default config unrecognized #784
* Change definition of init_config
* copy points_to_evaluate
* make test pass
* check learner selector
---
 flaml/automl.py                    | 44 ++++++++++++++----------------
 flaml/default/suggest.py           | 17 ++++++++----
 flaml/tune/searcher/blendsearch.py |  7 +++++
 flaml/version.py                   |  2 +-
 test/default/test_defaults.py      | 40 +++++++++++++++++++++------
 test/nlp/test_default.py           |  5 ++--
 6 files changed, 73 insertions(+), 42 deletions(-)

diff --git a/flaml/automl.py b/flaml/automl.py
index 71d7bd43b..2394660f3 100644
--- a/flaml/automl.py
+++ b/flaml/automl.py
@@ -122,7 +122,7 @@ class SearchState:
         ):
             self.init_eci = learner_class.cost_relative2lgbm()
         self._search_space_domain = {}
-        self.init_config = {}
+        self.init_config = None
         self.low_cost_partial_config = {}
         self.cat_hp_cost = {}
         self.data_size = data_size
@@ -183,6 +183,8 @@ class SearchState:
                 isinstance(starting_point, dict)
                 and starting_point.get(name) is not None
             ):
+                if self.init_config is None:
+                    self.init_config = {}
                 self.init_config[name] = starting_point[name]
             elif (
                 not isinstance(starting_point, list)
@@ -190,13 +192,15 @@ class SearchState:
                 and self.valid_starting_point_one_dim(
                     space["init_value"], space["domain"]
                 )
-            ):  # If starting point is list, no need to check the validity of self.init_config w.r.t search space
-                self.init_config[name] = space[
-                    "init_value"
-                ]  # If starting_point is list, no need to assign value to self.init_config here
+            ):
+                if self.init_config is None:
+                    self.init_config = {}
+                self.init_config[name] = space["init_value"]

         if isinstance(starting_point, list):
             self.init_config = starting_point
+        else:
+            self.init_config = [] if self.init_config is None else [self.init_config]

         self._hp_names = list(self._search_space_domain.keys())
         self.search_alg = None
@@ -268,7 +272,7 @@ class SearchState:
             self.val_loss, self.config = obj, config

     def get_hist_config_sig(self, sample_size, config):
-        config_values = tuple([config[k] for k in self._hp_names])
+        config_values = tuple([config[k] for k in self._hp_names if k in config])
         config_sig = str(sample_size) + "_" + str(config_values)
         return config_sig

@@ -1273,8 +1277,8 @@ class AutoML(BaseEstimator):
             )
         if self._df:
             X_train_all.reset_index(drop=True, inplace=True)
-            if isinstance(y_train_all, pd.Series):
-                y_train_all.reset_index(drop=True, inplace=True)
+        if isinstance(y_train_all, pd.Series):
+            y_train_all.reset_index(drop=True, inplace=True)

         X_train, y_train = X_train_all, y_train_all
         self._state.groups_all = self._state.groups
@@ -1987,10 +1991,7 @@ class AutoML(BaseEstimator):
         """
         points = []
         for estimator in self.estimator_list:
-            if isinstance(self._search_states[estimator].init_config, list):
-                configs = self._search_states[estimator].init_config
-            else:
-                configs = [self._search_states[estimator].init_config]
+            configs = self._search_states[estimator].init_config
             for config in configs:
                 config["learner"] = estimator
                 if len(self.estimator_list) > 1:
@@ -2862,7 +2863,9 @@ class AutoML(BaseEstimator):
                    "period"
                ),  # NOTE: this is after kwargs is updated to fit_kwargs_by_estimator
                custom_hp=custom_hp and custom_hp.get(estimator_name),
-               max_iter=max_iter,
+               max_iter=max_iter / len(estimator_list)
+               if self._learner_selector == "roundrobin"
+               else max_iter,
            )
logger.info("List of ML learners in AutoML Run: {}".format(estimator_list)) self.estimator_list = estimator_list @@ -2994,6 +2997,7 @@ class AutoML(BaseEstimator): metric_constraints=self.metric_constraints, seed=self._seed, time_budget_s=time_left, + allow_empty_config=True, ) else: # if self._hpo_method is bo, sometimes the search space and the initial config dimension do not match @@ -3207,11 +3211,7 @@ class AutoML(BaseEstimator): self._max_iter_per_learner = len(points_to_evaluate) low_cost_partial_config = None else: - points_to_evaluate = ( - search_state.init_config - if isinstance(search_state.init_config, list) - else [search_state.init_config] - ) + points_to_evaluate = search_state.init_config.copy() low_cost_partial_config = search_state.low_cost_partial_config if self._hpo_method in ("bs", "cfo", "grid", "cfocat", "random"): @@ -3230,6 +3230,7 @@ class AutoML(BaseEstimator): ], metric_constraints=self.metric_constraints, seed=self._seed, + allow_empty_config=True, ) else: # if self._hpo_method is bo, sometimes the search space and the initial config dimension do not match @@ -3435,17 +3436,12 @@ class AutoML(BaseEstimator): self.modelcount = 0 if self._max_iter < 2 and self.estimator_list and self._state.retrain_final: # when max_iter is 1, no need to search - # TODO: otherwise, need to make sure SearchStates.init_config is inside search space self.modelcount = self._max_iter self._max_iter = 0 self._best_estimator = estimator = self.estimator_list[0] self._selected = state = self._search_states[estimator] state.best_config_sample_size = self._state.data_size[0] - state.best_config = ( - state.init_config - if isinstance(state.init_config, dict) - else state.init_config[0] - ) + state.best_config = state.init_config[0] if state.init_config else {} elif self._use_ray is False: self._search_sequential() else: diff --git a/flaml/default/suggest.py b/flaml/default/suggest.py index aa22f0e0a..fbb02b069 100644 --- a/flaml/default/suggest.py +++ b/flaml/default/suggest.py @@ -45,7 +45,7 @@ def meta_feature(task, X_train, y_train, meta_feature_names): def load_config_predictor(estimator_name, task, location=None): - key = f"{estimator_name}_{task}" + key = f"{location}/{estimator_name}/{task}" predictor = CONFIG_PREDICTORS.get(key) if predictor: return predictor @@ -172,6 +172,15 @@ def suggest_hyperparams(task, X, y, estimator_or_predictor, location=None): return hyperparams, estimator_class +class AutoMLTransformer: + def __init__(self, model, data_transformer): + self._model = model + self._dt = data_transformer + + def transform(self, X): + return self._model._preprocess(self._dt.transform(X)) + + def preprocess_and_suggest_hyperparams( task, X, @@ -251,9 +260,5 @@ def preprocess_and_suggest_hyperparams( X = model._preprocess(X) hyperparams = hyperparams and model.params - class AutoMLTransformer: - def transform(self, X): - return model._preprocess(dt.transform(X)) - - transformer = AutoMLTransformer() + transformer = AutoMLTransformer(model, dt) return hyperparams, estimator_class, X, y, transformer, dt.label_transformer diff --git a/flaml/tune/searcher/blendsearch.py b/flaml/tune/searcher/blendsearch.py index 54ea20b84..94480d401 100644 --- a/flaml/tune/searcher/blendsearch.py +++ b/flaml/tune/searcher/blendsearch.py @@ -65,6 +65,7 @@ class BlendSearch(Searcher): experimental: Optional[bool] = False, lexico_objectives: Optional[dict] = None, use_incumbent_result_in_evaluation=False, + allow_empty_config=False, ): """Constructor. 
@@ -255,6 +256,7 @@ class BlendSearch(Searcher):
        else:
            self._candidate_start_points = None
        self._time_budget_s, self._num_samples = time_budget_s, num_samples
+       self._allow_empty_config = allow_empty_config
        if space is not None:
            self._init_search()

@@ -446,6 +448,8 @@ class BlendSearch(Searcher):
            for key, value in result.items():
                if key.startswith("config/"):
                    config[key[7:]] = value
+           if self._allow_empty_config and not config:
+               return
            signature = self._ls.config_signature(
                config, self._subspace.get(trial_id, {})
            )
@@ -775,6 +779,9 @@ class BlendSearch(Searcher):
                reward = self._evaluated_rewards.pop(0)
            else:
                init_config = self._ls.init_config
+           if self._allow_empty_config and not init_config:
+               assert reward is None, "Empty config can't have reward."
+               return init_config
            config, space = self._ls.complete_config(
                init_config, self._ls_bound_min, self._ls_bound_max
            )
diff --git a/flaml/version.py b/flaml/version.py
index 66c607f6d..b19b12ea3 100644
--- a/flaml/version.py
+++ b/flaml/version.py
@@ -1 +1 @@
-__version__ = "1.0.13"
+__version__ = "1.0.14"
diff --git a/test/default/test_defaults.py b/test/default/test_defaults.py
index 57304d1c5..77408b695 100644
--- a/test/default/test_defaults.py
+++ b/test/default/test_defaults.py
@@ -1,4 +1,5 @@
 import sys
+import pickle
 from sklearn.datasets import load_iris, fetch_california_housing, load_breast_cancer
 from sklearn.model_selection import train_test_split
 import pandas as pd
@@ -12,15 +13,6 @@ from flaml.default import (
 )


-def test_build_portfolio(path="test/default", strategy="greedy"):
-    sys.argv = f"portfolio.py --output {path} --input {path} --metafeatures {path}/all/metafeatures.csv --task binary --estimator lgbm xgboost xgb_limitdepth rf extra_tree --strategy {strategy}".split()
-    portfolio.main()
-    sys.argv = f"portfolio.py --output {path} --input {path} --metafeatures {path}/all/metafeatures.csv --task multiclass --estimator lgbm xgboost xgb_limitdepth rf extra_tree --strategy {strategy}".split()
-    portfolio.main()
-    sys.argv = f"portfolio.py --output {path} --input {path} --metafeatures {path}/all/metafeatures.csv --task regression --estimator lgbm xgboost xgb_limitdepth rf extra_tree --strategy {strategy}".split()
-    portfolio.main()
-
-
 def test_greedy_feedback(path="test/default", strategy="greedy-feedback"):
     # sys.argv = f"portfolio.py --output {path} --input {path} --metafeatures {path}/all/metafeatures.csv --task binary --estimator lgbm xgboost xgb_limitdepth rf extra_tree --strategy {strategy}".split()
     # portfolio.main()
@@ -30,6 +22,15 @@ def test_greedy_feedback(path="test/default", strategy="greedy-feedback"):
     portfolio.main()


+def test_build_portfolio(path="test/default", strategy="greedy"):
+    sys.argv = f"portfolio.py --output {path} --input {path} --metafeatures {path}/all/metafeatures.csv --task binary --estimator lgbm xgboost xgb_limitdepth rf extra_tree --strategy {strategy}".split()
+    portfolio.main()
+    sys.argv = f"portfolio.py --output {path} --input {path} --metafeatures {path}/all/metafeatures.csv --task multiclass --estimator lgbm xgboost xgb_limitdepth rf extra_tree --strategy {strategy}".split()
+    portfolio.main()
+    sys.argv = f"portfolio.py --output {path} --input {path} --metafeatures {path}/all/metafeatures.csv --task regression --estimator lgbm xgboost xgb_limitdepth rf extra_tree --strategy {strategy}".split()
+    portfolio.main()
+
+
 def test_iris(as_frame=True):
     automl = AutoML()
     automl_settings = {
@@ -96,6 +97,8 @@ def test_suggest_classification():
     ) = preprocess_and_suggest_hyperparams(
"classification", X_train, y_train, "lgbm", location=location ) + with open("test/default/feature_transformer", "wb") as f: + pickle.dump(feature_transformer, f, pickle.HIGHEST_PROTOCOL) model = estimator_class(**hyperparams) # estimator_class is LGBMClassifier model.fit(X, y) X_test = feature_transformer.transform(X_test) @@ -218,5 +221,24 @@ def test_xgboost(): print(regressor) +def test_nobudget(): + X_train, y_train = load_breast_cancer(return_X_y=True, as_frame=True) + automl = AutoML() + automl.fit( + X_train[:20], + y_train[:20], + estimator_list=["lgbm", "extra_tree", "rf"], + max_iter=12, + starting_points="data", + log_file_name="test/default/no_budget.txt", + log_type="all", + ) + automl.fit(X_train[:20], y_train[:20], estimator_list=["lgbm", "extra_tree", "rf"]) + # make sure that zero-shot config out of the search space does not degnerate to low cost init config + assert automl.best_config_per_estimator["extra_tree"]["n_estimators"] > 4 + # make sure that the zero-shot config {} is not modified + assert "criterion" not in automl.best_config_per_estimator["rf"] + + if __name__ == "__main__": test_build_portfolio("flaml/default") diff --git a/test/nlp/test_default.py b/test/nlp/test_default.py index fcddcda45..b0796b334 100644 --- a/test/nlp/test_default.py +++ b/test/nlp/test_default.py @@ -36,7 +36,8 @@ def test_starting_point_not_in_search_space(): automl.fit(X_train, y_train, **automl_settings) assert ( - automl._search_states[this_estimator_name].init_config["learning_rate"] != 2e-3 + automl._search_states[this_estimator_name].init_config[0]["learning_rate"] + != 2e-3 ) """ @@ -67,7 +68,7 @@ def test_starting_point_not_in_search_space(): automl_settings["starting_points"] = "data:test/nlp/default/" automl.fit(X_train, y_train, **automl_settings) - assert len(automl._search_states[this_estimator_name].init_config) == len( + assert len(automl._search_states[this_estimator_name].init_config[0]) == len( automl._search_states[this_estimator_name]._search_space_domain ) - len(automl_settings["custom_hp"][this_estimator_name]), ( "The search space is updated with the custom_hp on {} hyperparameters of "