From 30e200985c171e6a4dd7606a064084aaee19cce6 Mon Sep 17 00:00:00 2001
From: Chi Wang
Date: Sun, 13 Nov 2022 12:47:59 -0800
Subject: [PATCH] Fix issues related to zero-shot automl (#783)

* skip in-search-space check for small max iter
* resolve Pickle Transformer #730
* resolve default config unrecognized #784
* Change definition of init_config
* copy points_to_evaluate
* make test pass
* check learner selector
---
 flaml/automl.py                    | 44 ++++++++++++++----------------
 flaml/default/suggest.py           | 17 ++++++++----
 flaml/tune/searcher/blendsearch.py |  7 +++++
 flaml/version.py                   |  2 +-
 test/default/test_defaults.py      | 40 +++++++++++++++++++++------
 test/nlp/test_default.py           |  5 ++--
 6 files changed, 73 insertions(+), 42 deletions(-)

diff --git a/flaml/automl.py b/flaml/automl.py
index 71d7bd43b..2394660f3 100644
--- a/flaml/automl.py
+++ b/flaml/automl.py
@@ -122,7 +122,7 @@ class SearchState:
         ):
             self.init_eci = learner_class.cost_relative2lgbm()
         self._search_space_domain = {}
-        self.init_config = {}
+        self.init_config = None
         self.low_cost_partial_config = {}
         self.cat_hp_cost = {}
         self.data_size = data_size
@@ -183,6 +183,8 @@ class SearchState:
                 isinstance(starting_point, dict)
                 and starting_point.get(name) is not None
             ):
+                if self.init_config is None:
+                    self.init_config = {}
                 self.init_config[name] = starting_point[name]
             elif (
                 not isinstance(starting_point, list)
@@ -190,13 +192,15 @@ class SearchState:
                 and self.valid_starting_point_one_dim(
                     space["init_value"], space["domain"]
                 )
-            ):  # If starting point is list, no need to check the validity of self.init_config w.r.t search space
-                self.init_config[name] = space[
-                    "init_value"
-                ]  # If starting_point is list, no need to assign value to self.init_config here
+            ):
+                if self.init_config is None:
+                    self.init_config = {}
+                self.init_config[name] = space["init_value"]

         if isinstance(starting_point, list):
             self.init_config = starting_point
+        else:
+            self.init_config = [] if self.init_config is None else [self.init_config]

         self._hp_names = list(self._search_space_domain.keys())
         self.search_alg = None
@@ -268,7 +272,7 @@ class SearchState:
             self.val_loss, self.config = obj, config

     def get_hist_config_sig(self, sample_size, config):
-        config_values = tuple([config[k] for k in self._hp_names])
+        config_values = tuple([config[k] for k in self._hp_names if k in config])
         config_sig = str(sample_size) + "_" + str(config_values)
         return config_sig

@@ -1273,8 +1277,8 @@ class AutoML(BaseEstimator):
             )
         if self._df:
             X_train_all.reset_index(drop=True, inplace=True)
-            if isinstance(y_train_all, pd.Series):
-                y_train_all.reset_index(drop=True, inplace=True)
+        if isinstance(y_train_all, pd.Series):
+            y_train_all.reset_index(drop=True, inplace=True)

         X_train, y_train = X_train_all, y_train_all
         self._state.groups_all = self._state.groups
@@ -1987,10 +1991,7 @@ class AutoML(BaseEstimator):
         """
         points = []
         for estimator in self.estimator_list:
-            if isinstance(self._search_states[estimator].init_config, list):
-                configs = self._search_states[estimator].init_config
-            else:
-                configs = [self._search_states[estimator].init_config]
+            configs = self._search_states[estimator].init_config
             for config in configs:
                 config["learner"] = estimator
                 if len(self.estimator_list) > 1:
@@ -2862,7 +2863,9 @@ class AutoML(BaseEstimator):
                    "period"
                ),  # NOTE: this is after kwargs is updated to fit_kwargs_by_estimator
                custom_hp=custom_hp and custom_hp.get(estimator_name),
-               max_iter=max_iter,
+               max_iter=max_iter / len(estimator_list)
+               if self._learner_selector == "roundrobin"
+               else max_iter,
            )
logger.info("List of ML learners in AutoML Run: {}".format(estimator_list)) self.estimator_list = estimator_list @@ -2994,6 +2997,7 @@ class AutoML(BaseEstimator): metric_constraints=self.metric_constraints, seed=self._seed, time_budget_s=time_left, + allow_empty_config=True, ) else: # if self._hpo_method is bo, sometimes the search space and the initial config dimension do not match @@ -3207,11 +3211,7 @@ class AutoML(BaseEstimator): self._max_iter_per_learner = len(points_to_evaluate) low_cost_partial_config = None else: - points_to_evaluate = ( - search_state.init_config - if isinstance(search_state.init_config, list) - else [search_state.init_config] - ) + points_to_evaluate = search_state.init_config.copy() low_cost_partial_config = search_state.low_cost_partial_config if self._hpo_method in ("bs", "cfo", "grid", "cfocat", "random"): @@ -3230,6 +3230,7 @@ class AutoML(BaseEstimator): ], metric_constraints=self.metric_constraints, seed=self._seed, + allow_empty_config=True, ) else: # if self._hpo_method is bo, sometimes the search space and the initial config dimension do not match @@ -3435,17 +3436,12 @@ class AutoML(BaseEstimator): self.modelcount = 0 if self._max_iter < 2 and self.estimator_list and self._state.retrain_final: # when max_iter is 1, no need to search - # TODO: otherwise, need to make sure SearchStates.init_config is inside search space self.modelcount = self._max_iter self._max_iter = 0 self._best_estimator = estimator = self.estimator_list[0] self._selected = state = self._search_states[estimator] state.best_config_sample_size = self._state.data_size[0] - state.best_config = ( - state.init_config - if isinstance(state.init_config, dict) - else state.init_config[0] - ) + state.best_config = state.init_config[0] if state.init_config else {} elif self._use_ray is False: self._search_sequential() else: diff --git a/flaml/default/suggest.py b/flaml/default/suggest.py index aa22f0e0a..fbb02b069 100644 --- a/flaml/default/suggest.py +++ b/flaml/default/suggest.py @@ -45,7 +45,7 @@ def meta_feature(task, X_train, y_train, meta_feature_names): def load_config_predictor(estimator_name, task, location=None): - key = f"{estimator_name}_{task}" + key = f"{location}/{estimator_name}/{task}" predictor = CONFIG_PREDICTORS.get(key) if predictor: return predictor @@ -172,6 +172,15 @@ def suggest_hyperparams(task, X, y, estimator_or_predictor, location=None): return hyperparams, estimator_class +class AutoMLTransformer: + def __init__(self, model, data_transformer): + self._model = model + self._dt = data_transformer + + def transform(self, X): + return self._model._preprocess(self._dt.transform(X)) + + def preprocess_and_suggest_hyperparams( task, X, @@ -251,9 +260,5 @@ def preprocess_and_suggest_hyperparams( X = model._preprocess(X) hyperparams = hyperparams and model.params - class AutoMLTransformer: - def transform(self, X): - return model._preprocess(dt.transform(X)) - - transformer = AutoMLTransformer() + transformer = AutoMLTransformer(model, dt) return hyperparams, estimator_class, X, y, transformer, dt.label_transformer diff --git a/flaml/tune/searcher/blendsearch.py b/flaml/tune/searcher/blendsearch.py index 54ea20b84..94480d401 100644 --- a/flaml/tune/searcher/blendsearch.py +++ b/flaml/tune/searcher/blendsearch.py @@ -65,6 +65,7 @@ class BlendSearch(Searcher): experimental: Optional[bool] = False, lexico_objectives: Optional[dict] = None, use_incumbent_result_in_evaluation=False, + allow_empty_config=False, ): """Constructor. 
@@ -255,6 +256,7 @@ class BlendSearch(Searcher):
        else:
            self._candidate_start_points = None
        self._time_budget_s, self._num_samples = time_budget_s, num_samples
+       self._allow_empty_config = allow_empty_config
        if space is not None:
            self._init_search()

@@ -446,6 +448,8 @@ class BlendSearch(Searcher):
            for key, value in result.items():
                if key.startswith("config/"):
                    config[key[7:]] = value
+           if self._allow_empty_config and not config:
+               return
            signature = self._ls.config_signature(
                config, self._subspace.get(trial_id, {})
            )
@@ -775,6 +779,9 @@ class BlendSearch(Searcher):
                reward = self._evaluated_rewards.pop(0)
            else:
                init_config = self._ls.init_config
+           if self._allow_empty_config and not init_config:
+               assert reward is None, "Empty config can't have reward."
+               return init_config
            config, space = self._ls.complete_config(
                init_config, self._ls_bound_min, self._ls_bound_max
            )
diff --git a/flaml/version.py b/flaml/version.py
index 66c607f6d..b19b12ea3 100644
--- a/flaml/version.py
+++ b/flaml/version.py
@@ -1 +1 @@
-__version__ = "1.0.13"
+__version__ = "1.0.14"
diff --git a/test/default/test_defaults.py b/test/default/test_defaults.py
index 57304d1c5..77408b695 100644
--- a/test/default/test_defaults.py
+++ b/test/default/test_defaults.py
@@ -1,4 +1,5 @@
 import sys
+import pickle
 from sklearn.datasets import load_iris, fetch_california_housing, load_breast_cancer
 from sklearn.model_selection import train_test_split
 import pandas as pd
@@ -12,15 +13,6 @@ from flaml.default import (
 )


-def test_build_portfolio(path="test/default", strategy="greedy"):
-    sys.argv = f"portfolio.py --output {path} --input {path} --metafeatures {path}/all/metafeatures.csv --task binary --estimator lgbm xgboost xgb_limitdepth rf extra_tree --strategy {strategy}".split()
-    portfolio.main()
-    sys.argv = f"portfolio.py --output {path} --input {path} --metafeatures {path}/all/metafeatures.csv --task multiclass --estimator lgbm xgboost xgb_limitdepth rf extra_tree --strategy {strategy}".split()
-    portfolio.main()
-    sys.argv = f"portfolio.py --output {path} --input {path} --metafeatures {path}/all/metafeatures.csv --task regression --estimator lgbm xgboost xgb_limitdepth rf extra_tree --strategy {strategy}".split()
-    portfolio.main()
-
-
 def test_greedy_feedback(path="test/default", strategy="greedy-feedback"):
     # sys.argv = f"portfolio.py --output {path} --input {path} --metafeatures {path}/all/metafeatures.csv --task binary --estimator lgbm xgboost xgb_limitdepth rf extra_tree --strategy {strategy}".split()
     # portfolio.main()
@@ -30,6 +22,15 @@ def test_greedy_feedback(path="test/default", strategy="greedy-feedback"):
     portfolio.main()


+def test_build_portfolio(path="test/default", strategy="greedy"):
+    sys.argv = f"portfolio.py --output {path} --input {path} --metafeatures {path}/all/metafeatures.csv --task binary --estimator lgbm xgboost xgb_limitdepth rf extra_tree --strategy {strategy}".split()
+    portfolio.main()
+    sys.argv = f"portfolio.py --output {path} --input {path} --metafeatures {path}/all/metafeatures.csv --task multiclass --estimator lgbm xgboost xgb_limitdepth rf extra_tree --strategy {strategy}".split()
+    portfolio.main()
+    sys.argv = f"portfolio.py --output {path} --input {path} --metafeatures {path}/all/metafeatures.csv --task regression --estimator lgbm xgboost xgb_limitdepth rf extra_tree --strategy {strategy}".split()
+    portfolio.main()
+
+
 def test_iris(as_frame=True):
     automl = AutoML()
     automl_settings = {
@@ -96,6 +97,8 @@ def test_suggest_classification():
     ) = preprocess_and_suggest_hyperparams(
"classification", X_train, y_train, "lgbm", location=location ) + with open("test/default/feature_transformer", "wb") as f: + pickle.dump(feature_transformer, f, pickle.HIGHEST_PROTOCOL) model = estimator_class(**hyperparams) # estimator_class is LGBMClassifier model.fit(X, y) X_test = feature_transformer.transform(X_test) @@ -218,5 +221,24 @@ def test_xgboost(): print(regressor) +def test_nobudget(): + X_train, y_train = load_breast_cancer(return_X_y=True, as_frame=True) + automl = AutoML() + automl.fit( + X_train[:20], + y_train[:20], + estimator_list=["lgbm", "extra_tree", "rf"], + max_iter=12, + starting_points="data", + log_file_name="test/default/no_budget.txt", + log_type="all", + ) + automl.fit(X_train[:20], y_train[:20], estimator_list=["lgbm", "extra_tree", "rf"]) + # make sure that zero-shot config out of the search space does not degnerate to low cost init config + assert automl.best_config_per_estimator["extra_tree"]["n_estimators"] > 4 + # make sure that the zero-shot config {} is not modified + assert "criterion" not in automl.best_config_per_estimator["rf"] + + if __name__ == "__main__": test_build_portfolio("flaml/default") diff --git a/test/nlp/test_default.py b/test/nlp/test_default.py index fcddcda45..b0796b334 100644 --- a/test/nlp/test_default.py +++ b/test/nlp/test_default.py @@ -36,7 +36,8 @@ def test_starting_point_not_in_search_space(): automl.fit(X_train, y_train, **automl_settings) assert ( - automl._search_states[this_estimator_name].init_config["learning_rate"] != 2e-3 + automl._search_states[this_estimator_name].init_config[0]["learning_rate"] + != 2e-3 ) """ @@ -67,7 +68,7 @@ def test_starting_point_not_in_search_space(): automl_settings["starting_points"] = "data:test/nlp/default/" automl.fit(X_train, y_train, **automl_settings) - assert len(automl._search_states[this_estimator_name].init_config) == len( + assert len(automl._search_states[this_estimator_name].init_config[0]) == len( automl._search_states[this_estimator_name]._search_space_domain ) - len(automl_settings["custom_hp"][this_estimator_name]), ( "The search space is updated with the custom_hp on {} hyperparameters of "