From a99e939404caeda88f32724cc264841f2f5dcfca Mon Sep 17 00:00:00 2001
From: Chi Wang
Date: Mon, 27 Sep 2021 21:30:49 -0700
Subject: [PATCH] update config if n_estimators is modified (#225)

* update config if n_estimators is modified
* prediction as int
* handle the case n_estimators <= 0
* if trained and no budget to train more, return the trained model
* split_type=group for classification & regression
---
 flaml/automl.py       |  66 ++++++++++++++++---------
 flaml/ml.py           |   3 ++
 flaml/model.py        |  52 +++++++++++++-------
 flaml/training_log.py | 111 +++++++++++++++++++++---------------------
 flaml/version.py      |   2 +-
 test/test_split.py    |  37 +++++++++-----
 6 files changed, 163 insertions(+), 108 deletions(-)

diff --git a/flaml/automl.py b/flaml/automl.py
index b281cd7e6..d5441f23d 100644
--- a/flaml/automl.py
+++ b/flaml/automl.py
@@ -117,9 +117,17 @@ class SearchState:
             time2eval = result["time_total_s"]
             trained_estimator = result["trained_estimator"]
             del result["trained_estimator"]  # free up RAM
+            n_iter = trained_estimator and trained_estimator.params.get("n_estimators")
+            if (
+                n_iter is not None
+                and "n_estimators" in config
+                and n_iter >= self._search_space_domain["n_estimators"].lower
+            ):
+                config["n_estimators"] = n_iter
+                n_iter = None
         else:
             obj, time2eval, trained_estimator = np.inf, 0.0, None
-            metric_for_logging = config = None
+            metric_for_logging = config = n_iter = None
         self.trial_time = time2eval
         self.total_time_used += time_used
         self.total_iter += 1
@@ -147,8 +155,10 @@ class SearchState:
                 self.trained_estimator.cleanup()
             if trained_estimator:
                 self.trained_estimator = trained_estimator
+                self.best_n_iter = n_iter
         self.metric_for_logging = metric_for_logging
         self.val_loss, self.config = obj, config
+        self.n_iter = n_iter

     def get_hist_config_sig(self, sample_size, config):
         config_values = tuple([config[k] for k in self._hp_names])
@@ -251,7 +261,9 @@ class AutoMLState:
             # tune.report(**result)
         return result

-    def _train_with_config(self, estimator, config_w_resource, sample_size=None):
+    def _train_with_config(
+        self, estimator, config_w_resource, sample_size=None, n_iter=None
+    ):
         if not sample_size:
             sample_size = config_w_resource.get(
                 "FLAML_sample_size", len(self.y_train_all)
@@ -288,6 +300,7 @@ class AutoMLState:
                 self.n_jobs,
                 self.learner_classes.get(estimator),
                 budget,
+                n_iter,
                 self.fit_kwargs,
             )
             if sampled_weight is not None:
@@ -444,7 +457,9 @@ class AutoML:
         if y_pred.ndim > 1 and isinstance(y_pred, np.ndarray):
             y_pred = y_pred.flatten()
         if self._label_transformer:
-            return self._label_transformer.inverse_transform(pd.Series(y_pred))
+            return self._label_transformer.inverse_transform(
+                pd.Series(y_pred.astype(int))
+            )
         else:
             return y_pred
@@ -606,7 +621,7 @@ class AutoML:
         if (
             self._state.task in ("binary", "multi")
             and self._state.fit_kwargs.get("sample_weight") is None
-            and self._split_type != "time"
+            and self._split_type not in ["time", "group"]
         ):
             # logger.info(f"label {pd.unique(y_train_all)}")
             label_set, counts = np.unique(y_train_all, return_counts=True)
@@ -695,12 +710,12 @@ class AutoML:
                     test_size=split_ratio,
                     shuffle=False,
                 )
-        elif self._state.task == "rank":
+        elif self._split_type == "group":
            gss = GroupShuffleSplit(
                n_splits=1, test_size=split_ratio, random_state=RANDOM_SEED
            )
            for train_idx, val_idx in gss.split(
-                X_train_all, y_train_all, self._state.groups
+                X_train_all, y_train_all, self._state.groups_all
            ):
                if self._df:
                    X_train = X_train_all.iloc[train_idx]
                    X_val = X_train_all.iloc[val_idx]
                else:
                    X_train, X_val = X_train_all[train_idx], X_train_all[val_idx]
                y_train, y_val = y_train_all[train_idx], y_train_all[val_idx]
-                self._state.groups = self._state.groups[train_idx]
-                self._state.groups_val = self._state.groups[val_idx]
+                self._state.groups = self._state.groups_all[train_idx]
+                self._state.groups_val = self._state.groups_all[val_idx]
        elif self._state.task in ("binary", "multi"):
            # for classification, make sure the labels are complete in both
            # training and validation data
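The hunk above is the heart of the `split_type=group` feature: any task whose split type resolves to "group" now goes through `GroupShuffleSplit`, so rows sharing a group label never straddle the train/validation boundary. A minimal standalone sketch of that behavior, using made-up toy data rather than FLAML's internals:

import numpy as np
from sklearn.model_selection import GroupShuffleSplit

X = np.arange(20).reshape(10, 2)
y = np.arange(10) % 2
groups = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4])  # toy group labels

gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=1)
for train_idx, val_idx in gss.split(X, y, groups):
    # every group label lands entirely in train or entirely in validation
    assert not set(groups[train_idx]) & set(groups[val_idx])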
@@ -920,7 +935,7 @@ class AutoML:
             n_splits: An integer of the number of folds for cross-validation.
             split_type: str or None, default=None | the data split type.
                 For classification tasks, valid choices are [
-                    None, 'stratified', 'uniform', 'time']. None -> stratified.
+                    None, 'stratified', 'uniform', 'time', 'group']. None -> stratified.
                 For regression tasks, valid choices are [None, 'uniform', 'time'].
                     None -> uniform.
                 For time series forecasting, must be None or 'time'.
@@ -1007,7 +1022,7 @@ class AutoML:
         self._state.time_budget = None
         self._state.n_jobs = n_jobs
         self._trained_estimator = self._state._train_with_config(
-            best_estimator, best_config, sample_size
+            best_estimator, best_config, sample_size, best.n_iter
         )[0]
         logger.info("retrain from log succeeded")
         return training_duration
@@ -1018,10 +1033,12 @@ class AutoML:
                 len(np.unique(self._y_train_all))
             )
         if self._state.task in ("binary", "multi"):
-            assert split_type in [None, "stratified", "uniform", "time"]
-            self._split_type = split_type or "stratified"
+            assert split_type in [None, "stratified", "uniform", "time", "group"]
+            self._split_type = (
+                split_type or self._state.groups is None and "stratified" or "group"
+            )
         elif self._state.task == "regression":
-            assert split_type in [None, "uniform", "time"]
+            assert split_type in [None, "uniform", "time", "group"]
             self._split_type = split_type or "uniform"
         elif self._state.task == "forecast":
             assert split_type in [None, "time"]
@@ -1420,15 +1437,16 @@ class AutoML:
         self.verbose = verbose
         if verbose == 0:
             logger.setLevel(logging.WARNING)
-        self._decide_split_type(split_type)
-        if eval_method == "auto" or self._state.X_val is not None:
-            eval_method = self._decide_eval_method(time_budget)
-        self._state.eval_method = eval_method
         if (not mlflow or not mlflow.active_run()) and not logger.handlers:
             # Add the console handler.
             _ch = logging.StreamHandler()
             _ch.setFormatter(logger_formatter)
             logger.addHandler(_ch)
+        self._decide_split_type(split_type)
+        logger.info(f"Data split method: {self._split_type}")
+        if eval_method == "auto" or self._state.X_val is not None:
+            eval_method = self._decide_eval_method(time_budget)
+        self._state.eval_method = eval_method
         logger.info("Evaluation method: {}".format(eval_method))

         self._retrain_in_budget = retrain_full == "budget" and (
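One subtle line in the `_decide_split_type` hunk above is the chained `and`/`or` default: an explicit `split_type` always wins; otherwise "stratified" is used only when no `groups` were passed, and "group" otherwise. A sketch of the same expression in isolation (function name hypothetical):

def pick_split_type(split_type, groups):
    # mirrors: split_type or groups is None and "stratified" or "group";
    # `and` binds tighter than `or`, so it reads as
    # split_type or ((groups is None) and "stratified") or "group"
    return split_type or (groups is None and "stratified") or "group"

assert pick_split_type(None, None) == "stratified"
assert pick_split_type(None, [0, 1]) == "group"
assert pick_split_type("uniform", [0, 1]) == "uniform"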
@@ -1697,10 +1715,9 @@ class AutoML:
                 self._state.time_from_start,
                 search_state.val_loss,
                 config,
-                self._state.best_loss,
-                search_state.best_config,
                 estimator,
                 search_state.sample_size,
+                search_state.n_iter,
             )

     def _search_sequential(self):
@@ -1909,10 +1926,9 @@ class AutoML:
                         self._state.time_from_start,
                         search_state.val_loss,
                         search_state.config,
-                        self._state.best_loss,
-                        search_state.best_config,
                         estimator,
                         search_state.sample_size,
+                        search_state.n_iter,
                     )
                 if mlflow is not None and mlflow.active_run():
                     with mlflow.start_run(nested=True):
@@ -1985,10 +2001,12 @@ class AutoML:
                     <= est_retrain_time + next_trial_time
                 )
             ):
+                state = self._search_states[self._best_estimator]
                 self._trained_estimator, retrain_time = self._state._train_with_config(
                     self._best_estimator,
-                    self._search_states[self._best_estimator].best_config,
+                    state.best_config,
                     self.data_size_full,
+                    state.best_n_iter,
                 )
                 logger.info(
                     "retrain {} for {:.1f}s".format(self._best_estimator, retrain_time)
@@ -2093,13 +2111,15 @@ class AutoML:
                     > self._selected.est_retrain_time(self.data_size_full)
                     and self._selected.best_config_sample_size == self._state.data_size
                 ):
+                    state = self._search_states[self._best_estimator]
                     (
                         self._trained_estimator,
                         retrain_time,
                     ) = self._state._train_with_config(
                         self._best_estimator,
-                        self._search_states[self._best_estimator].best_config,
+                        state.best_config,
                         self.data_size_full,
+                        state.best_n_iter,
                     )
                     logger.info(
                         "retrain {} for {:.1f}s".format(
diff --git a/flaml/ml.py b/flaml/ml.py
index 37206ce4f..19ff9c26c 100644
--- a/flaml/ml.py
+++ b/flaml/ml.py
@@ -465,11 +465,14 @@ def train_estimator(
     n_jobs=1,
     estimator_class=None,
     budget=None,
+    n_iter=None,
     fit_kwargs={},
 ):
     start_time = time.time()
     estimator_class = estimator_class or get_estimator_class(task, estimator_name)
     estimator = estimator_class(**config_dic, task=task, n_jobs=n_jobs)
+    if n_iter is not None:
+        estimator.params["n_estimators"] = n_iter
     if X_train is not None:
         train_time = estimator.fit(X_train, y_train, budget, **fit_kwargs)
     else:
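The `n_iter` override above and the flaml/model.py changes below share one idea: after probing the time per boosting iteration, the remaining budget is converted into a cap on `n_estimators`, and with the new `trained` flag the already-fitted model is returned when the cap would not add iterations. A standalone sketch of that arithmetic (names hypothetical, not the estimator's exact code):

import time

def capped_n_estimators(n_iter, budget, start_time, t1, time_per_iter):
    # mirrors min(n_iter, int((budget - elapsed - t1) / time_per_iter + 1)):
    # never request more trees than the remaining budget should allow
    remaining = budget - (time.time() - start_time) - t1
    return min(n_iter, int(remaining / time_per_iter + 1))

start = time.time()
# a 1000-tree config gets capped to what ~8s of budget can pay for
print(capped_n_estimators(1000, budget=8.0, start_time=start, t1=0.2, time_per_iter=0.05))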
diff --git a/flaml/model.py b/flaml/model.py
index bf1092b27..cdce9812a 100644
--- a/flaml/model.py
+++ b/flaml/model.py
@@ -316,15 +316,18 @@ class LGBMEstimator(BaseEstimator):
     def fit(self, X_train, y_train, budget=None, **kwargs):
         start_time = time.time()
         n_iter = self.params["n_estimators"]
+        trained = False
         if (
-            not self._time_per_iter or abs(self._train_size - X_train.shape[0]) > 4
-        ) and budget is not None:
+            (not self._time_per_iter or abs(self._train_size - X_train.shape[0]) > 4)
+            and budget is not None
+            and n_iter > 1
+        ):
             self.params["n_estimators"] = 1
             self._t1 = self._fit(X_train, y_train, **kwargs)
             if self._t1 >= budget:
-                self.params["n_estimators"] = n_iter
+                # self.params["n_estimators"] = n_iter
                 return self._t1
-            self.params["n_estimators"] = 4
+            self.params["n_estimators"] = min(n_iter, 4)
             self._t2 = self._fit(X_train, y_train, **kwargs)
             self._time_per_iter = (
                 (self._t2 - self._t1) / (self.params["n_estimators"] - 1)
                 if self._t2 > self._t1
                 else self._t2
             )
             self._train_size = X_train.shape[0]
             if self._t1 + self._t2 >= budget or n_iter == self.params["n_estimators"]:
-                self.params["n_estimators"] = n_iter
+                # self.params["n_estimators"] = n_iter
                 return time.time() - start_time
-        if budget is not None:
-            self.params["n_estimators"] = min(
+            trained = True
+        if budget is not None and n_iter > 1:
+            max_iter = min(
                 n_iter,
                 int(
                     (budget - time.time() + start_time - self._t1)
                     / self._time_per_iter
                     + 1
                 ),
             )
+            if trained and max_iter <= self.params["n_estimators"]:
+                return time.time() - start_time
+            self.params["n_estimators"] = max_iter
         if self.params["n_estimators"] > 0:
             self._fit(X_train, y_train, **kwargs)
-        self.params["n_estimators"] = n_iter
+        else:
+            self.params["n_estimators"] = n_iter
         train_time = time.time() - start_time
         return train_time
@@ -787,10 +795,15 @@ class CatBoostEstimator(BaseEstimator):
             cat_features = []
         # from catboost import CatBoostError
         # try:
+        trained = False
         if (
-            not CatBoostEstimator._time_per_iter
-            or abs(CatBoostEstimator._train_size - len(y_train)) > 4
-        ) and budget:
+            (
+                not CatBoostEstimator._time_per_iter
+                or abs(CatBoostEstimator._train_size - len(y_train)) > 4
+            )
+            and budget
+            and n_iter > 4
+        ):
             # measure the time per iteration
             self.params["n_estimators"] = 1
             CatBoostEstimator._smallmodel = self.estimator_class(
@@ -801,11 +814,11 @@
             )
             CatBoostEstimator._t1 = time.time() - start_time
             if CatBoostEstimator._t1 >= budget:
-                self.params["n_estimators"] = n_iter
+                # self.params["n_estimators"] = n_iter
                 self._model = CatBoostEstimator._smallmodel
                 shutil.rmtree(train_dir, ignore_errors=True)
                 return CatBoostEstimator._t1
-            self.params["n_estimators"] = 4
+            self.params["n_estimators"] = min(n_iter, 4)
             CatBoostEstimator._smallmodel = self.estimator_class(
                 train_dir=train_dir, **self.params
             )
@@ -822,13 +835,14 @@
                 time.time() - start_time >= budget
                 or n_iter == self.params["n_estimators"]
             ):
-                self.params["n_estimators"] = n_iter
+                # self.params["n_estimators"] = n_iter
                 self._model = CatBoostEstimator._smallmodel
                 shutil.rmtree(train_dir, ignore_errors=True)
                 return time.time() - start_time
-        if budget:
+            trained = True
+        if budget and n_iter > 4:
             train_times = 1
-            self.params["n_estimators"] = min(
+            max_iter = min(
                 n_iter,
                 int(
                     (budget - time.time() + start_time - CatBoostEstimator._t1)
@@ -838,6 +852,9 @@
                     / CatBoostEstimator._time_per_iter
                     + 1
                 ),
             )
             self._model = CatBoostEstimator._smallmodel
+            if trained and max_iter <= self.params["n_estimators"]:
+                return time.time() - start_time
+            self.params["n_estimators"] = max_iter
         if self.params["n_estimators"] > 0:
             n = max(int(len(y_train) * 0.9), len(y_train) - 1000)
             X_tr, y_tr = X_train[:n], y_train[:n]
@@ -863,9 +880,10 @@
                 if weight is not None:
                     kwargs["sample_weight"] = weight
                 self._model = model
+        else:
+            self.params["n_estimators"] = n_iter
         # except CatBoostError:
         #     self._model = None
-        self.params["n_estimators"] = n_iter
         train_time = time.time() - start_time
         return train_time
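When budget remains after the probe fits, `CatBoostEstimator` refits on roughly 90% of the rows (or all but the last 1000) and lets the held-out tail drive early stopping. A condensed sketch of that pattern on toy data, assuming `catboost` is installed; this is not the estimator's exact code:

import numpy as np
from catboost import CatBoostRegressor

X = np.random.rand(2000, 5)
y = np.random.rand(2000)
n = max(int(len(y) * 0.9), len(y) - 1000)  # same holdout rule as the patch

model = CatBoostRegressor(n_estimators=200, verbose=False)
model.fit(
    X[:n], y[:n],
    eval_set=(X[n:], y[n:]),  # tail rows act as the early-stopping set
    early_stopping_rounds=10,
)
# model.tree_count_ is the iteration count actually kept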
diff --git a/flaml/training_log.py b/flaml/training_log.py
index b4061a9d8..923d60c80 100644
--- a/flaml/training_log.py
+++ b/flaml/training_log.py
@@ -1,7 +1,7 @@
-'''!
- * Copyright (c) 2020-2021 Microsoft Corporation. All rights reserved.
+"""!
+ * Copyright (c) Microsoft Corporation. All rights reserved.
  * Licensed under the MIT License.
-'''
+"""

 import json
 from typing import IO
@@ -10,19 +10,19 @@ import warnings


 class TrainingLogRecord(object):
-
-    def __init__(self,
-                 record_id: int,
-                 iter_per_learner: int,
-                 logged_metric: float,
-                 trial_time: float,
-                 wall_clock_time: float,
-                 validation_loss,
-                 config,
-                 best_validation_loss,
-                 best_config,
-                 learner,
-                 sample_size):
+    def __init__(
+        self,
+        record_id: int,
+        iter_per_learner: int,
+        logged_metric: float,
+        trial_time: float,
+        wall_clock_time: float,
+        validation_loss: float,
+        config: dict,
+        learner: str,
+        sample_size: int,
+        n_iter: int,
+    ):
         self.record_id = record_id
         self.iter_per_learner = iter_per_learner
         self.logged_metric = logged_metric
@@ -30,10 +30,9 @@ class TrainingLogRecord(object):
         self.wall_clock_time = wall_clock_time
         self.validation_loss = validation_loss
         self.config = config
-        self.best_validation_loss = best_validation_loss
-        self.best_config = best_config
         self.learner = learner
         self.sample_size = sample_size
+        self.n_iter = n_iter  # n_estimators for catboost

     def dump(self, fp: IO[str]):
         d = vars(self)
@@ -49,75 +48,78 @@ class TrainingLogRecord(object):


 class TrainingLogCheckPoint(TrainingLogRecord):
-
     def __init__(self, curr_best_record_id: int):
         self.curr_best_record_id = curr_best_record_id


 class TrainingLogWriter(object):
-
     def __init__(self, output_filename: str):
         self.output_filename = output_filename
         self.file = None
         self.current_best_loss_record_id = None
-        self.current_best_loss = float('+inf')
+        self.current_best_loss = float("+inf")
         self.current_sample_size = None
         self.current_record_id = 0

     def open(self):
-        self.file = open(self.output_filename, 'w')
+        self.file = open(self.output_filename, "w")

     def append_open(self):
-        self.file = open(self.output_filename, 'a')
+        self.file = open(self.output_filename, "a")

-    def append(self,
-               it_counter: int,
-               train_loss: float,
-               trial_time: float,
-               wall_clock_time: float,
-               validation_loss,
-               config,
-               best_validation_loss,
-               best_config,
-               learner,
-               sample_size):
+    def append(
+        self,
+        it_counter: int,
+        train_loss: float,
+        trial_time: float,
+        wall_clock_time: float,
+        validation_loss,
+        config,
+        learner,
+        sample_size,
+        n_iter,
+    ):
         if self.file is None:
             raise IOError("Call open() to open the outpute file first.")
         if validation_loss is None:
-            raise ValueError('TEST LOSS NONE ERROR!!!')
-        record = TrainingLogRecord(self.current_record_id,
-                                   it_counter,
-                                   train_loss,
-                                   trial_time,
-                                   wall_clock_time,
-                                   validation_loss,
-                                   config,
-                                   best_validation_loss,
-                                   best_config,
-                                   learner,
-                                   sample_size)
-        if validation_loss < self.current_best_loss or \
-                validation_loss == self.current_best_loss and \
-                self.current_sample_size is not None and \
-                sample_size > self.current_sample_size:
+            raise ValueError("TEST LOSS NONE ERROR!!!")
+        record = TrainingLogRecord(
+            self.current_record_id,
+            it_counter,
+            train_loss,
+            trial_time,
+            wall_clock_time,
+            validation_loss,
+            config,
+            learner,
+            sample_size,
+            n_iter,
+        )
+        if (
+            validation_loss < self.current_best_loss
+            or validation_loss == self.current_best_loss
+            and self.current_sample_size is not None
+            and sample_size > self.current_sample_size
+        ):
             self.current_best_loss = validation_loss
             self.current_sample_size = sample_size
             self.current_best_loss_record_id = self.current_record_id
         self.current_record_id += 1
         record.dump(self.file)
-        self.file.write('\n')
+        self.file.write("\n")
         self.file.flush()

     def checkpoint(self):
         if self.file is None:
             raise IOError("Call open() to open the outpute file first.")
         if self.current_best_loss_record_id is None:
-            warnings.warn("checkpoint() called before any record is written, "
-                          "skipped.")
+            warnings.warn(
+                "checkpoint() called before any record is written, " "skipped."
+            )
             return
         record = TrainingLogCheckPoint(self.current_best_loss_record_id)
         record.dump(self.file)
-        self.file.write('\n')
+        self.file.write("\n")
         self.file.flush()

     def close(self):
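`append()` above promotes a record to current best when its loss is strictly lower, or tied but measured on a larger sample. The same rule distilled into a pure function (hypothetical helper, toy values):

def is_new_best(loss, sample_size, best_loss, best_sample_size):
    # strictly better loss, or the same loss measured on more data
    return loss < best_loss or (
        loss == best_loss
        and best_sample_size is not None
        and sample_size > best_sample_size
    )

assert is_new_best(0.10, 1000, 0.12, 1000)      # lower loss wins
assert is_new_best(0.12, 2000, 0.12, 1000)      # tie broken by sample size
assert not is_new_best(0.12, 500, 0.12, 1000)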
@@ -127,7 +129,6 @@ class TrainingLogWriter(object):


 class TrainingLogReader(object):
-
     def __init__(self, filename: str):
         self.filename = filename
         self.file = None
diff --git a/flaml/version.py b/flaml/version.py
index 7bbb2ef5c..4c513f3b5 100644
--- a/flaml/version.py
+++ b/flaml/version.py
@@ -1 +1 @@
-__version__ = "0.6.5"
+__version__ = "0.6.6"
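The tests that follow exercise the new behavior; a hedged usage sketch of what 0.6.6 enables for a plain classification task, assuming the `fit()` keywords shown elsewhere in this patch:

import numpy as np
from flaml import AutoML
from sklearn.datasets import load_wine

X, y = load_wine(return_X_y=True)
groups = np.random.randint(0, 10, size=len(y))  # synthetic group labels

automl = AutoML()
automl.fit(
    X, y,
    task="classification",
    time_budget=2,
    eval_method="holdout",
    groups=groups,  # with groups given, split_type now defaults to "group"
)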
"metric": "ndcg@5", # 5 can be replaced by any number + "metric": "ndcg@5", # 5 can be replaced by any number "log_file_name": "test/{}.log".format(dataset), "model_history": True, - "groups": [200] * 4 + [100] * 2, # alternative way: group counts + "groups": [200] * 4 + [100] * 2, # alternative way: group counts # "estimator_list": ['lgbm', 'xgboost'], # list of ML learners "learner_selector": "roundrobin", }