model_history, ITER_HP, settings in AutoML(), checkpoint bug fix (#283)

if save_best_model_per_estimator is False and retrain_final is True, unfit the model after evaluation in HPO. retrain if using ray. update ITER_HP in config after a trial is finished. change prophet logging level. example and notebook update. allow settings to be passed to AutoML constructor. Are you planning to add multi-output-regression capability to FLAML #192 Is multi-tasking allowed? #277 can pass the automl setting to the constructor instead of requiring a derived class. remove model_history. checkpoint bug fix. * model_history meaning save_best_model_per_estimator * ITER_HP * example update * prophet logging level * comment update in forecast notebook * print format improvement * allow settings to be passed to AutoML constructor * checkpoint bug fix * time limit for autohf regression test * skip slow test on macos * cleanup before del
2026-01-23 10:28:22 -05:00 · 2021-11-18 09:39:45 -08:00
parent e9551de3cc
commit 72caa2172d
22 changed files with 476 additions and 320 deletions
--- a/README.md
+++ b/README.md
@@ -127,7 +127,7 @@ print(automl.model.estimator)
 * A basic time series forecasting example.

 ```python
-# pip install flaml[ts_forecast]
+# pip install "flaml[ts_forecast]"
 import numpy as np
 from flaml import AutoML
 X_train = np.arange('2014-01', '2021-01', dtype='datetime64[M]')
@@ -148,7 +148,6 @@ print(automl.predict(X_train[72:]))
 from sklearn.datasets import fetch_openml
 from flaml import AutoML
 X_train, y_train = fetch_openml(name="credit-g", return_X_y=True, as_frame=False)
-y_train = y_train.cat.codes
 # not a real learning to rank dataaset
 groups = [200] * 4 + [100] * 2    # group counts
 automl = AutoML()
@@ -161,31 +160,28 @@ automl.fit(
 * Fine tuning language model

 ```python
+# pip install "flaml[nlp]"
 from flaml import AutoML
 from datasets import load_dataset

 train_dataset = load_dataset("glue", "mrpc", split="train").to_pandas()
 dev_dataset = load_dataset("glue", "mrpc", split="validation").to_pandas()
 test_dataset = load_dataset("glue", "mrpc", split="test").to_pandas()
-
 custom_sent_keys = ["sentence1", "sentence2"]
 label_key = "label"
-
 X_train, y_train = train_dataset[custom_sent_keys], train_dataset[label_key]
 X_val, y_val = dev_dataset[custom_sent_keys], dev_dataset[label_key]
 X_test = test_dataset[custom_sent_keys]

 automl = AutoML()
 automl_settings = {
-    "max_iter": 3,
    "time_budget": 100,
-    "model_history": True,
-    "task": "seq-classification"
-}
-automl_settings["custom_hpo_args"] = {
-    "output_dir": "data/output/",
+    "task": "seq-classification",
+    "custom_hpo_args": {"output_dir": "data/output/"},
+    "gpu_per_trial": 1,  # set to 0 if no GPU is available
 }
 automl.fit(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings)
+automl.predict(X_test)
 ```

 More examples can be found in [notebooks](https://github.com/microsoft/FLAML/tree/main/notebook/).
--- a/flaml/automl.py
+++ b/flaml/automl.py
@@ -3,6 +3,7 @@
 #  * Licensed under the MIT License. See LICENSE file in the
 #  * project root for license information.
 import time
+import os
 from typing import Callable, Optional
 from functools import partial
 import numpy as np
@@ -102,15 +103,14 @@ class SearchState:
        self.total_time_used = 0
        self.total_iter = 0
        self.base_eci = None
-        self.time_best_found = 0
+        self.time_best_found = self.time_best_found_old = 0
        self.time2eval_best = 0
        self.time2eval_best_old = 0
        self.trained_estimator = None
        self.sample_size = None
        self.trial_time = 0

-    def update(self, result, time_used, save_model_history=False):
-
+    def update(self, result, time_used):
        if result:
            config = result["config"]
            if config and "FLAML_sample_size" in config:
@@ -129,7 +129,6 @@ class SearchState:
            )
            if n_iter:
                config[trained_estimator.ITER_HP] = n_iter
-
        else:
            obj, time2eval, trained_estimator = np.inf, 0.0, None
            metric_for_logging = config = None
@@ -155,7 +154,6 @@ class SearchState:
                self.trained_estimator
                and trained_estimator
                and self.trained_estimator != trained_estimator
-                and not save_model_history
            ):
                self.trained_estimator.cleanup()
            if trained_estimator:
@@ -262,6 +260,8 @@ class AutoMLState:
            self.log_training_metric,
            self.fit_kwargs,
        )
+        if self.retrain_final and not self.model_history:
+            trained_estimator.cleanup()

        if _is_nlp_task(self.task):
            del self.fit_kwargs["X_val"]
@@ -272,7 +272,7 @@ class AutoMLState:
            "wall_clock_time": time.time() - self._start_time_flag,
            "metric_for_logging": metric_for_logging,
            "val_loss": val_loss,
-            "trained_estimator": trained_estimator if self.save_model_history else None,
+            "trained_estimator": trained_estimator,
        }
        if sampled_weight is not None:
            self.fit_kwargs["sample_weight"] = weight
@@ -386,7 +386,7 @@ class AutoML:
                "time_budget": 60,
                "metric": 'accuracy',
                "task": 'classification',
-                "log_file_name": 'test/mylog.log',
+                "log_file_name": 'mylog.log',
            }
            automl.fit(X_train = X_train, y_train = y_train,
                **automl_settings)
@@ -395,17 +395,173 @@ class AutoML:

    from .version import __version__

-    def __init__(self):
+    def __init__(self, **settings):
+        """Constructor.
+
+        Many settings in fit() can be passed to the constructor too.
+        If an argument in fit() is provided, it will override the setting passed to the constructor.
+        If an argument in fit() is not provided but provided in the constructor, the value passed to the constructor will be used.
+
+        Args:
+            metric: A string of the metric name or a function,
+                e.g., 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo',
+                'f1', 'micro_f1', 'macro_f1', 'log_loss', 'mae', 'mse', 'r2',
+                'mape'. Default is 'auto'.
+                If passing a customized metric function, the function needs to
+                have the following signature:
+
+                .. code-block:: python
+
+                    def custom_metric(
+                        X_test, y_test, estimator, labels,
+                        X_train, y_train, weight_test=None, weight_train=None,
+                        config=None, groups_test=None, groups_train=None,
+                    ):
+                        return metric_to_minimize, metrics_to_log
+
+                which returns a float number as the minimization objective,
+                and a dictionary as the metrics to log.
+            task: A string of the task type, e.g.,
+                'classification', 'regression', 'ts_forecast', 'rank',
+                'seq-classification', 'seq-regression'.
+            n_jobs: An integer of the number of threads for training.
+            gpu_per_trial: A float of the number of gpus per trial, only used by TransformersEstimator.
+            log_file_name: A string of the log file name. To disable logging,
+                set it to be an empty string "".
+            estimator_list: A list of strings for estimator names, or 'auto'
+                e.g.,
+
+                .. code-block:: python
+
+                    ['lgbm', 'xgboost', 'catboost', 'rf', 'extra_tree']
+
+            time_budget: A float number of the time budget in seconds.
+                Use -1 if no time limit.
+            max_iter: An integer of the maximal number of iterations.
+            sample: A boolean of whether to sample the training data during
+                search.
+            ensemble: boolean or dict | default=False. Whether to perform
+                ensemble after search. Can be a dict with keys 'passthrough'
+                and 'final_estimator' to specify the passthrough and
+                final_estimator in the stacker.
+            eval_method: A string of resampling strategy, one of
+                ['auto', 'cv', 'holdout'].
+            split_ratio: A float of the validation data percentage for holdout.
+            n_splits: An integer of the number of folds for cross-validation.
+            log_type: A string of the log type, one of
+                ['better', 'all'].
+                'better' only logs configs with better loss than previous iters
+                'all' logs all the tried configs.
+            model_history: A boolean of whether to keep the best
+                model per estimator. Make sure memory is large enough if setting to True.
+            log_training_metric: A boolean of whether to log the training
+                metric for each model.
+            mem_thres: A float of the memory size constraint in bytes.
+            pred_time_limit: A float of the prediction latency constraint in seconds.
+            train_time_limit: A float of the training time constraint in seconds.
+            verbose: int, default=3 | Controls the verbosity, higher means more
+                messages.
+            retrain_full: bool or str, default=True | whether to retrain the
+                selected model on the full training data when using holdout.
+                True - retrain only after search finishes; False - no retraining;
+                'budget' - do best effort to retrain without violating the time
+                budget.
+            split_type: str, default="auto" | the data split type.
+                For classification tasks, valid choices are [
+                    "auto", 'stratified', 'uniform', 'time']. "auto" -> stratified.
+                For regression tasks, valid choices are ["auto", 'uniform', 'time'].
+                    "auto" -> uniform.
+                For ts_forecast tasks, must be "auto" or 'time'.
+                For ranking task, must be "auto" or 'group'.
+            hpo_method: str, default="auto" | The hyperparameter
+                optimization method. By default, CFO is used for sequential
+                search and BlendSearch is used for parallel search.
+                No need to set when using flaml's default search space or using
+                a simple customized search space. When set to 'bs', BlendSearch
+                is used. BlendSearch can be tried when the search space is
+                complex, for example, containing multiple disjoint, discontinuous
+                subspaces. When set to 'random', random search is used.
+            starting_points: A dictionary to specify the starting hyperparameter
+                config for the estimators.
+                Keys are the name of the estimators, and values are the starting
+                hyperparameter configurations for the corresponding estimators.
+                The value can be a single hyperparameter configuration dict or a list
+                of hyperparameter configuration dicts.
+                In the following code example, we get starting_points from the
+                automl_experiment and use them in the new_automl_experiment.
+                e.g.,
+
+                .. code-block:: python
+
+                    from flaml import AutoML
+                    automl_experiment = AutoML()
+                    X_train, y_train = load_iris(return_X_y=True)
+                    automl_experiment.fit(X_train, y_train)
+                    starting_points = automl_experiment.best_config_per_estimator
+
+                    new_automl_experiment = AutoML()
+                    new_automl_experiment.fit(X_train, y_train,
+                        starting_points=starting_points)
+
+            seed: int or None, default=None | The random seed for np.random.
+            n_concurrent_trials: [Experimental] int, default=1 | The number of
+                concurrent trials. For n_concurrent_trials > 1, installation of
+                ray is required: `pip install flaml[ray]`.
+            keep_search_state: boolean, default=False | Whether to keep search
+                state after fit(). By default the state is deleted for space
+                saving.
+            early_stop: boolean, default=False | Whether to stop early if the
+                search is considered to converge.
+            append_log: boolean, default=False | Whether to directly append the log
+                records to the input log file if it exists.
+            auto_augment: boolean, default=True | Whether to automatically
+                augment rare classes.
+            min_sample_size: int, default=MIN_SAMPLE_TRAIN | the minimal sample
+                size when sample=True.
+            use_ray: boolean, default=False | Whether to use ray to run the training
+                in separate processes. This can be used to prevent OOM for large
+                datasets, but will incur more overhead in time. Only use it if
+                you run into OOM failures.
+
+        """
        self._track_iter = 0
        self._state = AutoMLState()
        self._state.learner_classes = {}
-
-    @property
-    def model_history(self):
-        """A dictionary of iter->model, storing the models when
-        the best model is updated each time.
-        """
-        return self._model_history
+        self._settings = settings
+        settings["time_budget"] = settings.get("time_budget", 60)
+        settings["task"] = settings.get("task", "classification")
+        settings["n_jobs"] = settings.get("n_jobs", -1)
+        settings["gpu_per_trial"] = settings.get("gpu_per_trial", 0)
+        settings["eval_method"] = settings.get("eval_method", "auto")
+        settings["split_ratio"] = settings.get("split_ratio", SPLIT_RATIO)
+        settings["n_splits"] = settings.get("n_splits", N_SPLITS)
+        settings["auto_augment"] = settings.get("auto_augment", True)
+        settings["metric"] = settings.get("metric", "auto")
+        settings["estimator_list"] = settings.get("estimator_list", "auto")
+        settings["log_file_name"] = settings.get("log_file_name", "")
+        settings["max_iter"] = settings.get("max_iter", 1000000)
+        settings["sample"] = settings.get("sample", True)
+        settings["ensemble"] = settings.get("ensemble", False)
+        settings["log_type"] = settings.get("log_type", "better")
+        settings["model_history"] = settings.get(
+            "model_history", False
+        )
+        settings["log_training_metric"] = settings.get("log_training_metric", False)
+        settings["mem_thres"] = settings.get("mem_thres", MEM_THRES)
+        settings["pred_time_limit"] = settings.get("pred_time_limit", np.inf)
+        settings["train_time_limit"] = settings.get("train_time_limit", np.inf)
+        settings["verbose"] = settings.get("verbose", 3)
+        settings["retrain_full"] = settings.get("retrain_full", True)
+        settings["split_type"] = settings.get("split_type", "auto")
+        settings["hpo_method"] = settings.get("hpo_method", "auto")
+        settings["learner_selector"] = settings.get("learner_selector", "sample")
+        settings["starting_points"] = settings.get("starting_points", {})
+        settings["n_concurrent_trials"] = settings.get("n_concurrent_trials", 1)
+        settings["keep_search_state"] = settings.get("keep_search_state", False)
+        settings["early_stop"] = settings.get("early_stop", False)
+        settings["append_log"] = settings.get("append_log", False)
+        settings["min_sample_size"] = settings.get("min_sample_size", MIN_SAMPLE_TRAIN)
+        settings["use_ray"] = settings.get("use_ray", False)

    @property
    def config_history(self):
@@ -1022,11 +1178,11 @@ class AutoML:
        y_train=None,
        dataframe=None,
        label=None,
-        time_budget=0,
-        task="classification",
-        eval_method="auto",
-        split_ratio=SPLIT_RATIO,
-        n_splits=N_SPLITS,
+        time_budget=np.inf,
+        task=None,
+        eval_method=None,
+        split_ratio=None,
+        n_splits=None,
        split_type=None,
        groups=None,
        n_jobs=-1,
@@ -1034,7 +1190,7 @@ class AutoML:
        train_best=True,
        train_full=False,
        record_id=-1,
-        auto_augment=True,
+        auto_augment=None,
        **fit_kwargs,
    ):
        """Retrain from log file.
@@ -1059,18 +1215,19 @@ class AutoML:
                If not, dataframe and label must be provided.
            time_budget: A float number of the time budget in seconds.
            task: A string of the task type, e.g.,
-                'classification', 'regression', 'ts_forecast', 'rank'.
+                'classification', 'regression', 'ts_forecast', 'rank',
+                'seq-classification', 'seq-regression'.
            eval_method: A string of resampling strategy, one of
                ['auto', 'cv', 'holdout'].
            split_ratio: A float of the validation data percentage for holdout.
            n_splits: An integer of the number of folds for cross-validation.
-            split_type: str or None, default=None | the data split type.
+            split_type: str, default="auto" | the data split type.
                For classification tasks, valid choices are [
-                    None, 'stratified', 'uniform', 'time', 'group']. None -> stratified.
-                For regression tasks, valid choices are [None, 'uniform', 'time'].
-                    None -> uniform.
-                For ts_forecast tasks, must be None or 'time'.
-                For ranking task, must be None or 'group'.
+                    "auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified.
+                For regression tasks, valid choices are ["auto", 'uniform', 'time'].
+                    "auto" -> uniform.
+                For ts_forecast tasks, must be "auto" or 'time'.
+                For ranking task, must be "auto" or 'group'.
            groups: None or array-like | Group labels (with matching length to
                y_train) or groups counts (with sum equal to length of y_train)
                for training data.
@@ -1090,11 +1247,15 @@ class AutoML:
            **fit_kwargs: Other key word arguments to pass to fit() function of
                the searched learners, such as sample_weight.
        """
-        if task == FORECAST:
-            self._state.task = TS_FORECAST
-        else:
-            self._state.task = task
-
+        task = task or self._settings.get("task")
+        eval_method = eval_method or self._settings.get("eval_method")
+        split_ratio = split_ratio or self._settings.get("split_ratio")
+        n_splits = n_splits or self._settings.get("n_splits")
+        split_type = split_type or self._settings.get("split_type")
+        auto_augment = (
+            self._settings.get("auto_augment") if auto_augment is None else auto_augment
+        )
+        self._state.task = TS_FORECAST if task == FORECAST else task
        self._state.fit_kwargs = fit_kwargs
        self._validate_data(X_train, y_train, dataframe, label, groups=groups)

@@ -1182,15 +1343,17 @@ class AutoML:
                len(np.unique(self._y_train_all))
            )
        if self._state.task in CLASSIFICATION:
-            assert split_type in [None, "stratified", "uniform", "time", "group"]
+            assert split_type in ["auto", "stratified", "uniform", "time", "group"]
            self._split_type = (
-                split_type or self._state.groups is None and "stratified" or "group"
+                split_type
+                if split_type != "auto"
+                else self._state.groups is None and "stratified" or "group"
            )
        elif self._state.task in REGRESSION:
-            assert split_type in [None, "uniform", "time", "group"]
-            self._split_type = split_type or "uniform"
+            assert split_type in ["auto", "uniform", "time", "group"]
+            self._split_type = split_type if split_type != "auto" else "uniform"
        elif self._state.task == TS_FORECAST:
-            assert split_type in [None, "time"]
+            assert split_type in ["auto", "time"]
            self._split_type = "time"
            assert isinstance(
                self._state.fit_kwargs.get("period"), int
@@ -1199,7 +1362,7 @@ class AutoML:
            assert (
                self._state.groups is not None
            ), "groups must be specified for ranking task."
-            assert split_type in [None, "group"]
+            assert split_type in ["auto", "group"]
            self._split_type = "group"

    def _decide_eval_method(self, time_budget):
@@ -1410,44 +1573,44 @@ class AutoML:
        y_train=None,
        dataframe=None,
        label=None,
-        metric="auto",
-        task="classification",
-        n_jobs=-1,
+        metric=None,
+        task=None,
+        n_jobs=None,
        gpu_per_trial=0,
-        log_file_name="flaml.log",
-        estimator_list="auto",
-        time_budget=60,
-        max_iter=1000000,
-        sample=True,
-        ensemble=False,
-        eval_method="auto",
-        log_type="better",
-        model_history=False,
-        split_ratio=SPLIT_RATIO,
-        n_splits=N_SPLITS,
-        log_training_metric=False,
-        mem_thres=MEM_THRES,
-        pred_time_limit=np.inf,
-        train_time_limit=np.inf,
+        log_file_name=None,
+        estimator_list=None,
+        time_budget=None,
+        max_iter=None,
+        sample=None,
+        ensemble=None,
+        eval_method=None,
+        log_type=None,
+        model_history=None,
+        split_ratio=None,
+        n_splits=None,
+        log_training_metric=None,
+        mem_thres=None,
+        pred_time_limit=None,
+        train_time_limit=None,
        X_val=None,
        y_val=None,
        sample_weight_val=None,
        groups_val=None,
        groups=None,
-        verbose=3,
-        retrain_full=True,
+        verbose=None,
+        retrain_full=None,
        split_type=None,
-        learner_selector="sample",
+        learner_selector=None,
        hpo_method=None,
-        starting_points={},
+        starting_points=None,
        seed=None,
-        n_concurrent_trials=1,
-        keep_search_state=False,
-        early_stop=False,
-        append_log=False,
-        auto_augment=True,
-        min_sample_size=MIN_SAMPLE_TRAIN,
-        use_ray=False,
+        n_concurrent_trials=None,
+        keep_search_state=None,
+        early_stop=None,
+        append_log=None,
+        auto_augment=None,
+        min_sample_size=None,
+        use_ray=None,
        **fit_kwargs,
    ):
        """Find a model for a given task.
@@ -1470,7 +1633,7 @@ class AutoML:
            metric: A string of the metric name or a function,
                e.g., 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo',
                'f1', 'micro_f1', 'macro_f1', 'log_loss', 'mae', 'mse', 'r2',
-                'mape'.
+                'mape'. Default is 'auto'.
                If passing a customized metric function, the function needs to
                have the follwing signature:

@@ -1486,10 +1649,12 @@ class AutoML:
                which returns a float number as the minimization objective,
                and a dictionary as the metrics to log.
            task: A string of the task type, e.g.,
-                'classification', 'regression', 'ts_forecast', 'rank'.
+                'classification', 'regression', 'ts_forecast', 'rank',
+                'seq-classification', 'seq-regression'.
            n_jobs: An integer of the number of threads for training.
            gpu_per_trial: A float of the number of gpus per trial, only used by TransformersEstimator.
-            log_file_name: A string of the log file name.
+            log_file_name: A string of the log file name. To disable logging,
+                set it to be an empty string "".
            estimator_list: A list of strings for estimator names, or 'auto'
                e.g.,

@@ -1498,6 +1663,7 @@ class AutoML:
                    ['lgbm', 'xgboost', 'catboost', 'rf', 'extra_tree']

            time_budget: A float number of the time budget in seconds.
+                Use -1 if no time limit.
            max_iter: An integer of the maximal number of iterations.
            sample: A boolean of whether to sample the training data during
                search.
@@ -1513,9 +1679,8 @@ class AutoML:
                ['better', 'all'].
                'better' only logs configs with better loss than previos iters
                'all' logs all the tried configs.
-            model_history: A boolean of whether to keep the history of best
-                models in the history property. Make sure memory is large
-                enough if setting to True.
+            model_history: A boolean of whether to keep the best
+                model per estimator. Make sure memory is large enough if setting to True.
            log_training_metric: A boolean of whether to log the training
                metric for each model.
            mem_thres: A float of the memory size constraint in bytes.
@@ -1538,14 +1703,14 @@ class AutoML:
                True - retrain only after search finishes; False - no retraining;
                'budget' - do best effort to retrain without violating the time
                budget.
-            split_type: str or None, default=None | the data split type.
+            split_type: str, default="auto" | the data split type.
                For classification tasks, valid choices are [
-                    None, 'stratified', 'uniform', 'time']. None -> stratified.
-                For regression tasks, valid choices are [None, 'uniform', 'time'].
-                    None -> uniform.
-                For ts_forecast tasks, must be None or 'time'.
-                For ranking task, must be None or 'group'.
-            hpo_method: str or None, default=None | The hyperparameter
+                    "auto", 'stratified', 'uniform', 'time']. "auto" -> stratified.
+                For regression tasks, valid choices are ["auto", 'uniform', 'time'].
+                    "auto" -> uniform.
+                For ts_forecast tasks, must be "auto" or 'time'.
+                For ranking task, must be "auto" or 'group'.
+            hpo_method: str, default="auto" | The hyperparameter
                optimization method. By default, CFO is used for sequential
                search and BlendSearch is used for parallel search.
                No need to set when using flaml's default search space or using
@@ -1600,10 +1765,74 @@ class AutoML:
        """

        self._state._start_time_flag = self._start_time_flag = time.time()
-        if task == FORECAST:
-            self._state.task = TS_FORECAST
-        else:
-            self._state.task = task
+        task = task or self._settings.get("task")
+        time_budget = time_budget or self._settings.get("time_budget")
+        n_jobs = n_jobs or self._settings.get("n_jobs")
+        gpu_per_trial = (
+            self._settings.get("gpu_per_trial")
+            if gpu_per_trial is None
+            else gpu_per_trial
+        )
+        eval_method = eval_method or self._settings.get("eval_method")
+        split_ratio = split_ratio or self._settings.get("split_ratio")
+        n_splits = n_splits or self._settings.get("n_splits")
+        auto_augment = (
+            self._settings.get("auto_augment") if auto_augment is None else auto_augment
+        )
+        metric = metric or self._settings.get("metric")
+        estimator_list = estimator_list or self._settings.get("estimator_list")
+        log_file_name = (
+            self._settings.get("log_file_name")
+            if log_file_name is None
+            else log_file_name
+        )
+        max_iter = self._settings.get("max_iter") if max_iter is None else max_iter
+        sample = self._settings.get("sample") if sample is None else sample
+        ensemble = self._settings.get("ensemble") if ensemble is None else ensemble
+        log_type = log_type or self._settings.get("log_type")
+        model_history = (
+            self._settings.get("model_history")
+            if model_history is None
+            else model_history
+        )
+        log_training_metric = (
+            self._settings.get("log_training_metric")
+            if log_training_metric is None
+            else log_training_metric
+        )
+        mem_thres = mem_thres or self._settings.get("mem_thres")
+        pred_time_limit = pred_time_limit or self._settings.get("pred_time_limit")
+        train_time_limit = train_time_limit or self._settings.get("train_time_limit")
+        verbose = self._settings.get("verbose") if verbose is None else verbose
+        retrain_full = (
+            self._settings.get("retrain_full") if retrain_full is None else retrain_full
+        )
+        split_type = split_type or self._settings.get("split_type")
+        hpo_method = hpo_method or self._settings.get("hpo_method")
+        learner_selector = learner_selector or self._settings.get("learner_selector")
+        starting_points = (
+            self._settings.get("starting_points")
+            if starting_points is None
+            else starting_points
+        )
+        n_concurrent_trials = n_concurrent_trials or self._settings.get(
+            "n_concurrent_trials"
+        )
+        keep_search_state = (
+            self._settings.get("keep_search_state")
+            if keep_search_state is None
+            else keep_search_state
+        )
+        early_stop = (
+            self._settings.get("early_stop") if early_stop is None else early_stop
+        )
+        append_log = (
+            self._settings.get("append_log") if append_log is None else append_log
+        )
+        min_sample_size = min_sample_size or self._settings.get("min_sample_size")
+        use_ray = self._settings.get("use_ray") if use_ray is None else use_ray
+
+        self._state.task = TS_FORECAST if task == FORECAST else task
        self._state.log_training_metric = log_training_metric

        self._state.fit_kwargs = fit_kwargs
@@ -1634,13 +1863,24 @@ class AutoML:
        self._state.eval_method = eval_method
        logger.info("Evaluation method: {}".format(eval_method))

+        self._state.n_jobs = n_jobs
+        self._n_concurrent_trials = n_concurrent_trials
+        self._early_stop = early_stop
+        self._use_ray = use_ray or n_concurrent_trials > 1
+        # use the following condition if we have an estimation of average_trial_time and average_trial_overhead
+        # self._use_ray = use_ray or n_concurrent_trials > ( average_trail_time + average_trial_overhead) / (average_trial_time)
+        self._state.resources_per_trial = (
+            {"cpu": int(os.cpu_count() / n_concurrent_trials), "gpu": gpu_per_trial}
+            if n_jobs < 0
+            else {"cpu": n_jobs, "gpu": gpu_per_trial}
+        )
        self._retrain_in_budget = retrain_full == "budget" and (
            eval_method == "holdout" and self._state.X_val is None
        )
-        self._retrain_final = (
+        self._state.retrain_final = (
            retrain_full is True
            and eval_method == "holdout"
-            and self._state.X_val is None
+            and (self._state.X_val is None or self._use_ray)
            or eval_method == "cv"
            and (max_iter > 0 or retrain_full is True)
            or max_iter == 1
@@ -1728,7 +1968,7 @@ class AutoML:
            )
        logger.info("List of ML learners in AutoML Run: {}".format(estimator_list))
        self.estimator_list = estimator_list
-        self._state.time_budget = time_budget or 1e10
+        self._state.time_budget = time_budget if time_budget > 0 else 1e10
        self._active_estimators = estimator_list.copy()
        self._ensemble = ensemble
        self._max_iter = max_iter
@@ -1737,24 +1977,15 @@ class AutoML:
        self._state.train_time_limit = train_time_limit
        self._log_type = log_type
        self.split_ratio = split_ratio
-        self._state.save_model_history = model_history
-        self._state.n_jobs = n_jobs
-        import os
-
-        self._state.resources_per_trial = (
-            {"cpu": int(os.cpu_count() / n_concurrent_trials), "gpu": gpu_per_trial}
-            if self._state.n_jobs < 0
-            else {"cpu": self._state.n_jobs, "gpu": gpu_per_trial}
-        )
-        self._n_concurrent_trials = n_concurrent_trials
-        self._early_stop = early_stop
-        self._use_ray = use_ray or n_concurrent_trials > 1
-        # use the following condition if we have an estimation of average_trial_time and average_trial_overhead
-        # self._use_ray = use_ray or n_concurrent_trials > ( average_trail_time + average_trial_overhead) / (average_trial_time)
-        self._hpo_method = hpo_method or (
-            "bs"
-            if n_concurrent_trials > 1 or self._use_ray and len(estimator_list) > 1
-            else "cfo"
+        self._state.model_history = model_history
+        self._hpo_method = (
+            hpo_method
+            if hpo_method != "auto"
+            else (
+                "bs"
+                if n_concurrent_trials > 1 or self._use_ray and len(estimator_list) > 1
+                else "cfo"
+            )
        )
        if log_file_name:
            with training_log_writer(log_file_name, append_log) as save_helper:
@@ -1770,7 +2001,7 @@ class AutoML:
            )
            if (
                self._hpo_method in ("cfo", "bs")
-                and (self._time_taken_best_iter >= time_budget * 0.7)
+                and (self._time_taken_best_iter >= self._state.time_budget * 0.7)
                and not all(
                    state.search_alg and state.search_alg.searcher.is_ls_ever_converged
                    for state in self._search_states.values()
@@ -1780,7 +2011,7 @@ class AutoML:
                    "Time taken to find the best model is {0:.0f}% of the "
                    "provided time budget and not all estimators' hyperparameter "
                    "search converged. Consider increasing the time budget.".format(
-                        self._time_taken_best_iter / time_budget * 100
+                        self._time_taken_best_iter / self._state.time_budget * 100
                    )
                )

@@ -1791,9 +2022,6 @@ class AutoML:
            del self._state.y_train, self._state.y_train_all, self._state.y_val
            del self._sample_weight_full, self._state.fit_kwargs
            del self._state.groups, self._state.groups_all, self._state.groups_val
-            for state in self._search_states.values():
-                if state.trained_estimator:
-                    del state.trained_estimator
        # if verbose == 0:
        logger.setLevel(old_level)

@@ -1819,7 +2047,7 @@ class AutoML:
        else:
            raise NotImplementedError(
                f"hpo_method={self._hpo_method} is not recognized. "
-                "'cfo' and 'bs' are supported."
+                "'auto', 'cfo' and 'bs' are supported."
            )
        space = self.search_space
        if self._hpo_method == "random":
@@ -1890,7 +2118,7 @@ class AutoML:
                config = result["config"]
                estimator = config.get("ml", config)["learner"]
                search_state = self._search_states[estimator]
-                search_state.update(result, 0, self._state.save_model_history)
+                search_state.update(result, 0)
                if result["wall_clock_time"] is not None:
                    self._state.time_from_start = result["wall_clock_time"]
                if search_state.sample_size == self._state.data_size:
@@ -1905,10 +2133,6 @@ class AutoML:
                        config,
                        self._time_taken_best_iter,
                    )
-                    if self._state.save_model_history:
-                        self._model_history[
-                            _track_iter
-                        ] = search_state.trained_estimator
                    self._trained_estimator = search_state.trained_estimator
                    self._best_iteration = _track_iter
                    self._time_taken_best_iter = self._state.time_from_start
@@ -1961,7 +2185,7 @@ class AutoML:
        better = True  # whether we find a better model in one trial
        if self._ensemble:
            self.best_model = {}
-        if self._max_iter < 2 and self.estimator_list and self._retrain_final:
+        if self._max_iter < 2 and self.estimator_list and self._state.retrain_final:
            # when max_iter is 1, no need to search
            # TODO: otherwise, need to make sure SearchStates.init_config is inside search space
            self._max_iter = 0
@@ -2077,11 +2301,7 @@ class AutoML:
            better = False
            if analysis.trials:
                result = analysis.trials[-1].last_result
-                search_state.update(
-                    result,
-                    time_used=time_used,
-                    save_model_history=self._state.save_model_history,
-                )
+                search_state.update(result, time_used=time_used)
                if self._estimator_index is None:
                    # update init eci estimate
                    eci_base = search_state.init_eci
@@ -2123,27 +2343,22 @@ class AutoML:
                        search_state.best_config,
                        self._state.time_from_start,
                    )
-                    if self._state.save_model_history:
-                        self._model_history[
-                            self._track_iter
-                        ] = search_state.trained_estimator
-                    elif self._trained_estimator:
+                    if self._trained_estimator:
+                        self._trained_estimator.cleanup()
                        del self._trained_estimator
                        self._trained_estimator = None
-                    if not self._retrain_final:
+                    if not self._state.retrain_final:
                        self._trained_estimator = search_state.trained_estimator
                    self._best_iteration = self._track_iter
                    self._time_taken_best_iter = self._state.time_from_start
                    better = True
                    next_trial_time = search_state.time2eval_best
-                if search_state.trained_estimator and not (
-                    self._state.save_model_history or self._ensemble
+                if (
+                    search_state.trained_estimator
+                    and not self._state.model_history
+                    and search_state.trained_estimator != self._trained_estimator
                ):
-                    # free RAM
-                    if search_state.trained_estimator != self._trained_estimator:
-                        search_state.trained_estimator.cleanup()
-                    del search_state.trained_estimator
-                    search_state.trained_estimator = None
+                    search_state.trained_estimator.cleanup()
                if better or self._log_type == "all":
                    if self._training_log:
                        self._training_log.append(
@@ -2260,7 +2475,6 @@ class AutoML:
        self._estimator_index = None
        self._best_iteration = 0
        self._time_taken_best_iter = 0
-        self._model_history = {}
        self._config_history = {}
        self._max_iter_per_learner = 1000000  # TODO
        self._iter_per_learner = dict([(e, 0) for e in self.estimator_list])
@@ -2371,7 +2585,7 @@ class AutoML:
                        self._trained_estimator.model = stacker
                    else:
                        raise e
-            elif self._retrain_final:
+            elif self._state.retrain_final:
                # reset time budget for retraining
                if self._max_iter > 1:
                    self._state.time_from_start -= self._state.time_budget
--- a/flaml/model.py
+++ b/flaml/model.py
@@ -227,7 +227,8 @@ class BaseEstimator:
        return self._model.predict_proba(X_test)

    def cleanup(self):
-        pass
+        del self._model
+        self._model = None

    @classmethod
    def search_space(cls, **params):
@@ -282,7 +283,7 @@ class BaseEstimator:
 class TransformersEstimator(BaseEstimator):
    """The class for fine-tuning language models, using huggingface transformers API."""

-    ITER_HP = "final_global_step"
+    ITER_HP = "global_max_steps"

    def __init__(self, task="seq-classification", **config):
        super().__init__(task, **config)
@@ -301,7 +302,7 @@ class TransformersEstimator(BaseEstimator):
                "domain": tune.loguniform(lower=1e-6, upper=1e-3),
            },
            "num_train_epochs": {
-                "domain": tune.loguniform(lower=0.5, upper=10.0),
+                "domain": tune.loguniform(lower=0.1, upper=10.0),
            },
            "per_device_train_batch_size": {
                "domain": tune.choice([4, 8, 16, 32]),
@@ -316,7 +317,7 @@ class TransformersEstimator(BaseEstimator):
                "domain": tune.loguniform(lower=1e-8, upper=1e-6),
            },
            "seed": {"domain": tune.choice(list(range(40, 45)))},
-            "final_global_step": {"domain": sys.maxsize},
+            "global_max_steps": {"domain": sys.maxsize},
        }

    def _init_hpo_args(self, automl_fit_kwargs: dict = None):
@@ -356,18 +357,27 @@ class TransformersEstimator(BaseEstimator):
            def on_step_end(self, args, state, control, **callback_kwargs):
                if state.global_step == 1:
                    self.time_per_iter = time.time() - self.step_begin_time
-                if budget:
-                    if (
+                if (
+                    budget
+                    and (
                        time.time() + self.time_per_iter
                        > self.train_begin_time + budget
-                    ):
-                        control.should_training_stop = True
-                        control.should_save = True
-                        control.should_evaluate = True
-                if state.global_step >= this_params[TransformersEstimator.ITER_HP]:
+                    )
+                    or state.global_step >= this_params[TransformersEstimator.ITER_HP]
+                ):
                    control.should_training_stop = True
+                    control.should_save = True
+                    control.should_evaluate = True
                return control

+            def on_epoch_end(self, args, state, control, **callback_kwargs):
+                if (
+                    control.should_training_stop
+                    or state.epoch + 1 >= this_params["num_train_epochs"]
+                ):
+                    control.should_save = True
+                    control.should_evaluate = True
+
        import transformers
        from transformers import TrainingArguments
        from transformers.trainer_utils import set_seed
@@ -467,36 +477,30 @@ class TransformersEstimator(BaseEstimator):

        trainer.train()

-        if eval_dataset is not None:
-            # if validation data is non empty, select the best checkpoint and save the final global step to self.params
-
-            self.params[self.ITER_HP] = trainer.state.global_step
-            if trainer.state.global_step > max(trainer.ckpt_to_global_step.values()):
-                trainer.evaluate()
-
-            self._checkpoint_path = self._select_checkpoint(
-                trainer.ckpt_to_metric, trainer.ckpt_to_global_step
-            )
-
-        else:
-            # if validation dataset is empty, save the last checkpoint
-            self._checkpoint_path = self._save_last_checkpoint(trainer)
+        self.params[self.ITER_HP] = trainer.state.global_step
+        self._checkpoint_path = self._select_checkpoint(trainer)

        self._kwargs = kwargs
        self._num_labels = num_labels
        self._per_model_config = per_model_config

-    def _save_last_checkpoint(self, trainer):
-        this_ckpt = trainer.save_state()
-        self.params[self.ITER_HP] = trainer.state.global_step
-        return this_ckpt
+    def _select_checkpoint(self, trainer):
+        if trainer.ckpt_to_metric:
+            best_ckpt, _ = min(
+                trainer.ckpt_to_metric.items(), key=lambda x: x[1][self._metric_name]
+            )
+            best_ckpt_global_step = trainer.ckpt_to_global_step[best_ckpt]
+        else:
+            best_ckpt_global_step = trainer.state.global_step
+            from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR

-    def _select_checkpoint(self, ckpt_to_score, ckpt_to_global_step):
-        best_ckpt, best_score = min(
-            ckpt_to_score.items(), key=lambda x: x[1][self._metric_name]
-        )
-        best_ckpt_global_step = ckpt_to_global_step[best_ckpt]
+            best_ckpt = os.path.join(
+                trainer.args.output_dir,
+                f"{PREFIX_CHECKPOINT_DIR}-{best_ckpt_global_step}",
+            )
        self.params[self.ITER_HP] = best_ckpt_global_step
+        logging.getLogger(__name__).debug("step: %s", trainer.state.global_step)
+        logging.getLogger(__name__).debug("ckpts: %s", trainer.ckpt_to_global_step)
        return best_ckpt

    def _compute_metrics_by_dataset_name(self, eval_pred):
@@ -1339,6 +1343,7 @@ class Prophet(SKLearnEstimator):
        cols = list(train_df)
        cols.remove(TS_TIMESTAMP_COL)
        cols.remove(TS_VALUE_COL)
+        logging.getLogger("prophet").setLevel(logging.WARNING)
        model = Prophet(**self.params)
        for regressor in cols:
            model.add_regressor(regressor)
@@ -1405,9 +1410,8 @@ class ARIMA(Prophet):
        current_time = time.time()
        train_df = self._join(X_train, y_train)
        train_df = self._preprocess(train_df)
-        cols = list(train_df)
-        cols.remove(TS_VALUE_COL)
-        regressors = cols
+        regressors = list(train_df)
+        regressors.remove(TS_VALUE_COL)
        if regressors:
            model = ARIMA_estimator(
                train_df[[TS_VALUE_COL]],
@@ -1434,14 +1438,12 @@ class ARIMA(Prophet):
            if isinstance(X_test, int):
                forecast = self._model.forecast(steps=X_test)
            elif isinstance(X_test, DataFrame):
-                first_col = X_test.pop(TS_TIMESTAMP_COL)
-                X_test.insert(0, TS_TIMESTAMP_COL, first_col)
-                start = X_test.iloc[0, 0]
-                end = X_test.iloc[-1, 0]
+                start = X_test[TS_TIMESTAMP_COL].iloc[0]
+                end = X_test[TS_TIMESTAMP_COL].iloc[-1]
                if len(X_test.columns) > 1:
+                    X_test = self._preprocess(X_test.drop(columns=TS_TIMESTAMP_COL))
                    regressors = list(X_test)
-                    regressors.remove(TS_TIMESTAMP_COL)
-                    X_test = self._preprocess(X_test)
+                    logging.getLogger(__name__).debug((start, end, X_test.shape))
                    forecast = self._model.predict(
                        start=start, end=end, exog=X_test[regressors]
                    )
--- a/flaml/nlp/README.md
+++ b/flaml/nlp/README.md
@@ -6,7 +6,6 @@ An example:

 ```python
 from flaml import AutoML
-
 import pandas as pd

 train_dataset = pd.read_csv("data/input/train.tsv", delimiter="\t", quoting=3)
--- a/flaml/nlp/huggingface/trainer.py
+++ b/flaml/nlp/huggingface/trainer.py
@@ -7,54 +7,25 @@ except ImportError:


 class TrainerForAuto(TFTrainer):
-    def evaluate(self, eval_dataset=None):
-        """
-        Overriding transformers.Trainer.evaluate by saving state with save_state
-
-        Args:
-            eval_dataset:
-                the dataset to be evaluated
-        """
-
-        if self.eval_dataset is not None:
-            eval_dataloader = self.get_eval_dataloader(self.eval_dataset)
-            output = self.prediction_loop(eval_dataloader, description="Evaluation")
-            self.log(output.metrics)
-
-            ckpt_dir = self.save_state()
-
-            for key in list(output.metrics.keys()):
-                if key.startswith("eval_"):
-                    output.metrics[key[5:]] = output.metrics.pop(key)
-
-            if hasattr(self, "ckpt_to_global_step"):
-                self.ckpt_to_metric[ckpt_dir] = output.metrics
-                self.ckpt_to_global_step[ckpt_dir] = self.state.global_step
-            else:
-                self.ckpt_to_global_step = {ckpt_dir: self.state.global_step}
-                self.ckpt_to_metric = {ckpt_dir: output.metrics}
-
-    def save_state(self):
-        """
-        Overriding transformers.Trainer.save_state. It is only through saving
-        the states can best_trial.get_best_checkpoint return a non-empty value.
-        """
-        import torch
+    def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_prefix="eval"):
+        """Overriding transformers.Trainer.evaluate by saving metrics and checkpoint path"""
        from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
-        from ray import tune

-        with tune.checkpoint_dir(step=self.state.global_step) as checkpoint_dir:
-            self.args.output_dir = checkpoint_dir
-            # This is the directory name that Huggingface requires.
-            output_dir = os.path.join(
-                self.args.output_dir,
-                f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}",
-            )
-            self.save_model(output_dir)
-            torch.save(
-                self.optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")
-            )
-            torch.save(
-                self.lr_scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")
-            )
-            return output_dir
+        ckpt_dir = os.path.join(
+            self.args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}"
+        )
+        eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset
+        metrics = eval_dataset and super().evaluate(
+            eval_dataset, ignore_keys, metric_key_prefix
+        )
+        if metrics:
+            for key in list(metrics.keys()):
+                if key.startswith("eval_"):
+                    metrics[key[5:]] = metrics.pop(key)
+        if not hasattr(self, "ckpt_to_global_step"):
+            self.ckpt_to_global_step, self.ckpt_to_metric = {}, {}
+        self.ckpt_to_global_step[ckpt_dir] = self.state.global_step
+        if metrics:
+            self.ckpt_to_metric[ckpt_dir] = metrics
+        # keep the Trainer.evaluate contract: hand the metrics back to the caller
+        return metrics or None
--- a/flaml/version.py
+++ b/flaml/version.py
@@ -1 +1 @@
-__version__ = "0.7.2"
+__version__ = "0.8.0"
--- a/notebook/flaml_forecast.ipynb
+++ b/notebook/flaml_forecast.ipynb
@@ -119,9 +119,9 @@
   "source": [
    "'''The main flaml automl API'''\n",
    "automl.fit(dataframe=train_df,  # training data\n",
-    "           label='co2',  # For 'forecast' task, label should be a tuple of strings for timestamp and value columns\n",
-    "           **settings, \n",
-    "           period=time_horizon)  # key word argument 'period' must be included for forecast task)"
+    "           label='co2',  # label column\n",
+    "           period=time_horizon,  # keyword argument 'period' must be included for the forecast task\n",
+    "           **settings)"
   ]
  },
  {
@@ -173,8 +173,8 @@
   "source": [
    "''' compute predictions of testing dataset '''\n",
    "flaml_y_pred = automl.predict(X_test)\n",
-    "print('Predicted labels', flaml_y_pred)\n",
-    "print('True labels', y_test)"
+    "print(f\"Predicted labels\\n{flaml_y_pred}\")\n",
+    "print(f\"True labels\\n{y_test}\")"
   ]
  },
  {
--- a/test/automl/test_classification.py
+++ b/test/automl/test_classification.py
@@ -216,7 +216,7 @@ class TestClassification(unittest.TestCase):
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
-        print(automl_experiment.model_history)
+        print(automl_experiment.best_model_for_estimator("xgboost"))
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)

@@ -253,7 +253,7 @@ class TestClassification(unittest.TestCase):
            print(automl_experiment.predict(X_train))
            print(automl_experiment.model)
            print(automl_experiment.config_history)
-            print(automl_experiment.model_history)
+            print(automl_experiment.best_model_for_estimator("xgboost"))
            print(automl_experiment.best_iteration)
            print(automl_experiment.best_estimator)
        except ImportError:
@@ -286,7 +286,7 @@ class TestClassification(unittest.TestCase):
            print(automl_experiment.predict(X_train))
            print(automl_experiment.model)
            print(automl_experiment.config_history)
-            print(automl_experiment.model_history)
+            print(automl_experiment.best_model_for_estimator("large_lgbm"))
            print(automl_experiment.best_iteration)
            print(automl_experiment.best_estimator)
        except ImportError:
@@ -314,7 +314,7 @@ class TestClassification(unittest.TestCase):
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
-        print(automl_experiment.model_history)
+        print(automl_experiment.best_model_for_estimator("lrl2"))
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)

--- a/test/automl/test_forecast.py
+++ b/test/automl/test_forecast.py
--- a/test/automl/test_multiclass.py
+++ b/test/automl/test_multiclass.py
@@ -198,7 +198,7 @@ class TestMultiClass(unittest.TestCase):
        print(automl_experiment.classes_)
        print(automl_experiment.model)
        print(automl_experiment.config_history)
-        print(automl_experiment.model_history)
+        print(automl_experiment.best_model_for_estimator("rf"))
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
        automl_experiment = AutoML()
@@ -238,13 +238,13 @@ class TestMultiClass(unittest.TestCase):
        print(automl_experiment.predict(X_train)[:5])
        print(automl_experiment.model)
        print(automl_experiment.config_history)
-        print(automl_experiment.model_history)
+        print(automl_experiment.best_model_for_estimator("catboost"))
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
        del automl_settings["metric"]
        del automl_settings["model_history"]
        del automl_settings["log_training_metric"]
-        automl_experiment = AutoML()
+        automl_experiment = AutoML(task="classification")
        duration = automl_experiment.retrain_from_log(
            log_file_name=automl_settings["log_file_name"],
            X_train=X_train,
@@ -333,7 +333,7 @@ class TestMultiClass(unittest.TestCase):
        print(automl_experiment.predict_proba(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
-        print(automl_experiment.model_history)
+        print(automl_experiment.best_model_for_estimator("extra_tree"))
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)

@@ -343,7 +343,7 @@ class TestMultiClass(unittest.TestCase):
            learner_name="large_lgbm", learner_class=MyLargeLGBM
        )
        automl_settings = {
-            "time_budget": None,
+            "time_budget": -1,
            "task": "classification",
            "log_file_name": "test/classification_oom.log",
            "estimator_list": ["large_lgbm"],
--- a/test/automl/test_notebook_example.py
+++ b/test/automl/test_notebook_example.py
--- a/test/automl/test_python_log.py
+++ b/test/automl/test_python_log.py
--- a/test/automl/test_regression.py
+++ b/test/automl/test_regression.py
@@ -56,7 +56,7 @@ class TestRegression(unittest.TestCase):
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
-        print(automl_experiment.model_history)
+        print(automl_experiment.best_model_for_estimator("xgboost"))
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
        print(get_output_from_log(automl_settings["log_file_name"], 1))
@@ -77,28 +77,6 @@ class TestRegression(unittest.TestCase):
            time_budget=0,
        )

-    def test_sparse_matrix_classification(self):
-        automl_experiment = AutoML()
-        automl_settings = {
-            "time_budget": 2,
-            "metric": "auto",
-            "task": "classification",
-            "log_file_name": "test/sparse_classification.log",
-            "split_type": "uniform",
-            "n_jobs": 1,
-            "model_history": True,
-        }
-        X_train = scipy.sparse.random(1554, 21, dtype=int)
-        y_train = np.random.randint(3, size=1554)
-        automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
-        print(automl_experiment.classes_)
-        print(automl_experiment.predict_proba(X_train))
-        print(automl_experiment.model)
-        print(automl_experiment.config_history)
-        print(automl_experiment.model_history)
-        print(automl_experiment.best_iteration)
-        print(automl_experiment.best_estimator)
-
    def test_sparse_matrix_regression(self):
        X_train = scipy.sparse.random(300, 900, density=0.0001)
        y_train = np.random.uniform(size=300)
@@ -127,7 +105,7 @@ class TestRegression(unittest.TestCase):
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
-        print(automl_experiment.model_history)
+        print(automl_experiment.best_model_for_estimator("rf"))
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
        print(automl_experiment.best_config)
@@ -151,7 +129,7 @@ class TestRegression(unittest.TestCase):
            print(automl_experiment.predict(X_train))
            print(automl_experiment.model)
            print(automl_experiment.config_history)
-            print(automl_experiment.model_history)
+            print(automl_experiment.best_model_for_estimator("xgboost"))
            print(automl_experiment.best_iteration)
            print(automl_experiment.best_estimator)
        except ImportError:
@@ -176,7 +154,7 @@ class TestRegression(unittest.TestCase):
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
-        print(automl_experiment.model_history)
+        print(automl_experiment.best_model_for_estimator("rf"))
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)

@@ -209,7 +187,7 @@ class TestRegression(unittest.TestCase):
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
-        print(automl_experiment.model_history)
+        print(automl_experiment.best_model_for_estimator("my_xgb2"))
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
        print(automl_experiment.best_config)
--- a/test/automl/test_split.py
+++ b/test/automl/test_split.py
--- a/test/automl/test_training_log.py
+++ b/test/automl/test_training_log.py
@@ -30,7 +30,6 @@ class TestTrainingLog(unittest.TestCase):
                # "ensemble": True,
                "keep_search_state": True,
                "estimator_list": estimator_list,
-                "model_history": True,
            }
            X_train, y_train = fetch_california_housing(return_X_y=True)
            automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
@@ -85,7 +84,7 @@ class TestTrainingLog(unittest.TestCase):
                        count += 1
                    self.assertGreater(count, 0)

-            automl_settings["log_file_name"] = None
+            automl_settings["log_file_name"] = ""
            automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
            automl._selected.update(None, 0)
            automl = AutoML()
--- a/test/automl/test_xgboost2d.py
+++ b/test/automl/test_xgboost2d.py
@@ -2,7 +2,6 @@ import unittest

 from sklearn.datasets import fetch_openml
 from sklearn.model_selection import train_test_split
-import numpy as np
 from flaml.automl import AutoML
 from flaml.model import XGBoostSklearnEstimator
 from flaml import tune
--- a/test/automl/test_xgboost2d_sample_size.py
+++ b/test/automl/test_xgboost2d_sample_size.py
@@ -2,7 +2,6 @@ import unittest

 from sklearn.datasets import fetch_openml
 from sklearn.model_selection import train_test_split
-import numpy as np
 from flaml.automl import AutoML
 from flaml.model import XGBoostSklearnEstimator
 from flaml import tune
@@ -44,7 +43,6 @@ def _test_simple(method=None, size_ratio=1.0):
        # "metric": 'accuracy',
        "task": "classification",
        "log_file_name": f"test/xgboost2d_{dataset}_{method}_{final_size}.log",
-        # "model_history": True,
        # "log_training_metric": True,
        # "split_type": split_type,
        "n_jobs": 1,
--- a/test/nlp/test_autohf.py
+++ b/test/nlp/test_autohf.py
@@ -1,3 +1,8 @@
+import sys
+import pytest
+
+
+@pytest.mark.skipif(sys.platform == "darwin", reason="do not run on mac os")
 def test_hf_data():
    try:
        import ray
@@ -33,15 +38,15 @@ def test_hf_data():
    automl_settings = {
        "gpu_per_trial": 0,
        "max_iter": 3,
-        "time_budget": 20,
+        "time_budget": 5,
        "task": "seq-classification",
        "metric": "accuracy",
-        "model_history": True,
+        "log_file_name": "seqclass.log",
    }

    automl_settings["custom_hpo_args"] = {
        "model_path": "google/electra-small-discriminator",
-        "output_dir": "data/output/",
+        "output_dir": "test/data/output/",
        "ckpt_per_epoch": 5,
        "fp16": False,
    }
@@ -51,7 +56,6 @@ def test_hf_data():
    )
    automl = AutoML()
    automl.retrain_from_log(
-        log_file_name="flaml.log",
        X_train=X_train,
        y_train=y_train,
        train_full=True,
@@ -71,10 +75,6 @@ def test_hf_data():


 def _test_custom_data():
-    try:
-        import ray
-    except ImportError:
-        return
    from flaml import AutoML

    import pandas as pd
--- a/test/nlp/test_autohf_classificationhead.py
+++ b/test/nlp/test_autohf_classificationhead.py
@@ -1,8 +1,4 @@
 def test_classification_head():
-    try:
-        import ray
-    except ImportError:
-        return
    from flaml import AutoML

    from datasets import load_dataset
@@ -24,15 +20,14 @@ def test_classification_head():
    automl_settings = {
        "gpu_per_trial": 0,
        "max_iter": 3,
-        "time_budget": 20,
+        "time_budget": 5,
        "task": "seq-classification",
        "metric": "accuracy",
-        "model_history": True,
    }

    automl_settings["custom_hpo_args"] = {
        "model_path": "google/electra-small-discriminator",
-        "output_dir": "data/output/",
+        "output_dir": "test/data/output/",
        "ckpt_per_epoch": 5,
        "fp16": False,
    }
--- a/test/nlp/test_autohf_cv.py
+++ b/test/nlp/test_autohf_cv.py
@@ -1,8 +1,9 @@
+import sys
+import pytest
+
+
+@pytest.mark.skipif(sys.platform == "darwin", reason="do not run on mac os")
 def test_cv():
-    try:
-        import ray
-    except ImportError:
-        return
    from flaml import AutoML

    from datasets import load_dataset
@@ -22,16 +23,15 @@ def test_cv():
    automl_settings = {
        "gpu_per_trial": 0,
        "max_iter": 3,
-        "time_budget": 20,
+        "time_budget": 5,
        "task": "seq-classification",
        "metric": "accuracy",
        "n_splits": 3,
-        "model_history": True,
    }

    automl_settings["custom_hpo_args"] = {
        "model_path": "google/electra-small-discriminator",
-        "output_dir": "data/output/",
+        "output_dir": "test/data/output/",
        "ckpt_per_epoch": 1,
        "fp16": False,
    }
--- a/test/nlp/test_autohf_loadargs.py
+++ b/test/nlp/test_autohf_loadargs.py
--- a/test/nlp/test_autohf_regression.py
+++ b/test/nlp/test_autohf_regression.py
@@ -1,17 +1,18 @@
+import sys
+import pytest
+
+
+@pytest.mark.skipif(sys.platform == "darwin", reason="do not run on mac os")
 def test_regression():
-    try:
-        import ray
-    except ImportError:
-        return
    from flaml import AutoML

    from datasets import load_dataset

    train_dataset = (
-        load_dataset("glue", "stsb", split="train[:1%]").to_pandas().iloc[0:4]
+        load_dataset("glue", "stsb", split="train[:1%]").to_pandas().iloc[:20]
    )
    dev_dataset = (
-        load_dataset("glue", "stsb", split="train[1%:2%]").to_pandas().iloc[0:4]
+        load_dataset("glue", "stsb", split="train[1%:2%]").to_pandas().iloc[:20]
    )

    custom_sent_keys = ["sentence1", "sentence2"]
@@ -27,16 +28,16 @@ def test_regression():

    automl_settings = {
        "gpu_per_trial": 0,
-        "max_iter": 3,
-        "time_budget": 20,
+        "max_iter": 2,
+        "time_budget": 5,
        "task": "seq-regression",
        "metric": "rmse",
-        "model_history": True,
+        "starting_points": {"transformer": {"num_train_epochs": 1}},
    }

    automl_settings["custom_hpo_args"] = {
        "model_path": "google/electra-small-discriminator",
-        "output_dir": "data/output/",
+        "output_dir": "test/data/output/",
        "ckpt_per_epoch": 5,
        "fp16": False,
    }
@@ -44,3 +45,7 @@ def test_regression():
    automl.fit(
        X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings
    )
+
+
+if __name__ == "__main__":  # run the test when this file is executed as a script
+    test_regression()