diff --git a/README.md b/README.md
index f33ab1ca8..e57f7f648 100644
--- a/README.md
+++ b/README.md
@@ -127,7 +127,7 @@ print(automl.model.estimator)
 * A basic time series forecasting example.
 
 ```python
-# pip install flaml[ts_forecast]
+# pip install "flaml[ts_forecast]"
 import numpy as np
 from flaml import AutoML
 X_train = np.arange('2014-01', '2021-01', dtype='datetime64[M]')
@@ -148,7 +148,6 @@ print(automl.predict(X_train[72:]))
 from sklearn.datasets import fetch_openml
 from flaml import AutoML
 X_train, y_train = fetch_openml(name="credit-g", return_X_y=True, as_frame=False)
-y_train = y_train.cat.codes
 # not a real learning to rank dataset
 groups = [200] * 4 + [100] * 2  # group counts
 automl = AutoML()
@@ -161,31 +160,28 @@ automl.fit(
 * Fine-tuning a language model
 
 ```python
+# pip install "flaml[nlp]"
 from flaml import AutoML
 from datasets import load_dataset
 
 train_dataset = load_dataset("glue", "mrpc", split="train").to_pandas()
 dev_dataset = load_dataset("glue", "mrpc", split="validation").to_pandas()
 test_dataset = load_dataset("glue", "mrpc", split="test").to_pandas()
-
 custom_sent_keys = ["sentence1", "sentence2"]
 label_key = "label"
-
 X_train, y_train = train_dataset[custom_sent_keys], train_dataset[label_key]
 X_val, y_val = dev_dataset[custom_sent_keys], dev_dataset[label_key]
 X_test = test_dataset[custom_sent_keys]
 
 automl = AutoML()
 automl_settings = {
-    "max_iter": 3,
     "time_budget": 100,
-    "model_history": True,
-    "task": "seq-classification"
-}
-automl_settings["custom_hpo_args"] = {
-    "output_dir": "data/output/",
+    "task": "seq-classification",
+    "custom_hpo_args": {"output_dir": "data/output/"},
+    "gpu_per_trial": 1,  # set to 0 if no GPU is available
 }
 automl.fit(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings)
+automl.predict(X_test)
 ```
 
 More examples can be found in [notebooks](https://github.com/microsoft/FLAML/tree/main/notebook/).
diff --git a/flaml/automl.py b/flaml/automl.py
index 3cf66b226..7a6ccc5d8 100644
--- a/flaml/automl.py
+++ b/flaml/automl.py
@@ -3,6 +3,7 @@
 # * Licensed under the MIT License. See LICENSE file in the
 # * project root for license information.
 import time
+import os
 from typing import Callable, Optional
 from functools import partial
 import numpy as np
@@ -102,15 +103,14 @@ class SearchState:
         self.total_time_used = 0
         self.total_iter = 0
         self.base_eci = None
-        self.time_best_found = 0
+        self.time_best_found = self.time_best_found_old = 0
         self.time2eval_best = 0
         self.time2eval_best_old = 0
         self.trained_estimator = None
         self.sample_size = None
         self.trial_time = 0
 
-    def update(self, result, time_used, save_model_history=False):
-
+    def update(self, result, time_used):
         if result:
             config = result["config"]
             if config and "FLAML_sample_size" in config:
@@ -129,7 +129,6 @@ class SearchState:
                 )
                 if n_iter:
                     config[trained_estimator.ITER_HP] = n_iter
-
         else:
             obj, time2eval, trained_estimator = np.inf, 0.0, None
             metric_for_logging = config = None
@@ -155,7 +154,6 @@ class SearchState:
             self.trained_estimator
             and trained_estimator
             and self.trained_estimator != trained_estimator
-            and not save_model_history
         ):
             self.trained_estimator.cleanup()
         if trained_estimator:
@@ -262,6 +260,8 @@ class AutoMLState:
             self.log_training_metric,
             self.fit_kwargs,
         )
+        if self.retrain_final and not self.model_history:
+            trained_estimator.cleanup()
         if _is_nlp_task(self.task):
             del self.fit_kwargs["X_val"]
@@ -272,7 +272,7 @@ class AutoMLState:
             "wall_clock_time": time.time() - self._start_time_flag,
             "metric_for_logging": metric_for_logging,
             "val_loss": val_loss,
-            "trained_estimator": trained_estimator if self.save_model_history else None,
+            "trained_estimator": trained_estimator,
         }
         if sampled_weight is not None:
             self.fit_kwargs["sample_weight"] = weight
@@ -386,7 +386,7 @@ class AutoML:
             "time_budget": 60,
             "metric": 'accuracy',
             "task": 'classification',
-            "log_file_name": 'test/mylog.log',
+            "log_file_name": 'mylog.log',
         }
         automl.fit(X_train = X_train, y_train = y_train, **automl_settings)
@@ -395,17 +395,173 @@ class AutoML:
     from .version import __version__
 
-    def __init__(self):
+    def __init__(self, **settings):
+        """Constructor.
+
+        Many settings in fit() can be passed to the constructor too.
+        If an argument in fit() is provided, it will override the setting passed to the constructor.
+        If an argument in fit() is not provided but provided in the constructor, the value passed to the constructor will be used.
+
+        Args:
+            metric: A string of the metric name or a function,
+                e.g., 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo',
+                'f1', 'micro_f1', 'macro_f1', 'log_loss', 'mae', 'mse', 'r2',
+                'mape'. Default is 'auto'.
+                If passing a customized metric function, the function needs to
+                have the following signature:
+
+                .. code-block:: python
+
+                    def custom_metric(
+                        X_test, y_test, estimator, labels,
+                        X_train, y_train, weight_test=None, weight_train=None,
+                        config=None, groups_test=None, groups_train=None,
+                    ):
+                        return metric_to_minimize, metrics_to_log
+
+                which returns a float number as the minimization objective,
+                and a dictionary as the metrics to log.
+            task: A string of the task type, e.g.,
+                'classification', 'regression', 'ts_forecast', 'rank',
+                'seq-classification', 'seq-regression'.
+            n_jobs: An integer of the number of threads for training.
+            gpu_per_trial: A float of the number of gpus per trial, only used by TransformersEstimator.
+            log_file_name: A string of the log file name. To disable logging,
+                set it to be an empty string "".
+            estimator_list: A list of strings for estimator names, or 'auto',
+                e.g.,
+
+                .. code-block:: python
+
+                    ['lgbm', 'xgboost', 'catboost', 'rf', 'extra_tree']
+
+            time_budget: A float number of the time budget in seconds.
+                Use -1 if no time limit.
+            max_iter: An integer of the maximal number of iterations.
+            sample: A boolean of whether to sample the training data during
+                search.
+            ensemble: boolean or dict | default=False. Whether to perform
+                ensemble after search. Can be a dict with keys 'passthrough'
+                and 'final_estimator' to specify the passthrough and
+                final_estimator in the stacker.
+            eval_method: A string of resampling strategy, one of
+                ['auto', 'cv', 'holdout'].
+            split_ratio: A float of the validation data percentage for holdout.
+            n_splits: An integer of the number of folds for cross-validation.
+            log_type: A string of the log type, one of
+                ['better', 'all'].
+                'better' only logs configs with better loss than previous iterations;
+                'all' logs all the tried configs.
+            model_history: A boolean of whether to keep the best
+                model per estimator. Make sure memory is large enough if setting to True.
+            log_training_metric: A boolean of whether to log the training
+                metric for each model.
+            mem_thres: A float of the memory size constraint in bytes.
+            pred_time_limit: A float of the prediction latency constraint in seconds.
+            train_time_limit: A float of the training time constraint in seconds.
+            verbose: int, default=3 | Controls the verbosity, higher means more
+                messages.
+            retrain_full: bool or str, default=True | whether to retrain the
+                selected model on the full training data when using holdout.
+                True - retrain only after search finishes; False - no retraining;
+                'budget' - do best effort to retrain without violating the time
+                budget.
+            split_type: str, default="auto" | the data split type.
+                For classification tasks, valid choices are [
+                "auto", 'stratified', 'uniform', 'time']. "auto" -> stratified.
+                For regression tasks, valid choices are ["auto", 'uniform', 'time'].
+                "auto" -> uniform.
+                For ts_forecast tasks, must be "auto" or 'time'.
+                For ranking task, must be "auto" or 'group'.
+            hpo_method: str, default="auto" | The hyperparameter
+                optimization method. By default, CFO is used for sequential
+                search and BlendSearch is used for parallel search.
+                No need to set when using flaml's default search space or using
+                a simple customized search space. When set to 'bs', BlendSearch
+                is used. BlendSearch can be tried when the search space is
+                complex, for example, containing multiple disjoint, discontinuous
+                subspaces. When set to 'random', random search is used.
+            starting_points: A dictionary to specify the starting hyperparameter
+                config for the estimators.
+                Keys are the name of the estimators, and values are the starting
+                hyperparameter configurations for the corresponding estimators.
+                The value can be a single hyperparameter configuration dict or a list
+                of hyperparameter configuration dicts.
+                In the following code example, we get starting_points from the
+                automl_experiment and use them in the new_automl_experiment.
+                e.g.,
+
+                .. code-block:: python
+
+                    from flaml import AutoML
+                    automl_experiment = AutoML()
+                    X_train, y_train = load_iris(return_X_y=True)
+                    automl_experiment.fit(X_train, y_train)
+                    starting_points = automl_experiment.best_config_per_estimator
+
+                    new_automl_experiment = AutoML()
+                    new_automl_experiment.fit(X_train, y_train,
+                        starting_points=starting_points)
+
+            seed: int or None, default=None | The random seed for np.random.
+            n_concurrent_trials: [Experimental] int, default=1 | The number of
+                concurrent trials. For n_concurrent_trials > 1, installation of
+                ray is required: `pip install "flaml[ray]"`.
+            keep_search_state: boolean, default=False | Whether to keep search
+                state after fit(). By default the state is deleted for space
+                saving.
+            early_stop: boolean, default=False | Whether to stop early if the
+                search is considered to converge.
+            append_log: boolean, default=False | Whether to directly append the log
+                records to the input log file if it exists.
+            auto_augment: boolean, default=True | Whether to automatically
+                augment rare classes.
+            min_sample_size: int, default=MIN_SAMPLE_TRAIN | the minimal sample
+                size when sample=True.
+            use_ray: boolean, default=False | Whether to use ray to run the training
+                in separate processes. This can be used to prevent OOM for large
+                datasets, but will incur more overhead in time. Only use it if
+                you run into OOM failures.
+
+        """
         self._track_iter = 0
         self._state = AutoMLState()
         self._state.learner_classes = {}
-
-    @property
-    def model_history(self):
-        """A dictionary of iter->model, storing the models when
-        the best model is updated each time.
-        """
-        return self._model_history
+        self._settings = settings
+        settings["time_budget"] = settings.get("time_budget", 60)
+        settings["task"] = settings.get("task", "classification")
+        settings["n_jobs"] = settings.get("n_jobs", -1)
+        settings["gpu_per_trial"] = settings.get("gpu_per_trial", 0)
+        settings["eval_method"] = settings.get("eval_method", "auto")
+        settings["split_ratio"] = settings.get("split_ratio", SPLIT_RATIO)
+        settings["n_splits"] = settings.get("n_splits", N_SPLITS)
+        settings["auto_augment"] = settings.get("auto_augment", True)
+        settings["metric"] = settings.get("metric", "auto")
+        settings["estimator_list"] = settings.get("estimator_list", "auto")
+        settings["log_file_name"] = settings.get("log_file_name", "")
+        settings["max_iter"] = settings.get("max_iter", 1000000)
+        settings["sample"] = settings.get("sample", True)
+        settings["ensemble"] = settings.get("ensemble", False)
+        settings["log_type"] = settings.get("log_type", "better")
+        settings["model_history"] = settings.get("model_history", False)
+        settings["log_training_metric"] = settings.get("log_training_metric", False)
+        settings["mem_thres"] = settings.get("mem_thres", MEM_THRES)
+        settings["pred_time_limit"] = settings.get("pred_time_limit", np.inf)
+        settings["train_time_limit"] = settings.get("train_time_limit", np.inf)
+        settings["verbose"] = settings.get("verbose", 3)
+        settings["retrain_full"] = settings.get("retrain_full", True)
+        settings["split_type"] = settings.get("split_type", "auto")
+        settings["hpo_method"] = settings.get("hpo_method", "auto")
+        settings["learner_selector"] = settings.get("learner_selector", "sample")
+        settings["starting_points"] = settings.get("starting_points", {})
+        settings["n_concurrent_trials"] = settings.get("n_concurrent_trials", 1)
+        settings["keep_search_state"] = settings.get("keep_search_state", False)
+        settings["early_stop"] = settings.get("early_stop", False)
+        settings["append_log"] = settings.get("append_log", False)
+        settings["min_sample_size"] = settings.get("min_sample_size", MIN_SAMPLE_TRAIN)
+        settings["use_ray"] = settings.get("use_ray", False)
 
     @property
     def config_history(self):
@@ -1022,11 +1178,11 @@ class AutoML:
         y_train=None,
         dataframe=None,
         label=None,
-        time_budget=0,
-        task="classification",
-        eval_method="auto",
-        split_ratio=SPLIT_RATIO,
-        n_splits=N_SPLITS,
+        time_budget=np.inf,
+        task=None,
+        eval_method=None,
+        split_ratio=None,
+        n_splits=None,
         split_type=None,
         groups=None,
         n_jobs=-1,
@@ -1034,7 +1190,7 @@ class AutoML:
         train_best=True,
         train_full=False,
         record_id=-1,
-        auto_augment=True,
+        auto_augment=None,
         **fit_kwargs,
     ):
         """Retrain from log file.
@@ -1059,18 +1215,19 @@ class AutoML:
                 If not, dataframe and label must be provided.
             time_budget: A float number of the time budget in seconds.
             task: A string of the task type, e.g.,
-                'classification', 'regression', 'ts_forecast', 'rank'.
+                'classification', 'regression', 'ts_forecast', 'rank',
+                'seq-classification', 'seq-regression'.
             eval_method: A string of resampling strategy, one of
                 ['auto', 'cv', 'holdout'].
             split_ratio: A float of the validation data percentage for holdout.
             n_splits: An integer of the number of folds for cross-validation.
-            split_type: str or None, default=None | the data split type.
+            split_type: str, default="auto" | the data split type.
                 For classification tasks, valid choices are [
-                None, 'stratified', 'uniform', 'time', 'group']. None -> stratified.
-                For regression tasks, valid choices are [None, 'uniform', 'time'].
-                None -> uniform.
-                For ts_forecast tasks, must be None or 'time'.
-                For ranking task, must be None or 'group'.
+                "auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified.
+                For regression tasks, valid choices are ["auto", 'uniform', 'time'].
+                "auto" -> uniform.
+                For ts_forecast tasks, must be "auto" or 'time'.
+                For ranking task, must be "auto" or 'group'.
             groups: None or array-like | Group labels (with matching length
                 to y_train) or groups counts (with sum equal to length of y_train)
                 for training data.
@@ -1090,11 +1247,15 @@ class AutoML:
             **fit_kwargs: Other keyword arguments to pass to fit() function of
                 the searched learners, such as sample_weight.
         """
-        if task == FORECAST:
-            self._state.task = TS_FORECAST
-        else:
-            self._state.task = task
-
+        task = task or self._settings.get("task")
+        eval_method = eval_method or self._settings.get("eval_method")
+        split_ratio = split_ratio or self._settings.get("split_ratio")
+        n_splits = n_splits or self._settings.get("n_splits")
+        split_type = split_type or self._settings.get("split_type")
+        auto_augment = (
+            self._settings.get("auto_augment") if auto_augment is None else auto_augment
+        )
+        self._state.task = TS_FORECAST if task == FORECAST else task
         self._state.fit_kwargs = fit_kwargs
         self._validate_data(X_train, y_train, dataframe, label, groups=groups)
@@ -1182,15 +1343,17 @@ class AutoML:
             len(np.unique(self._y_train_all))
         )
         if self._state.task in CLASSIFICATION:
-            assert split_type in [None, "stratified", "uniform", "time", "group"]
+            assert split_type in ["auto", "stratified", "uniform", "time", "group"]
             self._split_type = (
-                split_type or self._state.groups is None and "stratified" or "group"
+                split_type
+                if split_type != "auto"
+                else self._state.groups is None and "stratified" or "group"
             )
         elif self._state.task in REGRESSION:
-            assert split_type in [None, "uniform", "time", "group"]
-            self._split_type = split_type or "uniform"
+            assert split_type in ["auto", "uniform", "time", "group"]
+            self._split_type = split_type if split_type != "auto" else "uniform"
         elif self._state.task == TS_FORECAST:
-            assert split_type in [None, "time"]
+            assert split_type in ["auto", "time"]
             self._split_type = "time"
             assert isinstance(
                 self._state.fit_kwargs.get("period"), int
@@ -1199,7 +1362,7 @@ class AutoML:
             assert (
                 self._state.groups is not None
            ), "groups must be specified for ranking task."
-            assert split_type in [None, "group"]
+            assert split_type in ["auto", "group"]
             self._split_type = "group"
 
     def _decide_eval_method(self, time_budget):
@@ -1410,44 +1573,44 @@ class AutoML:
         y_train=None,
         dataframe=None,
         label=None,
-        metric="auto",
-        task="classification",
-        n_jobs=-1,
+        metric=None,
+        task=None,
+        n_jobs=None,
         gpu_per_trial=0,
-        log_file_name="flaml.log",
-        estimator_list="auto",
-        time_budget=60,
-        max_iter=1000000,
-        sample=True,
-        ensemble=False,
-        eval_method="auto",
-        log_type="better",
-        model_history=False,
-        split_ratio=SPLIT_RATIO,
-        n_splits=N_SPLITS,
-        log_training_metric=False,
-        mem_thres=MEM_THRES,
-        pred_time_limit=np.inf,
-        train_time_limit=np.inf,
+        log_file_name=None,
+        estimator_list=None,
+        time_budget=None,
+        max_iter=None,
+        sample=None,
+        ensemble=None,
+        eval_method=None,
+        log_type=None,
+        model_history=None,
+        split_ratio=None,
+        n_splits=None,
+        log_training_metric=None,
+        mem_thres=None,
+        pred_time_limit=None,
+        train_time_limit=None,
         X_val=None,
         y_val=None,
         sample_weight_val=None,
         groups_val=None,
         groups=None,
-        verbose=3,
-        retrain_full=True,
+        verbose=None,
+        retrain_full=None,
         split_type=None,
-        learner_selector="sample",
+        learner_selector=None,
         hpo_method=None,
-        starting_points={},
+        starting_points=None,
         seed=None,
-        n_concurrent_trials=1,
-        keep_search_state=False,
-        early_stop=False,
-        append_log=False,
-        auto_augment=True,
-        min_sample_size=MIN_SAMPLE_TRAIN,
-        use_ray=False,
+        n_concurrent_trials=None,
+        keep_search_state=None,
+        early_stop=None,
+        append_log=None,
+        auto_augment=None,
+        min_sample_size=None,
+        use_ray=None,
         **fit_kwargs,
     ):
         """Find a model for a given task.
@@ -1470,7 +1633,7 @@ class AutoML:
             metric: A string of the metric name or a function,
                 e.g., 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo',
                 'f1', 'micro_f1', 'macro_f1', 'log_loss', 'mae', 'mse', 'r2',
-                'mape'.
+                'mape'. Default is 'auto'.
                 If passing a customized metric function, the function needs to
                 have the following signature:
@@ -1486,10 +1649,12 @@ class AutoML:
                 which returns a float number as the minimization objective,
                 and a dictionary as the metrics to log.
             task: A string of the task type, e.g.,
-                'classification', 'regression', 'ts_forecast', 'rank'.
+                'classification', 'regression', 'ts_forecast', 'rank',
+                'seq-classification', 'seq-regression'.
             n_jobs: An integer of the number of threads for training.
             gpu_per_trial: A float of the number of gpus per trial, only used by TransformersEstimator.
-            log_file_name: A string of the log file name.
+            log_file_name: A string of the log file name. To disable logging,
+                set it to be an empty string "".
             estimator_list: A list of strings for estimator names, or 'auto'
                 e.g.,
 
@@ -1498,6 +1663,7 @@ class AutoML:
                     ['lgbm', 'xgboost', 'catboost', 'rf', 'extra_tree']
 
             time_budget: A float number of the time budget in seconds.
+                Use -1 if no time limit.
             max_iter: An integer of the maximal number of iterations.
             sample: A boolean of whether to sample the training data during
                 search.
@@ -1513,9 +1679,8 @@ class AutoML:
                 ['better', 'all'].
                 'better' only logs configs with better loss than previous iterations;
                 'all' logs all the tried configs.
-            model_history: A boolean of whether to keep the history of best
-                models in the history property. Make sure memory is large
-                enough if setting to True.
+            model_history: A boolean of whether to keep the best
+                model per estimator. Make sure memory is large enough if setting to True.
             log_training_metric: A boolean of whether to log the training
                 metric for each model.
             mem_thres: A float of the memory size constraint in bytes.
@@ -1538,14 +1703,14 @@ class AutoML:
                 True - retrain only after search finishes; False - no retraining;
                 'budget' - do best effort to retrain without violating the time
                 budget.
-            split_type: str or None, default=None | the data split type.
+            split_type: str, default="auto" | the data split type.
                 For classification tasks, valid choices are [
-                None, 'stratified', 'uniform', 'time']. None -> stratified.
-                For regression tasks, valid choices are [None, 'uniform', 'time'].
-                None -> uniform.
-                For ts_forecast tasks, must be None or 'time'.
-                For ranking task, must be None or 'group'.
-            hpo_method: str or None, default=None | The hyperparameter
+                "auto", 'stratified', 'uniform', 'time']. "auto" -> stratified.
+                For regression tasks, valid choices are ["auto", 'uniform', 'time'].
+                "auto" -> uniform.
+                For ts_forecast tasks, must be "auto" or 'time'.
+                For ranking task, must be "auto" or 'group'.
+            hpo_method: str, default="auto" | The hyperparameter
                 optimization method. By default, CFO is used for sequential
                 search and BlendSearch is used for parallel search.
                 No need to set when using flaml's default search space or using
@@ -1600,10 +1765,74 @@ class AutoML:
         """
         self._state._start_time_flag = self._start_time_flag = time.time()
-        if task == FORECAST:
-            self._state.task = TS_FORECAST
-        else:
-            self._state.task = task
+        task = task or self._settings.get("task")
+        time_budget = time_budget or self._settings.get("time_budget")
+        n_jobs = n_jobs or self._settings.get("n_jobs")
+        gpu_per_trial = (
+            self._settings.get("gpu_per_trial")
+            if gpu_per_trial is None
+            else gpu_per_trial
+        )
+        eval_method = eval_method or self._settings.get("eval_method")
+        split_ratio = split_ratio or self._settings.get("split_ratio")
+        n_splits = n_splits or self._settings.get("n_splits")
+        auto_augment = (
+            self._settings.get("auto_augment") if auto_augment is None else auto_augment
+        )
+        metric = metric or self._settings.get("metric")
+        estimator_list = estimator_list or self._settings.get("estimator_list")
+        log_file_name = (
+            self._settings.get("log_file_name")
+            if log_file_name is None
+            else log_file_name
+        )
+        max_iter = self._settings.get("max_iter") if max_iter is None else max_iter
+        sample = self._settings.get("sample") if sample is None else sample
+        ensemble = self._settings.get("ensemble") if ensemble is None else ensemble
+        log_type = log_type or self._settings.get("log_type")
+        model_history = (
+            self._settings.get("model_history")
+            if model_history is None
+            else model_history
+        )
+        log_training_metric = (
+            self._settings.get("log_training_metric")
+            if log_training_metric is None
+            else log_training_metric
+        )
+        mem_thres = mem_thres or self._settings.get("mem_thres")
+        pred_time_limit = pred_time_limit or self._settings.get("pred_time_limit")
+        train_time_limit = train_time_limit or self._settings.get("train_time_limit")
+        verbose = self._settings.get("verbose") if verbose is None else verbose
+        retrain_full = (
+            self._settings.get("retrain_full") if retrain_full is None else retrain_full
+        )
+        split_type = split_type or self._settings.get("split_type")
+        hpo_method = hpo_method or self._settings.get("hpo_method")
+        learner_selector = learner_selector or self._settings.get("learner_selector")
+        starting_points = (
+            self._settings.get("starting_points")
+            if starting_points is None
+            else starting_points
+        )
+        n_concurrent_trials = n_concurrent_trials or self._settings.get(
+            "n_concurrent_trials"
+        )
+        keep_search_state = (
self._settings.get("keep_search_state") + if keep_search_state is None + else keep_search_state + ) + early_stop = ( + self._settings.get("early_stop") if early_stop is None else early_stop + ) + append_log = ( + self._settings.get("append_log") if append_log is None else append_log + ) + min_sample_size = min_sample_size or self._settings.get("min_sample_size") + use_ray = self._settings.get("use_ray") if use_ray is None else use_ray + + self._state.task = TS_FORECAST if task == FORECAST else task self._state.log_training_metric = log_training_metric self._state.fit_kwargs = fit_kwargs @@ -1634,13 +1863,24 @@ class AutoML: self._state.eval_method = eval_method logger.info("Evaluation method: {}".format(eval_method)) + self._state.n_jobs = n_jobs + self._n_concurrent_trials = n_concurrent_trials + self._early_stop = early_stop + self._use_ray = use_ray or n_concurrent_trials > 1 + # use the following condition if we have an estimation of average_trial_time and average_trial_overhead + # self._use_ray = use_ray or n_concurrent_trials > ( average_trail_time + average_trial_overhead) / (average_trial_time) + self._state.resources_per_trial = ( + {"cpu": int(os.cpu_count() / n_concurrent_trials), "gpu": gpu_per_trial} + if n_jobs < 0 + else {"cpu": n_jobs, "gpu": gpu_per_trial} + ) self._retrain_in_budget = retrain_full == "budget" and ( eval_method == "holdout" and self._state.X_val is None ) - self._retrain_final = ( + self._state.retrain_final = ( retrain_full is True and eval_method == "holdout" - and self._state.X_val is None + and (self._state.X_val is None or self._use_ray) or eval_method == "cv" and (max_iter > 0 or retrain_full is True) or max_iter == 1 @@ -1728,7 +1968,7 @@ class AutoML: ) logger.info("List of ML learners in AutoML Run: {}".format(estimator_list)) self.estimator_list = estimator_list - self._state.time_budget = time_budget or 1e10 + self._state.time_budget = time_budget if time_budget > 0 else 1e10 self._active_estimators = estimator_list.copy() self._ensemble = ensemble self._max_iter = max_iter @@ -1737,24 +1977,15 @@ class AutoML: self._state.train_time_limit = train_time_limit self._log_type = log_type self.split_ratio = split_ratio - self._state.save_model_history = model_history - self._state.n_jobs = n_jobs - import os - - self._state.resources_per_trial = ( - {"cpu": int(os.cpu_count() / n_concurrent_trials), "gpu": gpu_per_trial} - if self._state.n_jobs < 0 - else {"cpu": self._state.n_jobs, "gpu": gpu_per_trial} - ) - self._n_concurrent_trials = n_concurrent_trials - self._early_stop = early_stop - self._use_ray = use_ray or n_concurrent_trials > 1 - # use the following condition if we have an estimation of average_trial_time and average_trial_overhead - # self._use_ray = use_ray or n_concurrent_trials > ( average_trail_time + average_trial_overhead) / (average_trial_time) - self._hpo_method = hpo_method or ( - "bs" - if n_concurrent_trials > 1 or self._use_ray and len(estimator_list) > 1 - else "cfo" + self._state.model_history = model_history + self._hpo_method = ( + hpo_method + if hpo_method != "auto" + else ( + "bs" + if n_concurrent_trials > 1 or self._use_ray and len(estimator_list) > 1 + else "cfo" + ) ) if log_file_name: with training_log_writer(log_file_name, append_log) as save_helper: @@ -1770,7 +2001,7 @@ class AutoML: ) if ( self._hpo_method in ("cfo", "bs") - and (self._time_taken_best_iter >= time_budget * 0.7) + and (self._time_taken_best_iter >= self._state.time_budget * 0.7) and not all( state.search_alg and 
                        and state.search_alg.searcher.is_ls_ever_converged
                        for state in self._search_states.values()
                    )
                ):
                    logger.warning(
                        "Time taken to find the best model is {0:.0f}% of the "
                        "provided time budget and not all estimators' hyperparameter "
                        "search converged. Consider increasing the time budget.".format(
-                            self._time_taken_best_iter / time_budget * 100
+                            self._time_taken_best_iter / self._state.time_budget * 100
                        )
                    )
 
@@ -1791,9 +2022,6 @@ class AutoML:
             del self._state.y_train, self._state.y_train_all, self._state.y_val
             del self._sample_weight_full, self._state.fit_kwargs
             del self._state.groups, self._state.groups_all, self._state.groups_val
-            for state in self._search_states.values():
-                if state.trained_estimator:
-                    del state.trained_estimator
         # if verbose == 0:
         logger.setLevel(old_level)
 
@@ -1819,7 +2047,7 @@ class AutoML:
             else:
                 raise NotImplementedError(
                     f"hpo_method={self._hpo_method} is not recognized. "
-                    "'cfo' and 'bs' are supported."
+                    "'auto', 'cfo' and 'bs' are supported."
                 )
         space = self.search_space
         if self._hpo_method == "random":
@@ -1890,7 +2118,7 @@ class AutoML:
                 config = result["config"]
                 estimator = config.get("ml", config)["learner"]
                 search_state = self._search_states[estimator]
-                search_state.update(result, 0, self._state.save_model_history)
+                search_state.update(result, 0)
                 if result["wall_clock_time"] is not None:
                     self._state.time_from_start = result["wall_clock_time"]
                 if search_state.sample_size == self._state.data_size:
@@ -1905,10 +2133,6 @@ class AutoML:
                             config,
                             self._time_taken_best_iter,
                        )
-                        if self._state.save_model_history:
-                            self._model_history[
-                                _track_iter
-                            ] = search_state.trained_estimator
                        self._trained_estimator = search_state.trained_estimator
                        self._best_iteration = _track_iter
                        self._time_taken_best_iter = self._state.time_from_start
@@ -1961,7 +2185,7 @@ class AutoML:
        better = True  # whether we find a better model in one trial
        if self._ensemble:
            self.best_model = {}
-        if self._max_iter < 2 and self.estimator_list and self._retrain_final:
+        if self._max_iter < 2 and self.estimator_list and self._state.retrain_final:
            # when max_iter is 1, no need to search
            # TODO: otherwise, need to make sure SearchStates.init_config is inside search space
            self._max_iter = 0
@@ -2077,11 +2301,7 @@ class AutoML:
                better = False
                if analysis.trials:
                    result = analysis.trials[-1].last_result
-                    search_state.update(
-                        result,
-                        time_used=time_used,
-                        save_model_history=self._state.save_model_history,
-                    )
+                    search_state.update(result, time_used=time_used)
                    if self._estimator_index is None:
                        # update init eci estimate
                        eci_base = search_state.init_eci
@@ -2123,27 +2343,22 @@ class AutoML:
                            search_state.best_config,
                            self._state.time_from_start,
                        )
-                        if self._state.save_model_history:
-                            self._model_history[
-                                self._track_iter
-                            ] = search_state.trained_estimator
-                        elif self._trained_estimator:
+                        if self._trained_estimator:
+                            self._trained_estimator.cleanup()
                            del self._trained_estimator
                            self._trained_estimator = None
-                        if not self._retrain_final:
+                        if not self._state.retrain_final:
                            self._trained_estimator = search_state.trained_estimator
                        self._best_iteration = self._track_iter
                        self._time_taken_best_iter = self._state.time_from_start
                        better = True
                        next_trial_time = search_state.time2eval_best
-                    if search_state.trained_estimator and not (
-                        self._state.save_model_history or self._ensemble
+                    if (
+                        search_state.trained_estimator
+                        and not self._state.model_history
+                        and search_state.trained_estimator != self._trained_estimator
                    ):
-                        # free RAM
-                        if search_state.trained_estimator != self._trained_estimator:
-                            search_state.trained_estimator.cleanup()
-                        del search_state.trained_estimator
-                        search_state.trained_estimator = None
+                        search_state.trained_estimator.cleanup()
                    if better or self._log_type == "all":
                        if self._training_log:
                            self._training_log.append(
@@ -2260,7 +2475,6 @@ class AutoML:
        self._estimator_index = None
        self._best_iteration = 0
        self._time_taken_best_iter = 0
-        self._model_history = {}
        self._config_history = {}
        self._max_iter_per_learner = 1000000  # TODO
        self._iter_per_learner = dict([(e, 0) for e in self.estimator_list])
@@ -2371,7 +2585,7 @@ class AutoML:
                    self._trained_estimator.model = stacker
                else:
                    raise e
-        elif self._retrain_final:
+        elif self._state.retrain_final:
            # reset time budget for retraining
            if self._max_iter > 1:
                self._state.time_from_start -= self._state.time_budget
diff --git a/flaml/model.py b/flaml/model.py
index bf2f3108b..96586bbd4 100644
--- a/flaml/model.py
+++ b/flaml/model.py
@@ -227,7 +227,8 @@ class BaseEstimator:
        return self._model.predict_proba(X_test)
 
    def cleanup(self):
-        pass
+        del self._model
+        self._model = None
 
    @classmethod
    def search_space(cls, **params):
@@ -282,7 +283,7 @@ class BaseEstimator:
class TransformersEstimator(BaseEstimator):
    """The class for fine-tuning language models, using huggingface transformers API."""
 
-    ITER_HP = "final_global_step"
+    ITER_HP = "global_max_steps"
 
    def __init__(self, task="seq-classification", **config):
        super().__init__(task, **config)
@@ -301,7 +302,7 @@ class TransformersEstimator(BaseEstimator):
                "domain": tune.loguniform(lower=1e-6, upper=1e-3),
            },
            "num_train_epochs": {
-                "domain": tune.loguniform(lower=0.5, upper=10.0),
+                "domain": tune.loguniform(lower=0.1, upper=10.0),
            },
            "per_device_train_batch_size": {
                "domain": tune.choice([4, 8, 16, 32]),
@@ -316,7 +317,7 @@ class TransformersEstimator(BaseEstimator):
                "domain": tune.loguniform(lower=1e-8, upper=1e-6),
            },
            "seed": {"domain": tune.choice(list(range(40, 45)))},
-            "final_global_step": {"domain": sys.maxsize},
+            "global_max_steps": {"domain": sys.maxsize},
        }
 
    def _init_hpo_args(self, automl_fit_kwargs: dict = None):
@@ -356,18 +357,27 @@ class TransformersEstimator(BaseEstimator):
            def on_step_end(self, args, state, control, **callback_kwargs):
                if state.global_step == 1:
                    self.time_per_iter = time.time() - self.step_begin_time
-                if budget:
-                    if (
+                if (
+                    budget
+                    and (
                        time.time() + self.time_per_iter
                        > self.train_begin_time + budget
-                    ):
-                        control.should_training_stop = True
-                        control.should_save = True
-                        control.should_evaluate = True
-                if state.global_step >= this_params[TransformersEstimator.ITER_HP]:
+                    )
+                    or state.global_step >= this_params[TransformersEstimator.ITER_HP]
+                ):
                    control.should_training_stop = True
+                    control.should_save = True
+                    control.should_evaluate = True
                return control
 
+            def on_epoch_end(self, args, state, control, **callback_kwargs):
+                if (
+                    control.should_training_stop
+                    or state.epoch + 1 >= this_params["num_train_epochs"]
+                ):
+                    control.should_save = True
+                    control.should_evaluate = True
+
        import transformers
        from transformers import TrainingArguments
        from transformers.trainer_utils import set_seed
@@ -467,36 +477,30 @@ class TransformersEstimator(BaseEstimator):
 
        trainer.train()
 
-        if eval_dataset is not None:
-            # if validation data is non empty, select the best checkpoint and save the final global step to self.params
-
-            self.params[self.ITER_HP] = trainer.state.global_step
-            if trainer.state.global_step > max(trainer.ckpt_to_global_step.values()):
-                trainer.evaluate()
-
-            self._checkpoint_path = self._select_checkpoint(
-                trainer.ckpt_to_metric, trainer.ckpt_to_global_step
-            )
-
-        else:
-            # if validation dataset is empty, save the last checkpoint
-            self._checkpoint_path = self._save_last_checkpoint(trainer)
+        self.params[self.ITER_HP] = trainer.state.global_step
+        self._checkpoint_path = self._select_checkpoint(trainer)
 
        self._kwargs = kwargs
        self._num_labels = num_labels
        self._per_model_config = per_model_config
 
-    def _save_last_checkpoint(self, trainer):
-        this_ckpt = trainer.save_state()
-        self.params[self.ITER_HP] = trainer.state.global_step
-        return this_ckpt
+    def _select_checkpoint(self, trainer):
+        if trainer.ckpt_to_metric:
+            best_ckpt, _ = min(
+                trainer.ckpt_to_metric.items(), key=lambda x: x[1][self._metric_name]
+            )
+            best_ckpt_global_step = trainer.ckpt_to_global_step[best_ckpt]
+        else:
+            best_ckpt_global_step = trainer.state.global_step
+        from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
 
-    def _select_checkpoint(self, ckpt_to_score, ckpt_to_global_step):
-        best_ckpt, best_score = min(
-            ckpt_to_score.items(), key=lambda x: x[1][self._metric_name]
-        )
-        best_ckpt_global_step = ckpt_to_global_step[best_ckpt]
+        best_ckpt = os.path.join(
+            trainer.args.output_dir,
+            f"{PREFIX_CHECKPOINT_DIR}-{best_ckpt_global_step}",
+        )
        self.params[self.ITER_HP] = best_ckpt_global_step
        return best_ckpt
 
    def _compute_metrics_by_dataset_name(self, eval_pred):
@@ -1339,6 +1343,7 @@ class Prophet(SKLearnEstimator):
        cols = list(train_df)
        cols.remove(TS_TIMESTAMP_COL)
        cols.remove(TS_VALUE_COL)
+        logging.getLogger("prophet").setLevel(logging.WARNING)
        model = Prophet(**self.params)
        for regressor in cols:
            model.add_regressor(regressor)
@@ -1405,9 +1410,8 @@ class ARIMA(Prophet):
        current_time = time.time()
        train_df = self._join(X_train, y_train)
        train_df = self._preprocess(train_df)
-        cols = list(train_df)
-        cols.remove(TS_VALUE_COL)
-        regressors = cols
+        regressors = list(train_df)
+        regressors.remove(TS_VALUE_COL)
        if regressors:
            model = ARIMA_estimator(
                train_df[[TS_VALUE_COL]],
@@ -1434,14 +1438,12 @@ class ARIMA(Prophet):
        if isinstance(X_test, int):
            forecast = self._model.forecast(steps=X_test)
        elif isinstance(X_test, DataFrame):
-            first_col = X_test.pop(TS_TIMESTAMP_COL)
-            X_test.insert(0, TS_TIMESTAMP_COL, first_col)
-            start = X_test.iloc[0, 0]
-            end = X_test.iloc[-1, 0]
+            start = X_test[TS_TIMESTAMP_COL].iloc[0]
+            end = X_test[TS_TIMESTAMP_COL].iloc[-1]
            if len(X_test.columns) > 1:
+                X_test = self._preprocess(X_test.drop(columns=TS_TIMESTAMP_COL))
                regressors = list(X_test)
-                regressors.remove(TS_TIMESTAMP_COL)
-                X_test = self._preprocess(X_test)
                forecast = self._model.predict(
                    start=start, end=end, exog=X_test[regressors]
                )
diff --git a/flaml/nlp/README.md b/flaml/nlp/README.md
index 35e42af95..071632481 100644
--- a/flaml/nlp/README.md
+++ b/flaml/nlp/README.md
@@ -6,7 +6,6 @@ An example:
 
 ```python
 from flaml import AutoML
-
 import pandas as pd
 
 train_dataset = pd.read_csv("data/input/train.tsv", delimiter="\t", quoting=3)
diff --git a/flaml/nlp/huggingface/trainer.py b/flaml/nlp/huggingface/trainer.py
index 2b8ae8817..2eb3a4c5c 100644
--- a/flaml/nlp/huggingface/trainer.py
+++ b/flaml/nlp/huggingface/trainer.py
@@ -7,54 +7,25 @@ except ImportError:
 
 
 class TrainerForAuto(TFTrainer):
-    def evaluate(self, eval_dataset=None):
-        """
-        Overriding transformers.Trainer.evaluate by saving state with save_state
-
-        Args:
-            eval_dataset:
-                the dataset to be evaluated
-        """
-
-        if self.eval_dataset is not None:
-            eval_dataloader = self.get_eval_dataloader(self.eval_dataset)
-            output = self.prediction_loop(eval_dataloader, description="Evaluation")
-            self.log(output.metrics)
-
-            ckpt_dir = self.save_state()
-
-            for key in list(output.metrics.keys()):
-                if key.startswith("eval_"):
-                    output.metrics[key[5:]] = output.metrics.pop(key)
-
-            if hasattr(self, "ckpt_to_global_step"):
-                self.ckpt_to_metric[ckpt_dir] = output.metrics
-                self.ckpt_to_global_step[ckpt_dir] = self.state.global_step
-            else:
-                self.ckpt_to_global_step = {ckpt_dir: self.state.global_step}
-                self.ckpt_to_metric = {ckpt_dir: output.metrics}
-
-    def save_state(self):
-        """
-        Overriding transformers.Trainer.save_state. It is only through saving
-        the states can best_trial.get_best_checkpoint return a non-empty value.
-        """
-        import torch
+    def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_prefix="eval"):
+        """Overriding transformers.Trainer.evaluate by saving metrics and checkpoint path"""
        from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
-        from ray import tune
 
-        with tune.checkpoint_dir(step=self.state.global_step) as checkpoint_dir:
-            self.args.output_dir = checkpoint_dir
-            # This is the directory name that Huggingface requires.
-            output_dir = os.path.join(
-                self.args.output_dir,
-                f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}",
-            )
-            self.save_model(output_dir)
-            torch.save(
-                self.optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")
-            )
-            torch.save(
-                self.lr_scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")
-            )
-        return output_dir
+        ckpt_dir = os.path.join(
+            self.args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}"
+        )
+        eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset
+        metrics = eval_dataset and super().evaluate(
+            eval_dataset, ignore_keys, metric_key_prefix
+        )
+        if metrics:
+            for key in list(metrics.keys()):
+                if key.startswith("eval_"):
+                    metrics[key[5:]] = metrics.pop(key)
+        if hasattr(self, "ckpt_to_global_step"):
+            self.ckpt_to_global_step[ckpt_dir] = self.state.global_step
+            if metrics:
+                self.ckpt_to_metric[ckpt_dir] = metrics
+        else:
+            self.ckpt_to_global_step = {ckpt_dir: self.state.global_step}
+            self.ckpt_to_metric = {ckpt_dir: metrics} if metrics else {}
diff --git a/flaml/version.py b/flaml/version.py
index bc8c296f6..777f190df 100644
--- a/flaml/version.py
+++ b/flaml/version.py
@@ -1 +1 @@
-__version__ = "0.7.2"
+__version__ = "0.8.0"
diff --git a/notebook/flaml_forecast.ipynb b/notebook/flaml_forecast.ipynb
index 405aa126c..14e6f327f 100644
--- a/notebook/flaml_forecast.ipynb
+++ b/notebook/flaml_forecast.ipynb
@@ -119,9 +119,9 @@
    "source": [
     "'''The main flaml automl API'''\n",
     "automl.fit(dataframe=train_df,  # training data\n",
-    "           label='co2',  # For 'forecast' task, label should be a tuple of strings for timestamp and value columns\n",
-    "           **settings, \n",
-    "           period=time_horizon)  # key word argument 'period' must be included for forecast task)"
+    "           label='co2',  # label column\n",
+    "           period=time_horizon,  # the keyword argument 'period' must be included for the forecast task\n",
+    "           **settings)"
    ]
   },
   {
@@ -173,8 +173,8 @@
    "source": [
     "''' compute predictions of testing dataset '''\n",
     "flaml_y_pred = automl.predict(X_test)\n",
-    "print('Predicted labels', flaml_y_pred)\n",
-    "print('True labels', y_test)"
+    "print(f\"Predicted labels\\n{flaml_y_pred}\")\n",
+    "print(f\"True labels\\n{y_test}\")"
    ]
   },
   {
diff --git a/test/test_classification.py b/test/automl/test_classification.py
similarity index 97%
rename from test/test_classification.py
rename to test/automl/test_classification.py
index 6e0515431..29507e08f 100644
--- a/test/test_classification.py
+++ b/test/automl/test_classification.py
@@ -216,7 +216,7 @@ class TestClassification(unittest.TestCase):
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
-        print(automl_experiment.model_history)
+        print(automl_experiment.best_model_for_estimator("xgboost"))
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
@@ -253,7 +253,7 @@ class TestClassification(unittest.TestCase):
            print(automl_experiment.predict(X_train))
            print(automl_experiment.model)
            print(automl_experiment.config_history)
-            print(automl_experiment.model_history)
+            print(automl_experiment.best_model_for_estimator("xgboost"))
            print(automl_experiment.best_iteration)
            print(automl_experiment.best_estimator)
        except ImportError:
@@ -286,7 +286,7 @@ class TestClassification(unittest.TestCase):
            print(automl_experiment.predict(X_train))
            print(automl_experiment.model)
            print(automl_experiment.config_history)
-            print(automl_experiment.model_history)
+            print(automl_experiment.best_model_for_estimator("large_lgbm"))
            print(automl_experiment.best_iteration)
            print(automl_experiment.best_estimator)
        except ImportError:
@@ -314,7 +314,7 @@ class TestClassification(unittest.TestCase):
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
-        print(automl_experiment.model_history)
+        print(automl_experiment.best_model_for_estimator("lrl2"))
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
diff --git a/test/test_forecast.py b/test/automl/test_forecast.py
similarity index 100%
rename from test/test_forecast.py
rename to test/automl/test_forecast.py
diff --git a/test/test_multiclass.py b/test/automl/test_multiclass.py
similarity index 98%
rename from test/test_multiclass.py
rename to test/automl/test_multiclass.py
index 096fc6288..9c80e66a5 100644
--- a/test/test_multiclass.py
+++ b/test/automl/test_multiclass.py
@@ -198,7 +198,7 @@ class TestMultiClass(unittest.TestCase):
        print(automl_experiment.classes_)
        print(automl_experiment.model)
        print(automl_experiment.config_history)
-        print(automl_experiment.model_history)
+        print(automl_experiment.best_model_for_estimator("rf"))
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
        automl_experiment = AutoML()
@@ -238,13 +238,13 @@ class TestMultiClass(unittest.TestCase):
        print(automl_experiment.predict(X_train)[:5])
        print(automl_experiment.model)
        print(automl_experiment.config_history)
-        print(automl_experiment.model_history)
+        print(automl_experiment.best_model_for_estimator("catboost"))
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
        del automl_settings["metric"]
        del automl_settings["model_history"]
        del automl_settings["log_training_metric"]
-        automl_experiment = AutoML()
+        automl_experiment = AutoML(task="classification")
        duration = automl_experiment.retrain_from_log(
            log_file_name=automl_settings["log_file_name"],
            X_train=X_train,
@@ -333,7 +333,7 @@ class TestMultiClass(unittest.TestCase):
        print(automl_experiment.predict_proba(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
-        print(automl_experiment.model_history)
+        print(automl_experiment.best_model_for_estimator("extra_tree"))
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
@@ -343,7 +343,7 @@ class TestMultiClass(unittest.TestCase):
            learner_name="large_lgbm", learner_class=MyLargeLGBM
        )
        automl_settings = {
-            "time_budget": None,
+            "time_budget": -1,
            "task": "classification",
            "log_file_name": "test/classification_oom.log",
            "estimator_list": ["large_lgbm"],
diff --git a/test/test_notebook_example.py b/test/automl/test_notebook_example.py
similarity index 100%
rename from test/test_notebook_example.py
rename to test/automl/test_notebook_example.py
diff --git a/test/test_python_log.py b/test/automl/test_python_log.py
similarity index 100%
rename from test/test_python_log.py
rename to test/automl/test_python_log.py
diff --git a/test/test_regression.py b/test/automl/test_regression.py
similarity index 85%
rename from test/test_regression.py
rename to test/automl/test_regression.py
index e33110bf2..44bf20615 100644
--- a/test/test_regression.py
+++ b/test/automl/test_regression.py
@@ -56,7 +56,7 @@ class TestRegression(unittest.TestCase):
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
-        print(automl_experiment.model_history)
+        print(automl_experiment.best_model_for_estimator("xgboost"))
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
        print(get_output_from_log(automl_settings["log_file_name"], 1))
@@ -77,28 +77,6 @@ class TestRegression(unittest.TestCase):
            time_budget=0,
        )
 
-    def test_sparse_matrix_classification(self):
-        automl_experiment = AutoML()
-        automl_settings = {
-            "time_budget": 2,
-            "metric": "auto",
-            "task": "classification",
-            "log_file_name": "test/sparse_classification.log",
-            "split_type": "uniform",
-            "n_jobs": 1,
-            "model_history": True,
-        }
-        X_train = scipy.sparse.random(1554, 21, dtype=int)
-        y_train = np.random.randint(3, size=1554)
-        automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
-        print(automl_experiment.classes_)
-        print(automl_experiment.predict_proba(X_train))
-        print(automl_experiment.model)
-        print(automl_experiment.config_history)
-        print(automl_experiment.model_history)
-        print(automl_experiment.best_iteration)
-        print(automl_experiment.best_estimator)
-
    def test_sparse_matrix_regression(self):
        X_train = scipy.sparse.random(300, 900, density=0.0001)
        y_train = np.random.uniform(size=300)
@@ -127,7 +105,7 @@ class TestRegression(unittest.TestCase):
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
-        print(automl_experiment.model_history)
+        print(automl_experiment.best_model_for_estimator("rf"))
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
        print(automl_experiment.best_config)
@@ -151,7 +129,7 @@ class TestRegression(unittest.TestCase):
            print(automl_experiment.predict(X_train))
            print(automl_experiment.model)
            print(automl_experiment.config_history)
-            print(automl_experiment.model_history)
+            print(automl_experiment.best_model_for_estimator("xgboost"))
            print(automl_experiment.best_iteration)
            print(automl_experiment.best_estimator)
        except ImportError:
@@ -176,7 +154,7 @@ class TestRegression(unittest.TestCase):
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
-        print(automl_experiment.model_history)
+        print(automl_experiment.best_model_for_estimator("rf"))
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
 
@@ -209,7 +187,7 @@ class TestRegression(unittest.TestCase):
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
-        print(automl_experiment.model_history)
+        print(automl_experiment.best_model_for_estimator("my_xgb2"))
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
        print(automl_experiment.best_config)
diff --git a/test/test_split.py b/test/automl/test_split.py
similarity index 100%
rename from test/test_split.py
rename to test/automl/test_split.py
diff --git a/test/test_training_log.py b/test/automl/test_training_log.py
similarity index 97%
rename from test/test_training_log.py
rename to test/automl/test_training_log.py
index 3c32310a0..ac98c495c 100644
--- a/test/test_training_log.py
+++ b/test/automl/test_training_log.py
@@ -30,7 +30,6 @@ class TestTrainingLog(unittest.TestCase):
            # "ensemble": True,
            "keep_search_state": True,
            "estimator_list": estimator_list,
-            "model_history": True,
        }
        X_train, y_train = fetch_california_housing(return_X_y=True)
        automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
@@ -85,7 +84,7 @@ class TestTrainingLog(unittest.TestCase):
                    count += 1
                self.assertGreater(count, 0)
 
-        automl_settings["log_file_name"] = None
+        automl_settings["log_file_name"] = ""
        automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
        automl._selected.update(None, 0)
        automl = AutoML()
diff --git a/test/test_xgboost2d.py b/test/automl/test_xgboost2d.py
similarity index 99%
rename from test/test_xgboost2d.py
rename to test/automl/test_xgboost2d.py
index 2eebf911f..a73b5b68e 100644
--- a/test/test_xgboost2d.py
+++ b/test/automl/test_xgboost2d.py
@@ -2,7 +2,6 @@ import unittest
 
 from sklearn.datasets import fetch_openml
 from sklearn.model_selection import train_test_split
-import numpy as np
 from flaml.automl import AutoML
 from flaml.model import XGBoostSklearnEstimator
 from flaml import tune
diff --git a/test/test_xgboost2d_sample_size.py b/test/automl/test_xgboost2d_sample_size.py
similarity index 97%
rename from test/test_xgboost2d_sample_size.py
rename to test/automl/test_xgboost2d_sample_size.py
index eabf69b74..bdc7f8c1e 100644
--- a/test/test_xgboost2d_sample_size.py
+++ b/test/automl/test_xgboost2d_sample_size.py
@@ -2,7 +2,6 @@ import unittest
 
 from sklearn.datasets import fetch_openml
 from sklearn.model_selection import train_test_split
-import numpy as np
 from flaml.automl import AutoML
 from flaml.model import XGBoostSklearnEstimator
 from flaml import tune
@@ -44,7 +43,6 @@ def _test_simple(method=None, size_ratio=1.0):
        # "metric": 'accuracy',
        "task": "classification",
        "log_file_name": f"test/xgboost2d_{dataset}_{method}_{final_size}.log",
-        # "model_history": True,
        # "log_training_metric": True,
        # "split_type": split_type,
        "n_jobs": 1,
diff --git a/test/test_autohf.py b/test/nlp/test_autohf.py
similarity index 93%
rename from test/test_autohf.py
rename to test/nlp/test_autohf.py
index aa03ad60a..d734d10ac 100644
--- a/test/test_autohf.py
+++ b/test/nlp/test_autohf.py
@@ -1,3 +1,8 @@
+import os
+import pytest
+
+
+@pytest.mark.skipif(os.name == "posix", reason="do not run on mac os")
 def test_hf_data():
    try:
        import ray
@@ -33,15 +38,15 @@ def test_hf_data():
    automl_settings = {
        "gpu_per_trial": 0,
        "max_iter": 3,
-        "time_budget": 20,
+        "time_budget": 5,
        "task": "seq-classification",
        "metric": "accuracy",
-        "model_history": True,
+        "log_file_name": "seqclass.log",
    }
 
    automl_settings["custom_hpo_args"] = {
        "model_path": "google/electra-small-discriminator",
-        "output_dir": "data/output/",
+        "output_dir": "test/data/output/",
"ckpt_per_epoch": 5, "fp16": False, } @@ -51,7 +56,6 @@ def test_hf_data(): ) automl = AutoML() automl.retrain_from_log( - log_file_name="flaml.log", X_train=X_train, y_train=y_train, train_full=True, @@ -71,10 +75,6 @@ def test_hf_data(): def _test_custom_data(): - try: - import ray - except ImportError: - return from flaml import AutoML import pandas as pd diff --git a/test/test_autohf_classificationhead.py b/test/nlp/test_autohf_classificationhead.py similarity index 85% rename from test/test_autohf_classificationhead.py rename to test/nlp/test_autohf_classificationhead.py index d32adbd53..c81cd1069 100644 --- a/test/test_autohf_classificationhead.py +++ b/test/nlp/test_autohf_classificationhead.py @@ -1,8 +1,4 @@ def test_classification_head(): - try: - import ray - except ImportError: - return from flaml import AutoML from datasets import load_dataset @@ -24,15 +20,14 @@ def test_classification_head(): automl_settings = { "gpu_per_trial": 0, "max_iter": 3, - "time_budget": 20, + "time_budget": 5, "task": "seq-classification", "metric": "accuracy", - "model_history": True, } automl_settings["custom_hpo_args"] = { "model_path": "google/electra-small-discriminator", - "output_dir": "data/output/", + "output_dir": "test/data/output/", "ckpt_per_epoch": 5, "fp16": False, } diff --git a/test/test_autohf_cv.py b/test/nlp/test_autohf_cv.py similarity index 82% rename from test/test_autohf_cv.py rename to test/nlp/test_autohf_cv.py index 6a0ecfdac..0e75a32ca 100644 --- a/test/test_autohf_cv.py +++ b/test/nlp/test_autohf_cv.py @@ -1,8 +1,9 @@ +import os +import pytest + + +@pytest.mark.skipif(os.name == "posix", reason="do not run on mac os") def test_cv(): - try: - import ray - except ImportError: - return from flaml import AutoML from datasets import load_dataset @@ -22,16 +23,15 @@ def test_cv(): automl_settings = { "gpu_per_trial": 0, "max_iter": 3, - "time_budget": 20, + "time_budget": 5, "task": "seq-classification", "metric": "accuracy", "n_splits": 3, - "model_history": True, } automl_settings["custom_hpo_args"] = { "model_path": "google/electra-small-discriminator", - "output_dir": "data/output/", + "output_dir": "test/data/output/", "ckpt_per_epoch": 1, "fp16": False, } diff --git a/test/test_autohf_loadargs.py b/test/nlp/test_autohf_loadargs.py similarity index 100% rename from test/test_autohf_loadargs.py rename to test/nlp/test_autohf_loadargs.py diff --git a/test/test_autohf_regression.py b/test/nlp/test_autohf_regression.py similarity index 73% rename from test/test_autohf_regression.py rename to test/nlp/test_autohf_regression.py index fffeb8d0e..b2c4cd6bf 100644 --- a/test/test_autohf_regression.py +++ b/test/nlp/test_autohf_regression.py @@ -1,17 +1,18 @@ +import os +import pytest + + +@pytest.mark.skipif(os.name == "posix", reason="do not run on mac os") def test_regression(): - try: - import ray - except ImportError: - return from flaml import AutoML from datasets import load_dataset train_dataset = ( - load_dataset("glue", "stsb", split="train[:1%]").to_pandas().iloc[0:4] + load_dataset("glue", "stsb", split="train[:1%]").to_pandas().iloc[:20] ) dev_dataset = ( - load_dataset("glue", "stsb", split="train[1%:2%]").to_pandas().iloc[0:4] + load_dataset("glue", "stsb", split="train[1%:2%]").to_pandas().iloc[:20] ) custom_sent_keys = ["sentence1", "sentence2"] @@ -27,16 +28,16 @@ def test_regression(): automl_settings = { "gpu_per_trial": 0, - "max_iter": 3, - "time_budget": 20, + "max_iter": 2, + "time_budget": 5, "task": "seq-regression", "metric": "rmse", - "model_history": 
+        "starting_points": {"transformer": {"num_train_epochs": 1}},
    }
 
    automl_settings["custom_hpo_args"] = {
        "model_path": "google/electra-small-discriminator",
-        "output_dir": "data/output/",
+        "output_dir": "test/data/output/",
        "ckpt_per_epoch": 5,
        "fp16": False,
    }
 
    automl.fit(
        X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings
    )
+
+
+if __name__ == "__main__":
+    test_regression()
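
Usage note: the headline change in this patch is that `AutoML.__init__` now accepts every `fit()` setting as a constructor default, with a value passed to `fit()` taking precedence. Below is a minimal sketch of that precedence, using the `load_iris` toy data from the new constructor docstring; the specific settings chosen here are illustrative only.

```python
from sklearn.datasets import load_iris
from flaml import AutoML

X_train, y_train = load_iris(return_X_y=True)

# Defaults supplied at construction time...
automl = AutoML(task="classification", time_budget=10, metric="accuracy")
# ...apply when fit() omits them; an argument passed to fit() overrides them,
# so this run uses time_budget=5 while keeping task and metric from the constructor.
automl.fit(X_train=X_train, y_train=y_train, time_budget=5)
print(automl.best_estimator)
```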
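Migration note: the `model_history` property (an iter -> model dictionary) is removed by this patch. Judging from the updated tests, the best model kept per estimator when `model_history=True` is now retrieved via `best_model_for_estimator`. A sketch, reusing `X_train`/`y_train` from the previous example; the `"lgbm"` estimator name is just an example:

```python
# model_history=True asks FLAML to keep the best trained model per estimator.
automl = AutoML(task="classification", time_budget=10, model_history=True)
automl.fit(X_train=X_train, y_train=y_train)
print(automl.best_model_for_estimator("lgbm"))  # best trained model for one estimator
print(automl.model)  # the overall best model
```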