refactoring TransformersEstimator to support default and custom_hp (#511)

* refactoring TransformersEstimator to support default and custom_hp

* handling starting_points not in search space

* addressing starting point more than max_iter

* fixing upper < lower bug
This commit is contained in:
Xueqing Liu
2022-04-28 14:06:29 -04:00
committed by GitHub
parent d03038bfcb
commit ca35fa969f
52 changed files with 2639 additions and 2160 deletions

View File

@@ -4,7 +4,8 @@
# * project root for license information.
import time
import os
from typing import Callable, Optional, List, Union
from typing import Callable, Optional, List, Union, Any
import inspect
from functools import partial
import numpy as np
from scipy.sparse import issparse
@@ -74,8 +75,46 @@ class SearchState:
self.total_time_used - self.time_best_found,
)
def valid_starting_point_one_dim(self, value_one_dim, domain_one_dim):
    """Check one hyperparameter of a starting point against its search-space domain.

    For each hp in the starting point, the following conditions are checked:
    (1) If the type of the starting point does not match the required type
        in the search space, return False.
    (2) If the starting point is not inside the required search space,
        return False.
    (3) If the search space entry is a plain value instead of a Domain and
        that value is not equal to the starting point, return False.
    Notice (2) includes the case where the starting point is not in a
    user-specified search space (custom_hp).

    Args:
        value_one_dim: The starting-point value for a single hyperparameter.
        domain_one_dim: The search-space spec for that hyperparameter —
            either a flaml.tune sample.Domain or a fixed value.

    Returns:
        bool: True if the value is valid for the domain, False otherwise.
    """
    # Imported lazily to avoid a module-level import cycle with flaml.tune.
    from .tune.space import sample

    if isinstance(domain_one_dim, sample.Domain):
        # The first parameter annotation of Domain.is_valid encodes the
        # value type this domain accepts (e.g. int, float, Any).
        renamed_type = list(
            inspect.signature(domain_one_dim.is_valid).parameters.values()
        )[0].annotation
        type_match = renamed_type == Any or isinstance(value_one_dim, renamed_type)
        if not (type_match and domain_one_dim.is_valid(value_one_dim)):
            return False
    elif value_one_dim != domain_one_dim:
        # Domain is a fixed value: the starting point must equal it exactly.
        return False
    return True
def valid_starting_point(self, starting_point, search_space):
    """Return True iff the whole starting point lies inside the search space.

    A starting configuration is only usable when *every* hyperparameter in
    it is valid for the corresponding domain in `search_space`; a single
    out-of-range dimension invalidates the entire configuration (the caller
    then drops the starting point with a warning). The previous `any(...)`
    aggregate accepted configurations with just one valid dimension, which
    contradicts that contract.

    Args:
        starting_point: dict mapping hyperparameter name -> starting value.
        search_space: dict mapping hyperparameter name -> spec dict with a
            "domain" entry (a sample.Domain or a fixed value).

    Returns:
        bool: True if all dimensions of the starting point are valid.
    """
    # Generator (not a materialized list) so the check short-circuits on
    # the first invalid dimension.
    return all(
        self.valid_starting_point_one_dim(value, search_space[name].get("domain"))
        for name, value in starting_point.items()
    )
def __init__(
self, learner_class, data_size, task, starting_point=None, period=None
self,
learner_class,
data_size,
task,
starting_point=None,
period=None,
custom_hp=None,
max_iter=None,
):
self.init_eci = learner_class.cost_relative2lgbm()
self._search_space_domain = {}
@@ -91,13 +130,42 @@ class SearchState:
)
else:
search_space = learner_class.search_space(data_size=data_size, task=task)
if custom_hp is not None:
search_space.update(custom_hp)
if (
isinstance(starting_point, dict)
and max_iter
> 1 # If the number of starting point is larger than max iter, avoid the checking
and not self.valid_starting_point(starting_point, search_space)
):
logger.warning(
"Starting point {} removed because it is outside of the search space".format(
starting_point
)
)
starting_point = None
elif isinstance(starting_point, list) and max_iter > len(
starting_point
): # If the number of starting point is larger than max iter, avoid the checking
starting_point_len = len(starting_point)
starting_point = [
x for x in starting_point if self.valid_starting_point(x, search_space)
]
if starting_point_len > len(starting_point):
logger.warning(
"Starting points outside of the search space are removed. "
f"Remaining starting points: {starting_point}"
)
starting_point = starting_point or None
for name, space in search_space.items():
assert (
"domain" in space
), f"{name}'s domain is missing in the search space spec {space}"
self._search_space_domain[name] = space["domain"]
if "init_value" in space:
self.init_config[name] = space["init_value"]
if "low_cost_init_value" in space:
self.low_cost_partial_config[name] = space["low_cost_init_value"]
if "cat_hp_cost" in space:
@@ -109,9 +177,20 @@ class SearchState:
and starting_point.get(name) is not None
):
self.init_config[name] = starting_point[name]
elif (
not isinstance(starting_point, list)
and "init_value" in space
and self.valid_starting_point_one_dim(
space["init_value"], space["domain"]
)
): # If starting point is list, no need to check the validity of self.init_config w.r.t search space
self.init_config[name] = space[
"init_value"
] # If starting_point is list, no need to assign value to self.init_config here
if isinstance(starting_point, list):
self.init_config = starting_point
self._hp_names = list(self._search_space_domain.keys())
self.search_alg = None
self.best_config = None
@@ -202,7 +281,9 @@ class AutoMLState:
else:
sampled_X_train = self.X_train[:sample_size]
sampled_y_train = self.y_train[:sample_size]
weight = self.fit_kwargs.get("sample_weight")
weight = self.fit_kwargs.get(
"sample_weight"
) # NOTE: _prepare_sample_train_data is before
if weight is not None:
sampled_weight = weight[:sample_size]
if self.groups is not None:
@@ -210,7 +291,9 @@ class AutoMLState:
else:
sampled_X_train = self.X_train_all
sampled_y_train = self.y_train_all
if "sample_weight" in self.fit_kwargs:
if (
"sample_weight" in self.fit_kwargs
): # NOTE: _prepare_sample_train_data is before
sampled_weight = self.sample_weight_all
if self.groups is not None:
groups = self.groups_all
@@ -222,6 +305,10 @@ class AutoMLState:
sample_size = int(config_w_resource["FLAML_sample_size"])
else:
sample_size = state.data_size[0]
this_estimator_kwargs = state.fit_kwargs_by_estimator.get(
estimator
).copy() # NOTE: _compute_with_config_base is after
(
sampled_X_train,
sampled_y_train,
@@ -229,12 +316,10 @@ class AutoMLState:
groups,
) = state._prepare_sample_train_data(sample_size)
if sampled_weight is not None:
weight = state.fit_kwargs["sample_weight"]
state.fit_kwargs["sample_weight"] = sampled_weight
else:
weight = None
weight = this_estimator_kwargs["sample_weight"]
this_estimator_kwargs["sample_weight"] = sampled_weight
if groups is not None:
state.fit_kwargs["groups"] = groups
this_estimator_kwargs["groups"] = groups
config = config_w_resource.copy()
if "FLAML_sample_size" in config:
del config["FLAML_sample_size"]
@@ -275,15 +360,11 @@ class AutoMLState:
state.n_jobs,
state.learner_classes.get(estimator),
state.log_training_metric,
state.fit_kwargs,
this_estimator_kwargs,
)
if state.retrain_final and not state.model_history:
trained_estimator.cleanup()
if _is_nlp_task(state.task):
del state.fit_kwargs["X_val"]
del state.fit_kwargs["y_val"]
result = {
"pred_time": pred_time,
"wall_clock_time": time.time() - state._start_time_flag,
@@ -292,7 +373,7 @@ class AutoMLState:
"trained_estimator": trained_estimator,
}
if sampled_weight is not None:
state.fit_kwargs["sample_weight"] = weight
this_estimator_kwargs["sample_weight"] = weight
tune.report(**result)
return result
@@ -311,6 +392,10 @@ class AutoMLState:
del config["FLAML_sample_size"]
if "learner" in config:
del config["learner"]
this_estimator_kwargs = self.fit_kwargs_by_estimator.get(
estimator
).copy() # NOTE: _train_with_config is after
(
sampled_X_train,
sampled_y_train,
@@ -318,12 +403,16 @@ class AutoMLState:
groups,
) = self._prepare_sample_train_data(sample_size)
if sampled_weight is not None:
weight = self.fit_kwargs["sample_weight"]
self.fit_kwargs["sample_weight"] = sampled_weight
else:
weight = None
weight = this_estimator_kwargs[
"sample_weight"
] # NOTE: _train_with_config is after
this_estimator_kwargs[
"sample_weight"
] = sampled_weight # NOTE: _train_with_config is after
if groups is not None:
self.fit_kwargs["groups"] = groups
this_estimator_kwargs[
"groups"
] = groups # NOTE: _train_with_config is after
budget = (
None
@@ -340,12 +429,14 @@ class AutoMLState:
n_jobs=self.n_jobs,
estimator_class=self.learner_classes.get(estimator),
budget=budget,
fit_kwargs=self.fit_kwargs,
fit_kwargs=this_estimator_kwargs, # NOTE: _train_with_config is after
eval_metric=self.metric if hasattr(self, "metric") else "train_time",
)
if sampled_weight is not None:
self.fit_kwargs["sample_weight"] = weight
this_estimator_kwargs[
"sample_weight"
] = weight # NOTE: _train_with_config is after
return estimator, train_time
@@ -384,169 +475,200 @@ class AutoML(BaseEstimator):
def __init__(self, **settings):
"""Constructor.
Many settings in fit() can be passed to the constructor too.
If an argument in fit() is provided, it will override the setting passed to the constructor.
If an argument in fit() is not provided but provided in the constructor, the value passed to the constructor will be used.
Many settings in fit() can be passed to the constructor too.
If an argument in fit() is provided, it will override the setting passed to the constructor.
If an argument in fit() is not provided but provided in the constructor, the value passed to the constructor will be used.
Args:
metric: A string of the metric name or a function,
e.g., 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo',
'f1', 'micro_f1', 'macro_f1', 'log_loss', 'mae', 'mse', 'r2',
'mape'. Default is 'auto'.
If passing a customized metric function, the function needs to
have the following signature:
```python
def custom_metric(
X_test, y_test, estimator, labels,
X_train, y_train, weight_test=None, weight_train=None,
config=None, groups_test=None, groups_train=None,
):
return metric_to_minimize, metrics_to_log
```
which returns a float number as the minimization objective,
and a dictionary as the metrics to log. E.g.,
```python
def custom_metric(
X_val, y_val, estimator, labels,
X_train, y_train, weight_val=None, weight_train=None,
*args,
):
from sklearn.metrics import log_loss
import time
Args:
metric: A string of the metric name or a function,
e.g., 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo',
'f1', 'micro_f1', 'macro_f1', 'log_loss', 'mae', 'mse', 'r2',
'mape'. Default is 'auto'.
If passing a customized metric function, the function needs to
have the following signature:
start = time.time()
y_pred = estimator.predict_proba(X_val)
pred_time = (time.time() - start) / len(X_val)
val_loss = log_loss(y_val, y_pred, labels=labels, sample_weight=weight_val)
y_pred = estimator.predict_proba(X_train)
train_loss = log_loss(y_train, y_pred, labels=labels, sample_weight=weight_train)
alpha = 0.5
return val_loss * (1 + alpha) - alpha * train_loss, {
"val_loss": val_loss,
"train_loss": train_loss,
"pred_time": pred_time,
}
```
task: A string of the task type, e.g.,
'classification', 'regression', 'ts_forecast', 'rank',
'seq-classification', 'seq-regression', 'summarization'.
n_jobs: An integer of the number of threads for training | default=-1.
Use all available resources when n_jobs == -1.
log_file_name: A string of the log file name | default="". To disable logging,
set it to be an empty string "".
estimator_list: A list of strings for estimator names, or 'auto'
e.g., ```['lgbm', 'xgboost', 'xgb_limitdepth', 'catboost', 'rf', 'extra_tree']```
time_budget: A float number of the time budget in seconds.
Use -1 if no time limit.
max_iter: An integer of the maximal number of iterations.
sample: A boolean of whether to sample the training data during
search.
ensemble: boolean or dict | default=False. Whether to perform
ensemble after search. Can be a dict with keys 'passthrough'
and 'final_estimator' to specify the passthrough and
final_estimator in the stacker.
eval_method: A string of resampling strategy, one of
['auto', 'cv', 'holdout'].
split_ratio: A float of the validation data percentage for holdout.
n_splits: An integer of the number of folds for cross - validation.
log_type: A string of the log type, one of
['better', 'all'].
'better' only logs configs with better loss than previous iters
'all' logs all the tried configs.
model_history: A boolean of whether to keep the best
model per estimator. Make sure memory is large enough if setting to True.
log_training_metric: A boolean of whether to log the training
metric for each model.
mem_thres: A float of the memory size constraint in bytes.
pred_time_limit: A float of the prediction latency constraint in seconds.
It refers to the average prediction time per row in validation data.
train_time_limit: A float of the training time constraint in seconds.
verbose: int, default=3 | Controls the verbosity, higher means more
messages.
retrain_full: bool or str, default=True | whether to retrain the
selected model on the full training data when using holdout.
True - retrain only after search finishes; False - no retraining;
'budget' - do best effort to retrain without violating the time
budget.
split_type: str or splitter object, default="auto" | the data split type.
* A valid splitter object is an instance of a derived class of scikit-learn
[KFold](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold)
and have ``split`` and ``get_n_splits`` methods with the same signatures.
Set eval_method to "cv" to use the splitter object.
* Valid str options depend on different tasks.
For classification tasks, valid choices are
["auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified.
For regression tasks, valid choices are ["auto", 'uniform', 'time'].
"auto" -> uniform.
For ts_forecast tasks, must be "auto" or 'time'.
For ranking task, must be "auto" or 'group'.
hpo_method: str, default="auto" | The hyperparameter
optimization method. By default, CFO is used for sequential
search and BlendSearch is used for parallel search.
No need to set when using flaml's default search space or using
a simple customized search space. When set to 'bs', BlendSearch
is used. BlendSearch can be tried when the search space is
complex, for example, containing multiple disjoint, discontinuous
subspaces. When set to 'random', random search is used.
starting_points: A dictionary or a str to specify the starting hyperparameter
config for the estimators | default="static".
If str:
- if "data", use data-dependent defaults;
- if "data:path" use data-dependent defaults which are stored at path;
- if "static", use data-independent defaults.
If dict, keys are the name of the estimators, and values are the starting
hyperparameter configurations for the corresponding estimators.
The value can be a single hyperparameter configuration dict or a list
of hyperparameter configuration dicts.
In the following code example, we get starting_points from the
`automl` object and use them in the `new_automl` object.
e.g.,
```python
def custom_metric(
X_test, y_test, estimator, labels,
X_train, y_train, weight_test=None, weight_train=None,
config=None, groups_test=None, groups_train=None,
):
return metric_to_minimize, metrics_to_log
```
which returns a float number as the minimization objective,
and a dictionary as the metrics to log. E.g.,
```python
def custom_metric(
X_val, y_val, estimator, labels,
X_train, y_train, weight_val=None, weight_train=None,
*args,
):
from sklearn.metrics import log_loss
import time
start = time.time()
y_pred = estimator.predict_proba(X_val)
pred_time = (time.time() - start) / len(X_val)
val_loss = log_loss(y_val, y_pred, labels=labels, sample_weight=weight_val)
y_pred = estimator.predict_proba(X_train)
train_loss = log_loss(y_train, y_pred, labels=labels, sample_weight=weight_train)
alpha = 0.5
return val_loss * (1 + alpha) - alpha * train_loss, {
"val_loss": val_loss,
"train_loss": train_loss,
"pred_time": pred_time,
}
```
task: A string of the task type, e.g.,
'classification', 'regression', 'ts_forecast', 'rank',
'seq-classification', 'seq-regression', 'summarization'.
n_jobs: An integer of the number of threads for training | default=-1.
Use all available resources when n_jobs == -1.
log_file_name: A string of the log file name | default="". To disable logging,
set it to be an empty string "".
estimator_list: A list of strings for estimator names, or 'auto'
e.g., ```['lgbm', 'xgboost', 'xgb_limitdepth', 'catboost', 'rf', 'extra_tree']```
time_budget: A float number of the time budget in seconds.
Use -1 if no time limit.
max_iter: An integer of the maximal number of iterations.
sample: A boolean of whether to sample the training data during
search.
ensemble: boolean or dict | default=False. Whether to perform
ensemble after search. Can be a dict with keys 'passthrough'
and 'final_estimator' to specify the passthrough and
final_estimator in the stacker.
eval_method: A string of resampling strategy, one of
['auto', 'cv', 'holdout'].
split_ratio: A float of the validation data percentage for holdout.
n_splits: An integer of the number of folds for cross - validation.
log_type: A string of the log type, one of
['better', 'all'].
'better' only logs configs with better loss than previous iters
'all' logs all the tried configs.
model_history: A boolean of whether to keep the best
model per estimator. Make sure memory is large enough if setting to True.
log_training_metric: A boolean of whether to log the training
metric for each model.
mem_thres: A float of the memory size constraint in bytes.
pred_time_limit: A float of the prediction latency constraint in seconds.
It refers to the average prediction time per row in validation data.
train_time_limit: A float of the training time constraint in seconds.
verbose: int, default=3 | Controls the verbosity, higher means more
messages.
retrain_full: bool or str, default=True | whether to retrain the
selected model on the full training data when using holdout.
True - retrain only after search finishes; False - no retraining;
'budget' - do best effort to retrain without violating the time
budget.
split_type: str or splitter object, default="auto" | the data split type.
* A valid splitter object is an instance of a derived class of scikit-learn
[KFold](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold)
and have ``split`` and ``get_n_splits`` methods with the same signatures.
Set eval_method to "cv" to use the splitter object.
* Valid str options depend on different tasks.
For classification tasks, valid choices are
["auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified.
For regression tasks, valid choices are ["auto", 'uniform', 'time'].
"auto" -> uniform.
For ts_forecast tasks, must be "auto" or 'time'.
For ranking task, must be "auto" or 'group'.
hpo_method: str, default="auto" | The hyperparameter
optimization method. By default, CFO is used for sequential
search and BlendSearch is used for parallel search.
No need to set when using flaml's default search space or using
a simple customized search space. When set to 'bs', BlendSearch
is used. BlendSearch can be tried when the search space is
complex, for example, containing multiple disjoint, discontinuous
subspaces. When set to 'random', random search is used.
starting_points: A dictionary or a str to specify the starting hyperparameter
config for the estimators | default="static".
If str:
- if "data", use data-dependent defaults;
- if "data:path" use data-dependent defaults which are stored at path;
- if "static", use data-independent defaults.
If dict, keys are the name of the estimators, and values are the starting
hyperparameter configurations for the corresponding estimators.
The value can be a single hyperparameter configuration dict or a list
of hyperparameter configuration dicts.
In the following code example, we get starting_points from the
`automl` object and use them in the `new_automl` object.
e.g.,
```python
from flaml import AutoML
automl = AutoML()
X_train, y_train = load_iris(return_X_y=True)
automl.fit(X_train, y_train)
starting_points = automl.best_config_per_estimator
new_automl = AutoML()
new_automl.fit(X_train, y_train, starting_points=starting_points)
```
seed: int or None, default=None | The random seed for hpo.
n_concurrent_trials: [Experimental] int, default=1 | The number of
concurrent trials. When n_concurrent_trials > 1, flaml performs
[parallel tuning](https://microsoft.github.io/FLAML/docs/Use-Cases/Task-Oriented-AutoML#parallel-tuning)
and installation of ray is required: `pip install flaml[ray]`.
keep_search_state: boolean, default=False | Whether to keep data needed
for model search after fit(). By default the state is deleted for
space saving.
early_stop: boolean, default=False | Whether to stop early if the
search is considered to converge.
append_log: boolean, default=False | Whether to directly append the log
records to the input log file if it exists.
auto_augment: boolean, default=True | Whether to automatically
augment rare classes.
min_sample_size: int, default=MIN_SAMPLE_TRAIN | the minimal sample
size when sample=True.
use_ray: boolean, default=False | Whether to use ray to run the training
in separate processes. This can be used to prevent OOM for large
datasets, but will incur more overhead in time. Only use it if
you run into OOM failures.
metric_constraints: list, default=[] | The list of metric constraints.
Each element in this list is a 3-tuple, which shall be expressed
in the following format: the first element of the 3-tuple is the name of the
metric, the second element is the inequality sign chosen from ">=" and "<=",
and the third element is the constraint value. E.g., `('val_loss', '<=', 0.1)`.
Note that all the metric names in metric_constraints need to be reported via
the metrics_to_log dictionary returned by a customized metric function.
The customized metric function shall be provided via the `metric` key word
argument of the fit() function or the automl constructor.
Find an example in the 4th constraint type in this [doc](https://microsoft.github.io/FLAML/docs/Use-Cases/Task-Oriented-AutoML#constraint).
If `pred_time_limit` is provided as one of keyword arguments to fit() function or
the automl constructor, flaml will automatically (and under the hood)
add it as an additional element in the metric_constraints. Essentially 'pred_time_limit'
specifies a constraint about the prediction latency constraint in seconds.
custom_hp: dict, default=None | The custom search space specified by user
Each key is the estimator name, each value is a dict of the custom search space for that estimator. Notice the
domain of the custom search space can either be a value or a sample.Domain object.
e.g.,
```python
from flaml import AutoML
automl = AutoML()
X_train, y_train = load_iris(return_X_y=True)
automl.fit(X_train, y_train)
starting_points = automl.best_config_per_estimator
custom_hp = {
"transformer_ms": {
"model_path": {
"domain": "albert-base-v2",
},
"learning_rate": {
"domain": tune.choice([1e-4, 1e-5]),
}
}
}
```
fit_kwargs_by_estimator: dict, default=None | The user specified keywords arguments, grouped by estimator name.
e.g.,
new_automl = AutoML()
new_automl.fit(X_train, y_train, starting_points=starting_points)
```
seed: int or None, default=None | The random seed for hpo.
n_concurrent_trials: [Experimental] int, default=1 | The number of
concurrent trials. When n_concurrent_trials > 1, flaml performs
[parallel tuning](https://microsoft.github.io/FLAML/docs/Use-Cases/Task-Oriented-AutoML#parallel-tuning)
and installation of ray is required: `pip install flaml[ray]`.
keep_search_state: boolean, default=False | Whether to keep data needed
for model search after fit(). By default the state is deleted for
space saving.
early_stop: boolean, default=False | Whether to stop early if the
search is considered to converge.
append_log: boolean, default=False | Whether to directly append the log
records to the input log file if it exists.
auto_augment: boolean, default=True | Whether to automatically
augment rare classes.
min_sample_size: int, default=MIN_SAMPLE_TRAIN | the minimal sample
size when sample=True.
use_ray: boolean, default=False | Whether to use ray to run the training
in separate processes. This can be used to prevent OOM for large
datasets, but will incur more overhead in time. Only use it if
you run into OOM failures.
metric_constraints: list, default=[] | The list of metric constraints.
Each element in this list is a 3-tuple, which shall be expressed
in the following format: the first element of the 3-tuple is the name of the
metric, the second element is the inequality sign chosen from ">=" and "<=",
and the third element is the constraint value. E.g., `('val_loss', '<=', 0.1)`.
Note that all the metric names in metric_constraints need to be reported via
the metrics_to_log dictionary returned by a customized metric function.
The customized metric function shall be provided via the `metric` key word
argument of the fit() function or the automl constructor.
Find an example in the 4th constraint type in this [doc](https://microsoft.github.io/FLAML/docs/Use-Cases/Task-Oriented-AutoML#constraint).
If `pred_time_limit` is provided as one of keyword arguments to fit() function or
the automl constructor, flaml will automatically (and under the hood)
add it as an additional element in the metric_constraints. Essentially 'pred_time_limit'
specifies a constraint about the prediction latency constraint in seconds.
```python
fit_kwargs_by_estimator = {
"transformer": {
"output_dir": "test/data/output/",
"ckpt_per_epoch": 1,
"fp16": False,
}
}
```
"""
self._track_iter = 0
@@ -585,6 +707,11 @@ class AutoML(BaseEstimator):
settings["min_sample_size"] = settings.get("min_sample_size", MIN_SAMPLE_TRAIN)
settings["use_ray"] = settings.get("use_ray", False)
settings["metric_constraints"] = settings.get("metric_constraints", [])
settings["fit_kwargs_by_estimator"] = settings.get(
"fit_kwargs_by_estimator", {}
)
settings["custom_hp"] = settings.get("custom_hp", {})
self._estimator_type = (
"classifier" if settings["task"] in CLASSIFICATION else "regressor"
)
@@ -919,7 +1046,7 @@ class AutoML(BaseEstimator):
else:
raise ValueError("either X_train+y_train or dataframe+label are required")
# check the validity of input dimensions under the nlp mode
# check the validity of input dimensions for NLP tasks, so need to check _is_nlp_task not estimator
if _is_nlp_task(self._state.task):
from .nlp.utils import is_a_list_of_str
@@ -966,7 +1093,10 @@ class AutoML(BaseEstimator):
X, y, self._state.task
)
self._label_transformer = self._transformer.label_transformer
self._sample_weight_full = self._state.fit_kwargs.get("sample_weight")
self._sample_weight_full = self._state.fit_kwargs.get(
"sample_weight"
) # NOTE: _validate_data is before,
if X_val is not None and y_val is not None:
assert (
isinstance(X_val, np.ndarray)
@@ -1026,7 +1156,8 @@ class AutoML(BaseEstimator):
if (
self._state.task in CLASSIFICATION
and self._auto_augment
and self._state.fit_kwargs.get("sample_weight") is None
and self._state.fit_kwargs.get("sample_weight")
is None # NOTE: _prepare_data is before
and self._split_type in ["stratified", "uniform"]
and self._state.task != TOKENCLASSIFICATION
):
@@ -1068,7 +1199,9 @@ class AutoML(BaseEstimator):
self._sample_weight_full,
random_state=RANDOM_SEED,
)
self._state.fit_kwargs["sample_weight"] = self._state.sample_weight_all
self._state.fit_kwargs[
"sample_weight"
] = self._state.sample_weight_all # NOTE: _prepare_data is before
else:
X_train_all, y_train_all = shuffle(
X_train_all, y_train_all, random_state=RANDOM_SEED
@@ -1085,7 +1218,9 @@ class AutoML(BaseEstimator):
if self._split_type == "time":
if self._state.task in TS_FORECAST:
num_samples = X_train_all.shape[0]
period = self._state.fit_kwargs["period"]
period = self._state.fit_kwargs[
"period"
] # NOTE: _prepare_data is before
assert (
period < num_samples
), f"period={period}>#examples={num_samples}"
@@ -1095,18 +1230,24 @@ class AutoML(BaseEstimator):
X_val = X_train_all[split_idx:]
y_val = y_train_all[split_idx:]
else:
if "sample_weight" in self._state.fit_kwargs:
if (
"sample_weight" in self._state.fit_kwargs
): # NOTE: _prepare_data is before
(
X_train,
X_val,
y_train,
y_val,
self._state.fit_kwargs["sample_weight"],
self._state.fit_kwargs[
"sample_weight"
], # NOTE: _prepare_data is before
self._state.weight_val,
) = train_test_split(
X_train_all,
y_train_all,
self._state.fit_kwargs["sample_weight"],
self._state.fit_kwargs[
"sample_weight"
], # NOTE: _prepare_data is before
test_size=split_ratio,
shuffle=False,
)
@@ -1147,7 +1288,9 @@ class AutoML(BaseEstimator):
X_rest = X_train_all.iloc[rest] if self._df else X_train_all[rest]
y_rest = y_train_all[rest]
stratify = y_rest if self._split_type == "stratified" else None
if "sample_weight" in self._state.fit_kwargs:
if (
"sample_weight" in self._state.fit_kwargs
): # NOTE: _prepare_data is before
(
X_train,
X_val,
@@ -1158,13 +1301,19 @@ class AutoML(BaseEstimator):
) = train_test_split(
X_rest,
y_rest,
self._state.fit_kwargs["sample_weight"][rest],
self._state.fit_kwargs["sample_weight"][
rest
], # NOTE: _prepare_data is before
test_size=split_ratio,
random_state=RANDOM_SEED,
)
weight1 = self._state.fit_kwargs["sample_weight"][first]
weight1 = self._state.fit_kwargs["sample_weight"][
first
] # NOTE: _prepare_data is before
self._state.weight_val = concat(weight1, weight_val)
self._state.fit_kwargs["sample_weight"] = concat(
self._state.fit_kwargs[
"sample_weight"
] = concat( # NOTE: _prepare_data is before
weight1, weight_train
)
else:
@@ -1188,18 +1337,24 @@ class AutoML(BaseEstimator):
else np.concatenate([label_set, y_val])
)
elif self._state.task in REGRESSION:
if "sample_weight" in self._state.fit_kwargs:
if (
"sample_weight" in self._state.fit_kwargs
): # NOTE: _prepare_data is before
(
X_train,
X_val,
y_train,
y_val,
self._state.fit_kwargs["sample_weight"],
self._state.fit_kwargs[
"sample_weight"
], # NOTE: _prepare_data is before
self._state.weight_val,
) = train_test_split(
X_train_all,
y_train_all,
self._state.fit_kwargs["sample_weight"],
self._state.fit_kwargs[
"sample_weight"
], # NOTE: _prepare_data is before
test_size=split_ratio,
random_state=RANDOM_SEED,
)
@@ -1245,7 +1400,9 @@ class AutoML(BaseEstimator):
elif self._split_type == "time":
# logger.info("Using TimeSeriesSplit")
if self._state.task in TS_FORECAST:
period = self._state.fit_kwargs["period"]
period = self._state.fit_kwargs[
"period"
] # NOTE: _prepare_data is before
if period * (n_splits + 1) > y_train_all.size:
n_splits = int(y_train_all.size / period - 1)
assert n_splits >= 2, (
@@ -1324,6 +1481,8 @@ class AutoML(BaseEstimator):
train_full=False,
record_id=-1,
auto_augment=None,
custom_hp=None,
fit_kwargs_by_estimator=None,
**fit_kwargs,
):
"""Retrain from log file.
@@ -1381,6 +1540,34 @@ class AutoML(BaseEstimator):
when `record_id >= 0`, `time_budget` will be ignored.
auto_augment: boolean, default=True | Whether to automatically
augment rare classes.
custom_hp: dict, default=None | The custom search space specified by user
Each key is the estimator name, each value is a dict of the custom search space for that estimator. Notice the
domain of the custom search space can either be a value or a sample.Domain object.
```python
custom_hp = {
"transformer_ms": {
"model_path": {
"domain": "albert-base-v2",
},
"learning_rate": {
"domain": tune.choice([1e-4, 1e-5]),
}
}
}
fit_kwargs_by_estimator: dict, default=None | The user specified keywords arguments, grouped by estimator name.
e.g.,
```python
fit_kwargs_by_estimator = {
"transformer": {
"output_dir": "test/data/output/",
"ckpt_per_epoch": 1,
"fp16": False,
}
}
```
**fit_kwargs: Other key word arguments to pass to fit() function of
the searched learners, such as sample_weight.
"""
@@ -1396,6 +1583,10 @@ class AutoML(BaseEstimator):
self._estimator_type = "classifier" if task in CLASSIFICATION else "regressor"
self._state.fit_kwargs = fit_kwargs
self._state.custom_hp = custom_hp or self._settings.get("custom_hp")
self._state.fit_kwargs_by_estimator = (
fit_kwargs_by_estimator or self._settings.get("fit_kwargs_by_estimator")
)
self._validate_data(X_train, y_train, dataframe, label, groups=groups)
logger.info("log file name {}".format(log_file_name))
@@ -1443,6 +1634,16 @@ class AutoML(BaseEstimator):
best_config = best.config
sample_size = len(self._y_train_all) if train_full else best.sample_size
this_estimator_kwargs = self._state.fit_kwargs_by_estimator.get(best_estimator)
if this_estimator_kwargs:
this_estimator_kwargs = (
this_estimator_kwargs.copy()
) # make another shallow copy of the value (a dict obj), so user's fit_kwargs_by_estimator won't be updated
this_estimator_kwargs.update(self._state.fit_kwargs)
self._state.fit_kwargs_by_estimator[best_estimator] = this_estimator_kwargs
else:
self._state.fit_kwargs_by_estimator[best_estimator] = self._state.fit_kwargs
logger.info(
"estimator = {}, config = {}, #training instances = {}".format(
best_estimator, best_config, sample_size
@@ -1498,8 +1699,10 @@ class AutoML(BaseEstimator):
elif self._state.task in TS_FORECAST:
assert split_type in ["auto", "time"]
self._split_type = "time"
assert isinstance(
self._state.fit_kwargs.get("period"), int
self._state.fit_kwargs.get("period"),
int, # NOTE: _decide_split_type is before
), f"missing a required integer 'period' for '{TS_FORECAST}' task."
elif self._state.task == "rank":
assert (
@@ -1782,6 +1985,8 @@ class AutoML(BaseEstimator):
min_sample_size=None,
use_ray=None,
metric_constraints=None,
custom_hp=None,
fit_kwargs_by_estimator=None,
**fit_kwargs,
):
"""Find a model for a given task.
@@ -1809,6 +2014,7 @@ class AutoML(BaseEstimator):
'mape'. Default is 'auto'.
If passing a customized metric function, the function needs to
have the following signature:
```python
def custom_metric(
X_test, y_test, estimator, labels,
@@ -1819,6 +2025,7 @@ class AutoML(BaseEstimator):
```
which returns a float number as the minimization objective,
and a dictionary as the metrics to log. E.g.,
```python
def custom_metric(
X_val, y_val, estimator, labels,
@@ -1975,6 +2182,34 @@ class AutoML(BaseEstimator):
the automl constructor, flaml will automatically (and under the hood)
add it as an additional element in the metric_constraints. Essentially 'pred_time_limit'
specifies a constraint about the prediction latency constraint in seconds.
custom_hp: dict, default=None | The custom search space specified by user
Each key is the estimator name, each value is a dict of the custom search space for that estimator. Notice the
            domain of the custom search space can either be a value or a sample.Domain object.
```python
custom_hp = {
"transformer_ms": {
"model_path": {
"domain": "albert-base-v2",
},
"learning_rate": {
"domain": tune.choice([1e-4, 1e-5]),
}
}
            }
            ```
fit_kwargs_by_estimator: dict, default=None | The user specified keywords arguments, grouped by estimator name.
e.g.,
```python
fit_kwargs_by_estimator = {
"transformer": {
"output_dir": "test/data/output/",
"ckpt_per_epoch": 1,
"fp16": False,
}
}
```
            **fit_kwargs: Other keyword arguments to pass to fit() function of
the searched learners, such as sample_weight. Include:
period: int | forecast horizon for ts_forecast tasks.
@@ -2085,6 +2320,13 @@ class AutoML(BaseEstimator):
self._state.log_training_metric = log_training_metric
self._state.fit_kwargs = fit_kwargs
custom_hp = custom_hp or self._settings.get("custom_hp")
fit_kwargs_by_estimator = fit_kwargs_by_estimator or self._settings.get(
"fit_kwargs_by_estimator"
)
self._state.fit_kwargs_by_estimator = (
fit_kwargs_by_estimator.copy()
) # shallow copy of fit_kwargs_by_estimator
self._state.weight_val = sample_weight_val
self._validate_data(
@@ -2268,15 +2510,35 @@ class AutoML(BaseEstimator):
pass
starting_points = {} if starting_points == "static" else starting_points
for estimator_name in estimator_list:
estimator_class = self._state.learner_classes[estimator_name]
estimator_class.init()
this_estimator_kwargs = self._state.fit_kwargs_by_estimator.get(
estimator_name
)
if this_estimator_kwargs:
# make another shallow copy of the value (a dict obj), so user's fit_kwargs_by_estimator won't be updated
this_estimator_kwargs = this_estimator_kwargs.copy()
this_estimator_kwargs.update(
self._state.fit_kwargs
) # update the shallow copy
self._state.fit_kwargs_by_estimator[
estimator_name
] = this_estimator_kwargs # set self._state.fit_kwargs_by_estimator[estimator_name] to the update, so only self._state.fit_kwargs_by_estimator will be updated
else:
self._state.fit_kwargs_by_estimator[
estimator_name
] = self._state.fit_kwargs
self._search_states[estimator_name] = SearchState(
learner_class=estimator_class,
data_size=self._state.data_size,
task=self._state.task,
starting_point=starting_points.get(estimator_name),
period=self._state.fit_kwargs.get("period"),
period=self._state.fit_kwargs.get("period"), # NOTE: this is after
custom_hp=custom_hp and custom_hp.get(estimator_name),
max_iter=max_iter,
)
logger.info("List of ML learners in AutoML Run: {}".format(estimator_list))
self.estimator_list = estimator_list
@@ -2332,7 +2594,11 @@ class AutoML(BaseEstimator):
del self._X_train_all, self._y_train_all, self._state.kf
del self._state.X_train, self._state.X_train_all, self._state.X_val
del self._state.y_train, self._state.y_train_all, self._state.y_val
del self._sample_weight_full, self._state.fit_kwargs
del (
self._sample_weight_full,
self._state.fit_kwargs_by_estimator,
self._state.fit_kwargs,
) # NOTE: this is after
del self._state.groups, self._state.groups_all, self._state.groups_val
logger.setLevel(old_level)
@@ -2915,13 +3181,18 @@ class AutoML(BaseEstimator):
n_jobs=self._state.n_jobs,
passthrough=passthrough,
)
if self._sample_weight_full is not None:
self._state.fit_kwargs["sample_weight"] = self._sample_weight_full
sample_weight_dict = (
(self._sample_weight_full is not None)
and {"sample_weight": self._sample_weight_full}
or {}
)
for e in estimators:
e[1].__class__.init()
try:
stacker.fit(
self._X_train_all, self._y_train_all, **self._state.fit_kwargs
self._X_train_all,
self._y_train_all,
**sample_weight_dict, # NOTE: _search is after
)
logger.info(f"ensemble: {stacker}")
self._trained_estimator = stacker
@@ -2940,7 +3211,7 @@ class AutoML(BaseEstimator):
stacker.fit(
self._X_train_all,
self._y_train_all,
**self._state.fit_kwargs,
**sample_weight_dict, # NOTE: _search is after
)
logger.info(f"ensemble: {stacker}")
self._trained_estimator = stacker

View File

@@ -1,5 +1,8 @@
{
"version": "default",
"version": "1.0.2",
"meta_feature_names": [
"NumberOfInstances","NumberOfFeatures","NumberOfClasses","PercentageOfNumericFeatures"
],
"portfolio": [
{
"class": "lgbm",

View File

@@ -1,5 +1,8 @@
{
"version": "default",
"version": "1.0.2",
"meta_feature_names": [
"NumberOfInstances","NumberOfFeatures","NumberOfClasses","PercentageOfNumericFeatures"
],
"portfolio": [
{
"class": "lgbm",

View File

@@ -1,5 +1,8 @@
{
"version": "default",
"version": "1.0.2",
"meta_feature_names": [
"NumberOfInstances","NumberOfFeatures","NumberOfClasses","PercentageOfNumericFeatures"
],
"portfolio": [
{
"class": "lgbm",

View File

@@ -1,5 +1,8 @@
{
"version": "default",
"version": "1.0.2",
"meta_feature_names": [
"NumberOfInstances","NumberOfFeatures","NumberOfClasses","PercentageOfNumericFeatures"
],
"portfolio": [
{
"class": "extra_tree",

View File

@@ -1,5 +1,8 @@
{
"version": "default",
"version": "1.0.2",
"meta_feature_names": [
"NumberOfInstances","NumberOfFeatures","NumberOfClasses","PercentageOfNumericFeatures"
],
"portfolio": [
{
"class": "extra_tree",

View File

@@ -1,5 +1,8 @@
{
"version": "default",
"version": "1.0.2",
"meta_feature_names": [
"NumberOfInstances","NumberOfFeatures","NumberOfClasses","PercentageOfNumericFeatures"
],
"portfolio": [
{
"class": "extra_tree",

View File

@@ -1,5 +1,8 @@
{
"version": "default",
"version": "1.0.2",
"meta_feature_names": [
"NumberOfInstances","NumberOfFeatures","NumberOfClasses","PercentageOfNumericFeatures"
],
"portfolio": [
{
"class": "lgbm",

View File

@@ -1,5 +1,8 @@
{
"version": "default",
"version": "1.0.2",
"meta_feature_names": [
"NumberOfInstances","NumberOfFeatures","NumberOfClasses","PercentageOfNumericFeatures"
],
"portfolio": [
{
"class": "lgbm",

View File

@@ -1,5 +1,8 @@
{
"version": "default",
"version": "1.0.2",
"meta_feature_names": [
"NumberOfInstances","NumberOfFeatures","NumberOfClasses","PercentageOfNumericFeatures"
],
"portfolio": [
{
"class": "lgbm",

View File

@@ -113,8 +113,11 @@ def serialize(configs, regret, meta_features, output_file, config_path):
)
portfolio = [load_json(config_path.joinpath(m + ".json")) for m in configs]
regret = regret.loc[configs]
from flaml import __version__
meta_predictor = {
"version": "default",
"version": __version__,
"meta_feature_names": list(meta_features.columns),
"portfolio": portfolio,
"preprocessing": proc,
"neighbors": [

View File

@@ -1,5 +1,8 @@
{
"version": "default",
"version": "1.0.2",
"meta_feature_names": [
"NumberOfInstances","NumberOfFeatures","NumberOfClasses","PercentageOfNumericFeatures"
],
"portfolio": [
{
"class": "rf",

View File

@@ -1,5 +1,8 @@
{
"version": "default",
"version": "1.0.2",
"meta_feature_names": [
"NumberOfInstances","NumberOfFeatures","NumberOfClasses","PercentageOfNumericFeatures"
],
"portfolio": [
{
"class": "rf",

View File

@@ -1,5 +1,8 @@
{
"version": "default",
"version": "1.0.2",
"meta_feature_names": [
"NumberOfInstances","NumberOfFeatures","NumberOfClasses","PercentageOfNumericFeatures"
],
"portfolio": [
{
"class": "rf",

View File

@@ -11,13 +11,27 @@ logger = logging.getLogger(__name__)
CONFIG_PREDICTORS = {}
def meta_feature(task, X_train, y_train):
is_classification = task in CLASSIFICATION
def meta_feature(task, X_train, y_train, meta_feature_names):
this_feature = []
n_row = X_train.shape[0]
n_feat = X_train.shape[1]
n_class = len(np.unique(y_train)) if is_classification else 0
percent_num = X_train.select_dtypes(include=np.number).shape[1] / n_feat
return (n_row, n_feat, n_class, percent_num)
is_classification = task in CLASSIFICATION
for each_feature_name in meta_feature_names:
if each_feature_name == "NumberOfInstances":
this_feature.append(n_row)
elif each_feature_name == "NumberOfFeatures":
this_feature.append(n_feat)
elif each_feature_name == "NumberOfClasses":
this_feature.append(len(np.unique(y_train)) if is_classification else 0)
elif each_feature_name == "PercentageOfNumericFeatures":
this_feature.append(
X_train.select_dtypes(include=np.number).shape[1] / n_feat
)
else:
raise ValueError("Feature {} not implemented. ".format(each_feature_name))
return this_feature
def load_config_predictor(estimator_name, task, location=None):
@@ -53,9 +67,15 @@ def suggest_config(task, X, y, estimator_or_predictor, location=None, k=None):
if isinstance(estimator_or_predictor, str)
else estimator_or_predictor
)
assert predictor["version"] == "default"
from flaml import __version__
older_version = "1.0.2"
# TODO: update older_version when the newer code can no longer handle the older version json file
assert __version__ >= predictor["version"] >= older_version
prep = predictor["preprocessing"]
feature = meta_feature(task, X, y)
feature = meta_feature(
task, X_train=X, y_train=y, meta_feature_names=predictor["meta_feature_names"]
)
feature = (np.array(feature) - np.array(prep["center"])) / np.array(prep["scale"])
neighbors = predictor["neighbors"]
nn = NearestNeighbors(n_neighbors=1)
@@ -211,13 +231,16 @@ def preprocess_and_suggest_hyperparams(
model_class = get_estimator_class(task, estimator)
hyperparams = config["hyperparameters"]
model = model_class(task=task, **hyperparams)
estimator_class = model.estimator_class
X = model._preprocess(X)
hyperparams = hyperparams and model.params
if model.estimator_class is None:
return hyperparams, model_class, X, y, None, None
else:
estimator_class = model.estimator_class
X = model._preprocess(X)
hyperparams = hyperparams and model.params
class AutoMLTransformer:
def transform(self, X):
return model._preprocess(dt.transform(X))
class AutoMLTransformer:
def transform(self, X):
return model._preprocess(dt.transform(X))
transformer = AutoMLTransformer()
return hyperparams, estimator_class, X, y, transformer, dt.label_transformer
transformer = AutoMLTransformer()
return hyperparams, estimator_class, X, y, transformer, dt.label_transformer

View File

@@ -1,5 +1,8 @@
{
"version": "default",
"version": "1.0.2",
"meta_feature_names": [
"NumberOfInstances","NumberOfFeatures","NumberOfClasses","PercentageOfNumericFeatures"
],
"portfolio": [
{
"class": "xgb_limitdepth",

View File

@@ -1,5 +1,8 @@
{
"version": "default",
"version": "1.0.2",
"meta_feature_names": [
"NumberOfInstances","NumberOfFeatures","NumberOfClasses","PercentageOfNumericFeatures"
],
"portfolio": [
{
"class": "xgb_limitdepth",

View File

@@ -1,5 +1,8 @@
{
"version": "default",
"version": "1.0.2",
"meta_feature_names": [
"NumberOfInstances","NumberOfFeatures","NumberOfClasses","PercentageOfNumericFeatures"
],
"portfolio": [
{
"class": "xgb_limitdepth",

View File

@@ -1,5 +1,8 @@
{
"version": "default",
"version": "1.0.2",
"meta_feature_names": [
"NumberOfInstances","NumberOfFeatures","NumberOfClasses","PercentageOfNumericFeatures"
],
"portfolio": [
{
"class": "xgboost",

View File

@@ -1,5 +1,8 @@
{
"version": "default",
"version": "1.0.2",
"meta_feature_names": [
"NumberOfInstances","NumberOfFeatures","NumberOfClasses","PercentageOfNumericFeatures"
],
"portfolio": [
{
"class": "xgboost",

View File

@@ -1,5 +1,8 @@
{
"version": "default",
"version": "1.0.2",
"meta_feature_names": [
"NumberOfInstances","NumberOfFeatures","NumberOfClasses","PercentageOfNumericFeatures"
],
"portfolio": [
{
"class": "xgboost",

View File

@@ -37,6 +37,7 @@ from .model import (
ARIMA,
SARIMAX,
TransformersEstimator,
TransformersEstimatorModelSelection,
)
from .data import CLASSIFICATION, group_counts, TS_FORECAST, TS_VALUE_COL
import logging
@@ -121,6 +122,8 @@ def get_estimator_class(task, estimator_name):
estimator_class = SARIMAX
elif estimator_name == "transformer":
estimator_class = TransformersEstimator
elif estimator_name == "transformer_ms":
estimator_class = TransformersEstimatorModelSelection
else:
raise ValueError(
estimator_name + " is not a built-in learner. "
@@ -536,10 +539,6 @@ def evaluate_model_CV(
else:
metric = total_metric / n
pred_time /= n
# budget -= time.time() - start_time
# if val_loss < best_val_loss and budget > budget_per_train:
# estimator.cleanup()
# estimator.fit(X_train_all, y_train_all, budget, **fit_kwargs)
return val_loss, metric, train_time, pred_time
@@ -605,6 +604,10 @@ def compute_estimator(
log_training_metric=log_training_metric,
fit_kwargs=fit_kwargs,
)
if isinstance(estimator, TransformersEstimator):
del fit_kwargs["metric"], fit_kwargs["X_val"], fit_kwargs["y_val"]
return estimator, val_loss, metric_for_logging, train_time, pred_time

View File

@@ -354,10 +354,14 @@ class TransformersEstimator(BaseEstimator):
import uuid
self.trial_id = str(uuid.uuid1().hex)[:8]
if task in NLG_TASKS:
from transformers import Seq2SeqTrainingArguments as TrainingArguments
if task not in NLG_TASKS: # TODO: not in NLG_TASKS
from .nlp.huggingface.training_args import (
TrainingArgumentsForAuto as TrainingArguments,
)
else:
from transformers import TrainingArguments
from .nlp.huggingface.training_args import (
Seq2SeqTrainingArgumentsForAuto as TrainingArguments,
)
self._TrainingArguments = TrainingArguments
@staticmethod
@@ -403,32 +407,72 @@ class TransformersEstimator(BaseEstimator):
return search_space_dict
def _init_hf_args(self, automl_fit_kwargs: dict = None):
from .nlp.utils import HFArgs
@property
def checkpoint_freq(self):
return (
int(
min(self._training_args.num_train_epochs, 1)
* len(self._X_train)
/ self._training_args.per_device_train_batch_size
/ self._training_args.ckpt_per_epoch
)
+ 1
)
hf_args = HFArgs(task=self._task)
fit_kwargs = automl_fit_kwargs.get("hf_args", None)
if fit_kwargs:
for key, val in fit_kwargs.items():
assert (
key in hf_args.__dict__
), "The specified key {} is not in the argument list of flaml.nlp.utils::HFArgs".format(
key
@property
def fp16(self):
return self._kwargs.get("gpu_per_trial") and self._training_args.fp16
@property
def no_cuda(self):
return not self._kwargs.get("gpu_per_trial")
def _set_training_args(self, **kwargs):
from .nlp.utils import date_str, Counter
for (key, val) in kwargs.items():
assert key not in self.params, (
"Since {} is in the search space, it cannot exist in 'custom_fit_kwargs' at the same time."
"If you need to fix the value of {} to {}, the only way is to add a single-value domain in the search "
"space by adding:\n '{}': {{ 'domain': {} }} to 'custom_hp'. For example:"
'automl_settings["custom_hp"] = {{ "transformer": {{ "model_path": {{ "domain" : '
'"google/electra-small-discriminator" }} }} }}'.format(
key, key, val, key, val
)
setattr(hf_args, key, val)
self.hf_args = hf_args
)
def _update_hf_args(self, automl_pred_kwargs: dict = None):
if automl_pred_kwargs:
hf_args = automl_pred_kwargs.get("hf_args")
if hf_args:
for key, val in hf_args.items():
assert (
key in self.hf_args.__dict__
), "The specified key {} is not in the argument list of flaml.nlp.utils::HFArgs".format(
key
)
setattr(self.hf_args, key, val)
"""
        If the user has specified any custom args for TrainingArguments, update these arguments
"""
self._training_args = self._TrainingArguments(**kwargs)
"""
Update the attributes in TrainingArguments with self.params values
"""
for key, val in self.params.items():
if hasattr(self._training_args, key):
setattr(self._training_args, key, val)
"""
Update the attributes in TrainingArguments that depends on the values of self.params
"""
local_dir = os.path.join(
self._training_args.output_dir, "train_{}".format(date_str())
)
if self._use_ray is True:
import ray
self._training_args.output_dir = ray.tune.get_trial_dir()
else:
self._training_args.output_dir = Counter.get_trial_fold_name(
local_dir, self.params, self.trial_id
)
self._training_args.eval_steps = (
self._training_args.logging_steps
) = self._training_args.saving_steps = self.checkpoint_freq
self._training_args.fp16 = self.fp16
self._training_args.no_cuda = self.no_cuda
def _preprocess(self, X, y=None, **kwargs):
from .nlp.utils import tokenize_text, is_a_list_of_str
@@ -441,95 +485,121 @@ class TransformersEstimator(BaseEstimator):
X=X,
Y=y,
task=self._task,
hf_args=self.hf_args,
tokenizer=self._tokenizer,
hf_args=self._training_args,
tokenizer=self.tokenizer,
)
else:
return X, None
def _model_init(self, num_labels, per_model_config):
def _model_init(self):
from .nlp.utils import load_model
this_model = load_model(
checkpoint_path=self.hf_args.model_path,
checkpoint_path=self._training_args.model_path,
task=self._task,
num_labels=num_labels,
per_model_config=per_model_config,
num_labels=self.num_labels,
)
return this_model
def _get_training_args(self, local_rank=-1):
import transformers
def preprocess_data(self, X, y):
from datasets import Dataset
if self._task in NLG_TASKS:
self._training_args_config["predict_with_generate"] = True
if (self._task not in NLG_TASKS) and (self._task != TOKENCLASSIFICATION):
processed_X, _ = self._preprocess(X=X, **self._kwargs)
processed_y = y
else:
processed_X, processed_y = self._preprocess(X=X, y=y, **self._kwargs)
if transformers.__version__.startswith("3"):
training_args = self._TrainingArguments(
report_to=[],
output_dir=self._trial_dir,
do_train=True,
do_eval=True,
eval_steps=self._ckpt_freq,
evaluate_during_training=True,
save_steps=self._ckpt_freq,
logging_steps=self._ckpt_freq,
save_total_limit=0,
metric_for_best_model="loss",
fp16=self.hf_args.fp16
if self._kwargs.get("gpu_per_trial") > 0
else False,
no_cuda=True if self._kwargs.get("gpu_per_trial") == 0 else False,
local_rank=local_rank,
per_device_eval_batch_size=self.hf_args.per_device_eval_batch_size,
**self._training_args_config,
processed_dataset = Dataset.from_pandas(
TransformersEstimator._join(processed_X, processed_y)
)
return processed_dataset, processed_X, processed_y
@property
def num_labels(self):
from .data import SEQCLASSIFICATION, SEQREGRESSION, TOKENCLASSIFICATION
if self._task == SEQREGRESSION:
return 1
elif self._task == SEQCLASSIFICATION:
return len(set(self._y_train))
elif self._task == TOKENCLASSIFICATION:
return len(set([a for b in self._y_train.tolist() for a in b]))
else:
return None
@property
def tokenizer(self):
from transformers import AutoTokenizer
if self._task == SUMMARIZATION:
return AutoTokenizer.from_pretrained(
pretrained_model_name_or_path=self._training_args.model_path,
cache_dir=None,
use_fast=True,
revision="main",
use_auth_token=None,
)
else:
from transformers import IntervalStrategy
training_args = self._TrainingArguments(
report_to=[],
output_dir=self._trial_dir,
do_train=True,
do_eval=True,
eval_steps=self._ckpt_freq,
logging_steps=self._ckpt_freq,
evaluation_strategy=IntervalStrategy.STEPS,
save_steps=self._ckpt_freq,
save_total_limit=0,
metric_for_best_model="loss",
fp16=self.hf_args.fp16
if self._kwargs.get("gpu_per_trial") > 0
else False,
local_rank=local_rank,
no_cuda=True if self._kwargs.get("gpu_per_trial") == 0 else False,
per_device_eval_batch_size=self.hf_args.per_device_eval_batch_size,
**self._training_args_config,
return AutoTokenizer.from_pretrained(
self._training_args.model_path, use_fast=True
)
return training_args
def fit(self, X_train: DataFrame, y_train: Series, budget=None, **kwargs):
@property
def data_collator(self):
from .nlp.huggingface.data_collator import DataCollatorForAuto
return (
DataCollatorForAuto(
tokenizer=self.tokenizer,
pad_to_multiple_of=8 if self._training_args.fp16 else None,
)
if self._task == MULTICHOICECLASSIFICATION
else None
)
def fit(
self,
X_train: DataFrame,
y_train: Series,
budget=None,
X_val=None,
y_val=None,
gpu_per_trial=None,
metric=None,
**kwargs,
):
import transformers
transformers.logging.set_verbosity_error()
from transformers import TrainerCallback
from transformers.trainer_utils import set_seed
from datasets import Dataset
from .nlp.utils import (
get_num_labels,
separate_config,
compute_checkpoint_freq,
Counter,
date_str,
)
from .nlp.huggingface.trainer import TrainerForAuto
from .nlp.huggingface.data_collator import DataCollatorForAuto
from .nlp.utils import get_auto_tokenizer
try:
from ray.tune import is_session_enabled
self._use_ray = is_session_enabled()
except ImportError:
self._use_ray = False
this_params = self.params
self._kwargs = kwargs
self._X_train, self._y_train = X_train, y_train
self._set_training_args(**kwargs)
train_dataset, self._X_train, self._y_train = self.preprocess_data(
X_train, y_train
)
if X_val is not None:
eval_dataset, self._X_val, self._y_val = self.preprocess_data(X_val, y_val)
else:
eval_dataset, self._X_val, self._y_val = None, None, None
set_seed(self.params.get("seed", self._training_args.seed))
self._metric = metric
class EarlyStoppingCallbackForAuto(TrainerCallback):
def on_train_begin(self, args, state, control, **callback_kwargs):
@@ -562,96 +632,13 @@ class TransformersEstimator(BaseEstimator):
control.should_save = True
control.should_evaluate = True
set_seed(self.params.get("seed", self._TrainingArguments.seed))
self._init_hf_args(kwargs)
self._tokenizer = get_auto_tokenizer(
self.hf_args.tokenizer_model_path
if self.hf_args.tokenizer_model_path
else self.hf_args.model_path,
self._task,
)
self._metric = kwargs["metric"]
try:
from ray.tune import is_session_enabled
self.use_ray = is_session_enabled()
except ImportError:
self.use_ray = False
X_val = kwargs.get("X_val")
y_val = kwargs.get("y_val")
if (self._task not in NLG_TASKS) and (self._task != TOKENCLASSIFICATION):
self._X_train, _ = self._preprocess(X=X_train, **kwargs)
self._y_train = y_train
else:
self._X_train, self._y_train = self._preprocess(
X=X_train, y=y_train, **kwargs
)
train_dataset = Dataset.from_pandas(
TransformersEstimator._join(self._X_train, self._y_train)
)
if X_val is not None:
if (self._task not in NLG_TASKS) and (self._task != TOKENCLASSIFICATION):
self._X_val, _ = self._preprocess(X=X_val, **kwargs)
self._y_val = y_val
else:
self._X_val, self._y_val = self._preprocess(X=X_val, y=y_val, **kwargs)
eval_dataset = Dataset.from_pandas(
TransformersEstimator._join(self._X_val, self._y_val)
)
else:
eval_dataset = None
num_labels = get_num_labels(self._task, self._y_train)
self._training_args_config, self._per_model_config = separate_config(
self.params, self._task
)
self._ckpt_freq = compute_checkpoint_freq(
train_data_size=len(self._X_train),
hf_args=self.hf_args,
num_train_epochs=self._training_args_config.get(
"num_train_epochs", self._TrainingArguments.num_train_epochs
),
batch_size=self._training_args_config.get(
"per_device_train_batch_size",
self._TrainingArguments.per_device_train_batch_size,
),
)
local_dir = os.path.join(self.hf_args.output_dir, "train_{}".format(date_str()))
if self.use_ray is True:
import ray
self._trial_dir = ray.tune.get_trial_dir()
else:
# if self.params = {}, don't include configuration in trial fold name
self._trial_dir = Counter.get_trial_fold_name(
local_dir, self.params, self.trial_id
)
self._kwargs = kwargs
self._num_labels = num_labels
training_args = self._get_training_args(local_rank=-1)
self._trainer = TrainerForAuto(
args=training_args,
model_init=partial(self._model_init, num_labels, self._per_model_config),
args=self._training_args,
model_init=self._model_init,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
tokenizer=self._tokenizer,
data_collator=DataCollatorForAuto(
tokenizer=self._tokenizer,
pad_to_multiple_of=8 if training_args.fp16 else None,
)
if self._task == MULTICHOICECLASSIFICATION
else None,
tokenizer=self.tokenizer,
data_collator=self.data_collator,
compute_metrics=self._compute_metrics_by_dataset_name,
callbacks=[EarlyStoppingCallbackForAuto],
)
@@ -659,17 +646,18 @@ class TransformersEstimator(BaseEstimator):
if self._task in NLG_TASKS:
setattr(self._trainer, "_is_seq2seq", True)
gpu_per_trial = kwargs.get("gpu_per_trial", None)
"""
When not using ray for tuning, set the limit of CUDA_VISIBLE_DEVICES to math.ceil(gpu_per_trial),
so each estimator does not see all the GPUs
"""
if gpu_per_trial:
if gpu_per_trial is not None:
tmp_cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "")
self._trainer.args._n_gpu = gpu_per_trial
# if gpu_per_trial == 0:
# os.environ["CUDA_VISIBLE_DEVICES"] = ""
if tmp_cuda_visible_devices.count(",") != math.ceil(gpu_per_trial) - 1:
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
[str(x) for x in range(math.ceil(gpu_per_trial))]
)
@@ -679,7 +667,7 @@ class TransformersEstimator(BaseEstimator):
start_time = time.time()
self._trainer.train()
if gpu_per_trial:
if gpu_per_trial is not None:
os.environ["CUDA_VISIBLE_DEVICES"] = tmp_cuda_visible_devices
self.params[self.ITER_HP] = self._trainer.state.global_step
@@ -695,10 +683,11 @@ class TransformersEstimator(BaseEstimator):
)
]
self._trainer = None
return time.time() - start_time
def _delete_one_ckpt(self, ckpt_location):
if self.use_ray is False:
if self._use_ray is False:
try:
shutil.rmtree(ckpt_location)
except FileNotFoundError:
@@ -743,11 +732,11 @@ class TransformersEstimator(BaseEstimator):
if self._task in NLG_TASKS:
if isinstance(predictions, tuple):
predictions = np.argmax(predictions[0], axis=2)
decoded_preds = self._tokenizer.batch_decode(
decoded_preds = self.tokenizer.batch_decode(
predictions, skip_special_tokens=True
)
labels = np.where(labels != -100, labels, self._tokenizer.pad_token_id)
decoded_labels = self._tokenizer.batch_decode(
labels = np.where(labels != -100, labels, self.tokenizer.pad_token_id)
decoded_labels = self.tokenizer.batch_decode(
labels, skip_special_tokens=True
)
predictions, labels = postprocess_text(decoded_preds, decoded_labels)
@@ -774,40 +763,41 @@ class TransformersEstimator(BaseEstimator):
y_train=self._y_train,
)
metric_dict["automl_metric"] = loss
return metric_dict
def _init_model_for_predict(self):
from .nlp.huggingface.trainer import TrainerForAuto
from .nlp.huggingface.data_collator import DataCollatorForPredict
from .nlp.utils import load_model
this_model = load_model(
checkpoint_path=self._checkpoint_path,
task=self._task,
num_labels=self._num_labels,
per_model_config=self._per_model_config,
"""
Need to reinit training_args because of a bug in deepspeed: if not reinit, the deepspeed config will be inconsistent
with HF config https://github.com/huggingface/transformers/blob/main/src/transformers/training_args.py#L947
"""
training_args = self._TrainingArguments(
local_rank=-1, model_path=self._checkpoint_path, fp16=self.fp16
)
training_args = self._get_training_args(local_rank=-1)
for key, val in self._training_args.__dict__.items():
if key not in ("local_rank", "model_path", "fp16"):
setattr(training_args, key, val)
self._training_args = training_args
new_trainer = TrainerForAuto(
model=this_model,
args=training_args,
data_collator=DataCollatorForPredict(
tokenizer=self._tokenizer,
pad_to_multiple_of=8 if training_args.fp16 else None,
)
if self._task == MULTICHOICECLASSIFICATION
else None,
model=self._model_init(),
args=self._training_args,
data_collator=self.data_collator,
compute_metrics=self._compute_metrics_by_dataset_name,
)
if self._task in NLG_TASKS:
setattr(new_trainer, "_is_seq2seq", True)
return new_trainer, training_args
return new_trainer
def predict_proba(self, X, **kwargs):
def predict_proba(self, X, **pred_kwargs):
from datasets import Dataset
self._update_hf_args(kwargs)
if pred_kwargs:
for key, val in pred_kwargs.items():
setattr(self._training_args, key, val)
assert (
self._task in CLASSIFICATION
), "predict_proba() only for classification tasks."
@@ -815,43 +805,36 @@ class TransformersEstimator(BaseEstimator):
X_test, _ = self._preprocess(X, **self._kwargs)
test_dataset = Dataset.from_pandas(X_test)
new_trainer, _ = self._init_model_for_predict()
new_trainer = self._init_model_for_predict()
predictions = new_trainer.predict(test_dataset)
return predictions.predictions
def score(self, X_val: DataFrame, y_val: Series, **kwargs):
import transformers
from datasets import Dataset
transformers.logging.set_verbosity_error()
self._metric = kwargs["metric"]
if (self._task not in NLG_TASKS) and (self._task != TOKENCLASSIFICATION):
self._X_val, _ = self._preprocess(X=X_val)
self._y_val = y_val
else:
self._X_val, self._y_val = self._preprocess(X=X_val, y=y_val)
eval_dataset, X_val, y_val = self.preprocess_data(X_val, y_val)
eval_dataset = Dataset.from_pandas(
TransformersEstimator._join(self._X_val, self._y_val)
)
new_trainer, training_args = self._init_model_for_predict()
new_trainer = self._init_model_for_predict()
return new_trainer.evaluate(eval_dataset)
def predict(self, X, **kwargs):
def predict(self, X, **pred_kwargs):
import transformers
from datasets import Dataset
transformers.logging.set_verbosity_error()
self._update_hf_args(kwargs)
if pred_kwargs:
for key, val in pred_kwargs.items():
setattr(self._training_args, key, val)
X_test, _ = self._preprocess(X, **self._kwargs)
test_dataset = Dataset.from_pandas(X_test)
new_trainer, training_args = self._init_model_for_predict()
new_trainer = self._init_model_for_predict()
if self._task not in NLG_TASKS:
predictions = new_trainer.predict(test_dataset)
@@ -868,7 +851,7 @@ class TransformersEstimator(BaseEstimator):
elif self._task == TOKENCLASSIFICATION:
return np.argmax(predictions.predictions, axis=2)
elif self._task == SUMMARIZATION:
decoded_preds = self._tokenizer.batch_decode(
decoded_preds = self.tokenizer.batch_decode(
predictions.predictions, skip_special_tokens=True
)
return decoded_preds
@@ -883,6 +866,36 @@ class TransformersEstimator(BaseEstimator):
return params
class TransformersEstimatorModelSelection(TransformersEstimator):
def __init__(self, task="seq-classification", **config):
super().__init__(task, **config)
@classmethod
def search_space(cls, data_size, task, **params):
search_space_dict = TransformersEstimator.search_space(
data_size, task, **params
)
"""
For model selection, use the same search space regardless of memory constraint
If OOM, user should change the search space themselves
"""
search_space_dict["model_path"] = {
"domain": tune.choice(
[
"google/electra-base-discriminator",
"bert-base-uncased",
"roberta-base",
"facebook/muppet-roberta-base",
"google/electra-small-discriminator",
]
),
"init_value": "facebook/muppet-roberta-base",
}
return search_space_dict
class SKLearnEstimator(BaseEstimator):
"""The base class for tuning scikit-learn estimators."""
@@ -914,7 +927,7 @@ class LGBMEstimator(BaseEstimator):
@classmethod
def search_space(cls, data_size, **params):
upper = min(32768, int(data_size[0]))
upper = max(5, min(32768, int(data_size[0]))) # upper must be larger than lower
return {
"n_estimators": {
"domain": tune.lograndint(lower=4, upper=upper),
@@ -1133,7 +1146,7 @@ class XGBoostEstimator(SKLearnEstimator):
@classmethod
def search_space(cls, data_size, **params):
upper = min(32768, int(data_size[0]))
upper = max(5, min(32768, int(data_size[0]))) # upper must be larger than lower
return {
"n_estimators": {
"domain": tune.lograndint(lower=4, upper=upper),
@@ -1366,7 +1379,7 @@ class RandomForestEstimator(SKLearnEstimator, LGBMEstimator):
lower = min(0.1, init)
space = {
"n_estimators": {
"domain": tune.lograndint(lower=4, upper=upper),
"domain": tune.lograndint(lower=4, upper=max(5, upper)),
"init_value": 4,
"low_cost_init_value": 4,
},
@@ -1376,7 +1389,8 @@ class RandomForestEstimator(SKLearnEstimator, LGBMEstimator):
},
"max_leaves": {
"domain": tune.lograndint(
lower=4, upper=min(32768, RandomForestEstimator.nrows >> 1)
lower=4,
upper=max(5, min(32768, RandomForestEstimator.nrows >> 1)), #
),
"init_value": 4,
"low_cost_init_value": 4,
@@ -1642,7 +1656,7 @@ class KNeighborsEstimator(BaseEstimator):
upper = min(512, int(data_size[0] / 2))
return {
"n_neighbors": {
"domain": tune.lograndint(lower=1, upper=upper),
"domain": tune.lograndint(lower=1, upper=max(2, upper)),
"init_value": 5,
"low_cost_init_value": 1,
},
@@ -1963,7 +1977,9 @@ class TS_SKLearn(SKLearnEstimator):
"low_cost_init_value": False,
},
"lags": {
"domain": tune.randint(lower=1, upper=int(np.sqrt(data_size[0]))),
"domain": tune.randint(
lower=1, upper=max(2, int(np.sqrt(data_size[0])))
),
"init_value": 3,
},
}

View File

@@ -9,7 +9,12 @@ class DataCollatorForAuto(DataCollatorWithPadding):
import torch
label_name = "label" if "label" in features[0].keys() else "labels"
labels = [feature.pop(label_name) for feature in features]
labels = (
[feature.pop(label_name) for feature in features]
if label_name in features[0]
else None
)
batch_size = len(features)
num_choices = len(features[0]["input_ids"])
flattened_features = [
@@ -21,7 +26,8 @@ class DataCollatorForAuto(DataCollatorWithPadding):
# Un-flatten
batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
# Add back labels
batch["labels"] = torch.tensor(labels, dtype=torch.int64)
if labels:
batch["labels"] = torch.tensor(labels, dtype=torch.int64)
return batch

View File

@@ -94,4 +94,5 @@ class TrainerForAuto(Seq2SeqTrainer):
else:
self.ckpt_to_global_step = {ckpt_dir: self.state.global_step}
self.ckpt_to_metric = {ckpt_dir: metrics} if metrics else {}
return metrics

View File

@@ -0,0 +1,142 @@
import argparse
from dataclasses import dataclass, field
from ...data import (
NLG_TASKS,
)
from typing import Optional, List
try:
from transformers import TrainingArguments
except ImportError:
TrainingArguments = object
@dataclass
class TrainingArgumentsForAuto(TrainingArguments):
    """FLAML custom TrainingArguments.

    Args:
        task (str, optional, defaults to "seq-classification"): the NLP task name.
        output_dir (str): data root directory for outputting the log, etc.
        model_path (str, optional, defaults to "facebook/muppet-roberta-base"): A string,
            the path of the language model file, either a path from huggingface
            model card huggingface.co/models, or a local path for the model.
        fp16 (bool, optional, defaults to "False"): A bool, whether to use FP16.
        max_seq_length (int, optional, defaults to 128): An integer, the max length of the sequence.
        ckpt_per_epoch (int, optional, defaults to 1): An integer, the number of checkpoints per epoch.
    """

    task: str = field(default="seq-classification")
    output_dir: str = field(default="data/output/", metadata={"help": "data dir"})
    model_path: str = field(
        default="facebook/muppet-roberta-base",
        metadata={
            "help": "model path for HPO natural language understanding tasks, default is set to facebook/muppet-roberta-base"
        },
    )
    # None means no explicit tokenizer path was given -- annotate as Optional.
    tokenizer_model_path: Optional[str] = field(
        default=None,
        metadata={"help": "tokenizer model path for HPO"},
    )
    fp16: bool = field(default=True, metadata={"help": "whether to use the FP16 mode"})
    max_seq_length: int = field(default=128, metadata={"help": "max seq length"})
    pad_to_max_length: bool = field(
        default=True,
        metadata={
            "help": "Whether to pad all samples to model maximum sentence length. "
            "If False, will pad the samples dynamically when batching to the maximum length in the batch. More "
            "efficient on GPU but very bad for TPU."
        },
    )
    ckpt_per_epoch: int = field(default=1, metadata={"help": "checkpoint per epoch"})
    per_device_eval_batch_size: int = field(
        default=1,
        metadata={"help": "per gpu evaluation batch size"},
    )
    report_to: Optional[List[str]] = field(
        default=None,
        metadata={
            "help": "The list of integrations to report the results and logs to."
        },
    )
    do_train: bool = field(default=False, metadata={"help": "Whether to run training."})
    do_eval: bool = field(
        default=False, metadata={"help": "Whether to run eval on the dev set."}
    )
    metric_for_best_model: Optional[str] = field(
        default="loss",
        metadata={"help": "The metric to use to compare two different models."},
    )

    @staticmethod
    def load_args_from_console():
        """Build an argparse parser from the dataclass fields and parse sys.argv.

        Returns:
            The parsed (known) console arguments as an argparse.Namespace.
        """
        from dataclasses import fields

        def _to_bool(value):
            # argparse's `type=bool` is a trap: bool("False") is True. Parse
            # the usual false spellings explicitly instead.
            return str(value).lower() not in ("false", "f", "no", "n", "0", "")

        arg_parser = argparse.ArgumentParser()
        for each_field in fields(TrainingArgumentsForAuto):
            field_type = each_field.type
            if isinstance(field_type, str):
                # Under `from __future__ import annotations` (used by some
                # parent classes), dataclasses store the raw annotation text;
                # map the common scalar names back to real types.
                field_type = {
                    "int": int,
                    "float": float,
                    "str": str,
                    "bool": bool,
                }.get(field_type, str)
            if field_type is bool:
                converter = _to_bool
            elif field_type in (int, float, str):
                converter = field_type
            else:
                # typing constructs (e.g. Optional[List[str]]) are not valid
                # argparse converters; accept raw strings instead.
                converter = str
            arg_parser.add_argument(
                "--" + each_field.name,
                type=converter,
                # Some fields (e.g. `task`) declare no metadata; avoid KeyError.
                help=each_field.metadata.get("help"),
                required=each_field.metadata.get("required", False),
                choices=each_field.metadata.get("choices"),
                default=each_field.default,
            )
        console_args, unknown = arg_parser.parse_known_args()
        return console_args
@dataclass
class Seq2SeqTrainingArgumentsForAuto(TrainingArgumentsForAuto):
    """FLAML custom Seq2SeqTrainingArguments for natural language generation tasks.

    Overrides the parent's ``model_path`` default with "t5-small" and adds
    generation-related fields (sortish sampling, generate-based evaluation,
    generation max length and beam count).
    """

    model_path: str = field(
        default="t5-small",
        metadata={
            "help": "model path for HPO natural language generation tasks, default is set to t5-small"
        },
    )
    sortish_sampler: bool = field(
        default=False, metadata={"help": "Whether to use SortishSampler or not."}
    )
    predict_with_generate: bool = field(
        default=True,
        metadata={
            "help": "Whether to use generate to calculate generative metrics (ROUGE, BLEU)."
        },
    )
    generation_max_length: Optional[int] = field(
        default=None,
        metadata={
            "help": "The `max_length` to use on each evaluation loop when `predict_with_generate=True`. Will default "
            "to the `max_length` value of the model configuration."
        },
    )
    generation_num_beams: Optional[int] = field(
        default=None,
        metadata={
            "help": "The `num_beams` to use on each evaluation loop when `predict_with_generate=True`. Will default "
            "to the `num_beams` value of the model configuration."
        },
    )

    def __post_init__(self):
        super().__post_init__()
        # For generation tasks, unconditionally overwrite model_path with the
        # t5-small default, even if a different model_path was supplied.
        if self.task in NLG_TASKS:
            self.model_path = "t5-small"

View File

@@ -1,5 +1,3 @@
import argparse
from dataclasses import dataclass, field
from itertools import chain
from typing import Dict, Any
import numpy as np
@@ -28,21 +26,6 @@ def load_default_huggingface_metric_for_task(task):
return "seqeval"
def get_auto_tokenizer(tokenizer_model_path, task):
    """Load a fast HuggingFace tokenizer for the given task.

    Summarization loads the tokenizer via the explicit keyword form of
    ``from_pretrained``; every other task uses the short positional form.
    """
    from transformers import AutoTokenizer

    if task != SUMMARIZATION:
        return AutoTokenizer.from_pretrained(tokenizer_model_path, use_fast=True)
    return AutoTokenizer.from_pretrained(
        pretrained_model_name_or_path=tokenizer_model_path,
        cache_dir=None,
        use_fast=True,
        revision="main",
        use_auth_token=None,
    )
def tokenize_text(X, Y=None, task=None, hf_args=None, tokenizer=None):
if task in (SEQCLASSIFICATION, SEQREGRESSION):
X_tokenized = tokenize_onedataframe(
@@ -332,42 +315,6 @@ def tokenize_swag(this_row, tokenizer, hf_args=None, return_column_name=False):
return [tokenized_example[x] for x in tmp_column_names]
def separate_config(config, task):
    """Split a flat config dict into (training_args_config, per_model_config).

    A key belongs to the training-args dict when any relevant HuggingFace
    TrainingArguments class declares it directly; everything else is treated
    as per-model configuration.
    """
    if task in NLG_TASKS:
        from transformers import Seq2SeqTrainingArguments, TrainingArguments

        candidate_classes = [Seq2SeqTrainingArguments, TrainingArguments]
    else:
        from transformers import TrainingArguments

        candidate_classes = [TrainingArguments]

    def _is_training_arg(name):
        # Membership is checked against each class's own __dict__, i.e. the
        # attributes declared directly on that TrainingArguments class.
        return any(name in cls.__dict__ for cls in candidate_classes)

    training_args_config = {
        key: val for key, val in config.items() if _is_training_arg(key)
    }
    per_model_config = {
        key: val for key, val in config.items() if not _is_training_arg(key)
    }
    return training_args_config, per_model_config
def get_num_labels(task, y_train):
    """Return the number of labels implied by the task and the training labels.

    Regression has a single output; sequence classification counts the
    distinct labels; token classification counts distinct labels after
    flattening the per-example label sequences. Any other task yields None.
    """
    from ..data import SEQCLASSIFICATION, SEQREGRESSION, TOKENCLASSIFICATION

    if task == SEQREGRESSION:
        return 1
    if task == SEQCLASSIFICATION:
        return len(set(y_train))
    if task == TOKENCLASSIFICATION:
        # y_train holds one label sequence per example; count unique tokens.
        token_labels = {label for row in y_train.tolist() for label in row}
        return len(token_labels)
    return None
def is_a_list_of_str(this_obj):
return (isinstance(this_obj, list) or isinstance(this_obj, np.ndarray)) and all(
isinstance(x, str) for x in this_obj
@@ -439,7 +386,7 @@ class Counter:
return logdir
def load_model(checkpoint_path, task, num_labels=None, per_model_config=None):
def load_model(checkpoint_path, task, num_labels=None):
import transformers
transformers.logging.set_verbosity_error()
@@ -479,25 +426,13 @@ def load_model(checkpoint_path, task, num_labels=None, per_model_config=None):
def _set_model_config(checkpoint_path):
if task in (SEQCLASSIFICATION, SEQREGRESSION, TOKENCLASSIFICATION):
if per_model_config:
model_config = AutoConfig.from_pretrained(
checkpoint_path,
num_labels=model_config_num_labels,
**per_model_config,
)
else:
model_config = AutoConfig.from_pretrained(
checkpoint_path, num_labels=model_config_num_labels
)
model_config = AutoConfig.from_pretrained(
checkpoint_path,
num_labels=model_config_num_labels,
)
return model_config
else:
if per_model_config:
model_config = AutoConfig.from_pretrained(
checkpoint_path,
**per_model_config,
)
else:
model_config = AutoConfig.from_pretrained(checkpoint_path)
model_config = AutoConfig.from_pretrained(checkpoint_path)
return model_config
current_config = AutoConfig.from_pretrained(checkpoint_path)
@@ -538,97 +473,3 @@ def load_model(checkpoint_path, task, num_labels=None, per_model_config=None):
model_config = _set_model_config(checkpoint_path)
this_model = get_this_model(checkpoint_path, task, model_config)
return this_model
def compute_checkpoint_freq(
    train_data_size,
    hf_args,
    num_train_epochs,
    batch_size,
):
    """Return the number of steps between two consecutive checkpoints.

    Steps per epoch is ``train_data_size / batch_size``; dividing by
    ``hf_args.ckpt_per_epoch`` spreads the requested checkpoints evenly.
    The epoch count is capped at 1 so sub-epoch runs checkpoint
    proportionally, and the trailing +1 keeps the frequency positive.
    """
    effective_epochs = min(num_train_epochs, 1)
    steps_covered = effective_epochs * train_data_size / batch_size
    return int(steps_covered / hf_args.ckpt_per_epoch) + 1
@dataclass
class HFArgs:
    """The HPO setting.

    Args:
        output_dir (str): data root directory for outputting the log, etc.
        model_path (str, optional, defaults to "facebook/muppet-roberta-base"): A string,
            the path of the language model file, either a path from huggingface
            model card huggingface.co/models, or a local path for the model.
        fp16 (bool, optional, defaults to "False"): A bool, whether to use FP16.
        max_seq_length (int, optional, defaults to 128): An integer, the max length of the sequence.
        ckpt_per_epoch (int, optional, defaults to 1): An integer, the number of checkpoints per epoch.
    """

    task: str = field(default="seq-classification")
    output_dir: str = field(
        default="data/output/", metadata={"help": "data dir", "required": True}
    )
    model_path: str = field(
        default="facebook/muppet-roberta-base",
        metadata={"help": "model path for HPO"},
    )
    # NOTE(review): default None presumably means "fall back to model_path"
    # when the tokenizer is loaded -- confirm against the caller.
    tokenizer_model_path: str = field(
        default=None,
        metadata={"help": "tokenizer model path for HPO"},
    )
    fp16: bool = field(default=True, metadata={"help": "whether to use the FP16 mode"})
    max_seq_length: int = field(default=128, metadata={"help": "max seq length"})
    pad_to_max_length: bool = field(
        default=True,
        metadata={
            "help": "Whether to pad all samples to model maximum sentence length. "
            "If False, will pad the samples dynamically when batching to the maximum length in the batch. More "
            "efficient on GPU but very bad for TPU."
        },
    )
    ckpt_per_epoch: int = field(default=1, metadata={"help": "checkpoint per epoch"})
    per_device_eval_batch_size: int = field(
        default=1,
        metadata={"help": "per gpu evaluation batch size"},
    )

    def __post_init__(self):
        # For generation tasks, unconditionally overwrite model_path with the
        # t5-small default, even if a different model_path was supplied.
        if self.task in NLG_TASKS:
            self.model_path = "t5-small"

    @staticmethod
    def load_args_from_console():
        """Build an argparse parser from this dataclass's fields and parse sys.argv.

        Returns:
            The parsed (known) console arguments.
        """
        from dataclasses import fields

        arg_parser = argparse.ArgumentParser()
        for each_field in fields(HFArgs):
            # NOTE(review): debug print left in -- consider removing.
            print(each_field)
            # NOTE(review): fields declared without a "help" entry in their
            # metadata (e.g. `task` above) will raise KeyError just below.
            arg_parser.add_argument(
                "--" + each_field.name,
                type=each_field.type,
                help=each_field.metadata["help"],
                required=each_field.metadata["required"]
                if "required" in each_field.metadata
                else False,
                choices=each_field.metadata["choices"]
                if "choices" in each_field.metadata
                else None,
                default=each_field.default,
            )
        console_args, unknown = arg_parser.parse_known_args()
        return console_args

View File

@@ -1 +1 @@
__version__ = "1.0.1"
__version__ = "1.0.2"

View File

@@ -369,12 +369,12 @@
" from pandas import MultiIndex, Int64Index\n",
"2022-03-19 13:59:38,805\tERROR services.py:1421 -- Failed to start the dashboard: Failed to start the dashboard, return code 1\n",
"Failed to read dashboard log: [Errno 2] No such file or directory: '/tmp/ray/session_2022-03-19_13-59-37_048935_85869/logs/dashboard.log'\n",
"\u001B[2m\u001B[33m(raylet)\u001B[0m Traceback (most recent call last):\n",
"\u001B[2m\u001B[33m(raylet)\u001B[0m File \"/data/installation/anaconda3/envs/tmp/lib/python3.8/site-packages/ray/dashboard/agent.py\", line 21, in <module>\n",
"\u001B[2m\u001B[33m(raylet)\u001B[0m import ray.dashboard.utils as dashboard_utils\n",
"\u001B[2m\u001B[33m(raylet)\u001B[0m File \"/data/installation/anaconda3/envs/tmp/lib/python3.8/site-packages/ray/dashboard/utils.py\", line 15, in <module>\n",
"\u001B[2m\u001B[33m(raylet)\u001B[0m import aioredis # noqa: F401\n",
"\u001B[2m\u001B[33m(raylet)\u001B[0m ModuleNotFoundError: No module named 'aioredis'\n"
"\u001b[2m\u001b[33m(raylet)\u001b[0m Traceback (most recent call last):\n",
"\u001b[2m\u001b[33m(raylet)\u001b[0m File \"/data/installation/anaconda3/envs/tmp/lib/python3.8/site-packages/ray/dashboard/agent.py\", line 21, in <module>\n",
"\u001b[2m\u001b[33m(raylet)\u001b[0m import ray.dashboard.utils as dashboard_utils\n",
"\u001b[2m\u001b[33m(raylet)\u001b[0m File \"/data/installation/anaconda3/envs/tmp/lib/python3.8/site-packages/ray/dashboard/utils.py\", line 15, in <module>\n",
"\u001b[2m\u001b[33m(raylet)\u001b[0m import aioredis # noqa: F401\n",
"\u001b[2m\u001b[33m(raylet)\u001b[0m ModuleNotFoundError: No module named 'aioredis'\n"
]
}
],
@@ -398,11 +398,13 @@
"automl_settings = {\n",
" \"time_budget\": 500, # setting the time budget\n",
" \"task\": \"seq-classification\", # setting the task as seq-classification\n",
" \"hf_args\": {\n",
" \"output_dir\": \"data/output/\", # setting the output directory\n",
" \"ckpt_per_epoch\": 1, # setting the number of checkoints per epoch\n",
" \"model_path\": \"google/electra-base-discriminator\",\n",
" \"per_device_eval_batch_size\": 16 # the batch size for validation (inference)\n",
" \"fit_kwargs_by_estimator\": {\n",
" \"transformer\": {\n",
" \"output_dir\": \"data/output/\", # setting the output directory\n",
"            \"ckpt_per_epoch\": 1,                    # setting the number of checkpoints per epoch\n",
" \"model_path\": \"google/electra-base-discriminator\", # if model_path is not set, the default model is facebook/muppet-roberta-base: https://huggingface.co/facebook/muppet-roberta-base\n",
" \"per_device_eval_batch_size\": 16 # the batch size for validation (inference)\n",
" }\n",
" },\n",
" \"gpu_per_trial\": 1, # set to 0 if no GPU is available\n",
" \"log_file_name\": \"seqclass.log\", # set the file to save the log for HPO\n",
@@ -434,53 +436,53 @@
"name": "stdout",
"output_type": "stream",
"text": [
"\u001B[2m\u001B[36m(train pid=86265)\u001B[0m {'loss': 0.688, 'learning_rate': 5.91472422187606e-05, 'epoch': 6.0}\n",
"\u001B[2m\u001B[36m(train pid=86265)\u001B[0m {'eval_loss': 0.6975926160812378, 'eval_automl_metric': 0.49082568807339455, 'eval_runtime': 0.9885, 'eval_samples_per_second': 882.13, 'eval_steps_per_second': 55.639, 'epoch': 6.0}\n",
"\u001B[2m\u001B[36m(train pid=86265)\u001B[0m {'eval_loss': 0.6959978938102722, 'eval_automl_metric': 0.49082568807339455, 'eval_runtime': 0.9612, 'eval_samples_per_second': 907.223, 'eval_steps_per_second': 57.222, 'epoch': 6.04}\n",
"\u001B[2m\u001B[36m(train pid=86255)\u001B[0m {'loss': 0.133, 'learning_rate': 1.9966093201540806e-05, 'epoch': 3.0}\n",
"\u001B[2m\u001B[36m(train pid=86255)\u001B[0m {'eval_loss': 0.29954516887664795, 'eval_automl_metric': 0.06880733944954132, 'eval_runtime': 0.9772, 'eval_samples_per_second': 892.372, 'eval_steps_per_second': 56.285, 'epoch': 3.0}\n",
"\u001B[2m\u001B[36m(train pid=86265)\u001B[0m {'eval_loss': 0.6959978938102722, 'eval_automl_metric': 0.49082568807339455, 'eval_runtime': 0.9707, 'eval_samples_per_second': 898.292, 'eval_steps_per_second': 56.658, 'epoch': 6.04}\n",
"\u001B[2m\u001B[36m(train pid=86265)\u001B[0m {'train_runtime': 510.9986, 'train_samples_per_second': 142.692, 'train_steps_per_second': 17.838, 'train_loss': 0.6829553683756834, 'epoch': 6.04}\n"
"\u001b[2m\u001b[36m(train pid=86265)\u001b[0m {'loss': 0.688, 'learning_rate': 5.91472422187606e-05, 'epoch': 6.0}\n",
"\u001b[2m\u001b[36m(train pid=86265)\u001b[0m {'eval_loss': 0.6975926160812378, 'eval_automl_metric': 0.49082568807339455, 'eval_runtime': 0.9885, 'eval_samples_per_second': 882.13, 'eval_steps_per_second': 55.639, 'epoch': 6.0}\n",
"\u001b[2m\u001b[36m(train pid=86265)\u001b[0m {'eval_loss': 0.6959978938102722, 'eval_automl_metric': 0.49082568807339455, 'eval_runtime': 0.9612, 'eval_samples_per_second': 907.223, 'eval_steps_per_second': 57.222, 'epoch': 6.04}\n",
"\u001b[2m\u001b[36m(train pid=86255)\u001b[0m {'loss': 0.133, 'learning_rate': 1.9966093201540806e-05, 'epoch': 3.0}\n",
"\u001b[2m\u001b[36m(train pid=86255)\u001b[0m {'eval_loss': 0.29954516887664795, 'eval_automl_metric': 0.06880733944954132, 'eval_runtime': 0.9772, 'eval_samples_per_second': 892.372, 'eval_steps_per_second': 56.285, 'epoch': 3.0}\n",
"\u001b[2m\u001b[36m(train pid=86265)\u001b[0m {'eval_loss': 0.6959978938102722, 'eval_automl_metric': 0.49082568807339455, 'eval_runtime': 0.9707, 'eval_samples_per_second': 898.292, 'eval_steps_per_second': 56.658, 'epoch': 6.04}\n",
"\u001b[2m\u001b[36m(train pid=86265)\u001b[0m {'train_runtime': 510.9986, 'train_samples_per_second': 142.692, 'train_steps_per_second': 17.838, 'train_loss': 0.6829553683756834, 'epoch': 6.04}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001B[2m\u001B[36m(train pid=86265)\u001B[0m Using amp half precision backend\n",
"\u001B[2m\u001B[36m(train pid=86265)\u001B[0m ***** Running Prediction *****\n",
"\u001B[2m\u001B[36m(train pid=86265)\u001B[0m Num examples = 872\n",
"\u001B[2m\u001B[36m(train pid=86265)\u001B[0m Batch size = 16\n"
"\u001b[2m\u001b[36m(train pid=86265)\u001b[0m Using amp half precision backend\n",
"\u001b[2m\u001b[36m(train pid=86265)\u001b[0m ***** Running Prediction *****\n",
"\u001b[2m\u001b[36m(train pid=86265)\u001b[0m Num examples = 872\n",
"\u001b[2m\u001b[36m(train pid=86265)\u001b[0m Batch size = 16\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001B[2m\u001B[36m(train pid=86211)\u001B[0m {'loss': 0.2158, 'learning_rate': 5.454224012822699e-06, 'epoch': 3.0}\n",
"\u001B[2m\u001B[36m(train pid=86211)\u001B[0m {'eval_loss': 0.21694229543209076, 'eval_automl_metric': 0.06422018348623848, 'eval_runtime': 0.9902, 'eval_samples_per_second': 880.648, 'eval_steps_per_second': 55.545, 'epoch': 3.0}\n",
"\u001B[2m\u001B[36m(train pid=86269)\u001B[0m {'eval_loss': 0.5855236649513245, 'eval_automl_metric': 0.18463302752293576, 'eval_runtime': 0.9877, 'eval_samples_per_second': 882.845, 'eval_steps_per_second': 55.684, 'epoch': 4.53}\n",
"\u001B[2m\u001B[36m(train pid=86269)\u001B[0m {'train_runtime': 373.6856, 'train_samples_per_second': 121.13, 'train_steps_per_second': 15.144, 'train_loss': 0.4671156674776214, 'epoch': 4.53}\n"
"\u001b[2m\u001b[36m(train pid=86211)\u001b[0m {'loss': 0.2158, 'learning_rate': 5.454224012822699e-06, 'epoch': 3.0}\n",
"\u001b[2m\u001b[36m(train pid=86211)\u001b[0m {'eval_loss': 0.21694229543209076, 'eval_automl_metric': 0.06422018348623848, 'eval_runtime': 0.9902, 'eval_samples_per_second': 880.648, 'eval_steps_per_second': 55.545, 'epoch': 3.0}\n",
"\u001b[2m\u001b[36m(train pid=86269)\u001b[0m {'eval_loss': 0.5855236649513245, 'eval_automl_metric': 0.18463302752293576, 'eval_runtime': 0.9877, 'eval_samples_per_second': 882.845, 'eval_steps_per_second': 55.684, 'epoch': 4.53}\n",
"\u001b[2m\u001b[36m(train pid=86269)\u001b[0m {'train_runtime': 373.6856, 'train_samples_per_second': 121.13, 'train_steps_per_second': 15.144, 'train_loss': 0.4671156674776214, 'epoch': 4.53}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001B[2m\u001B[36m(train pid=86269)\u001B[0m Using amp half precision backend\n",
"\u001B[2m\u001B[36m(train pid=86269)\u001B[0m ***** Running Prediction *****\n",
"\u001B[2m\u001B[36m(train pid=86269)\u001B[0m Num examples = 872\n",
"\u001B[2m\u001B[36m(train pid=86269)\u001B[0m Batch size = 16\n"
"\u001b[2m\u001b[36m(train pid=86269)\u001b[0m Using amp half precision backend\n",
"\u001b[2m\u001b[36m(train pid=86269)\u001b[0m ***** Running Prediction *****\n",
"\u001b[2m\u001b[36m(train pid=86269)\u001b[0m Num examples = 872\n",
"\u001b[2m\u001b[36m(train pid=86269)\u001b[0m Batch size = 16\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001B[2m\u001B[36m(train pid=86255)\u001B[0m {'loss': 0.0655, 'learning_rate': 1.1451941740938257e-05, 'epoch': 4.01}\n",
"\u001B[2m\u001B[36m(train pid=86255)\u001B[0m {'eval_loss': 0.32505378127098083, 'eval_automl_metric': 0.06307339449541283, 'eval_runtime': 0.9623, 'eval_samples_per_second': 906.171, 'eval_steps_per_second': 57.155, 'epoch': 4.01}\n",
"\u001B[2m\u001B[36m(train pid=86211)\u001B[0m {'loss': 0.1585, 'learning_rate': 4.673980815375941e-06, 'epoch': 4.01}\n",
"\u001B[2m\u001B[36m(train pid=86211)\u001B[0m {'eval_loss': 0.26288139820098877, 'eval_automl_metric': 0.06422018348623848, 'eval_runtime': 0.9914, 'eval_samples_per_second': 879.599, 'eval_steps_per_second': 55.479, 'epoch': 4.01}\n"
"\u001b[2m\u001b[36m(train pid=86255)\u001b[0m {'loss': 0.0655, 'learning_rate': 1.1451941740938257e-05, 'epoch': 4.01}\n",
"\u001b[2m\u001b[36m(train pid=86255)\u001b[0m {'eval_loss': 0.32505378127098083, 'eval_automl_metric': 0.06307339449541283, 'eval_runtime': 0.9623, 'eval_samples_per_second': 906.171, 'eval_steps_per_second': 57.155, 'epoch': 4.01}\n",
"\u001b[2m\u001b[36m(train pid=86211)\u001b[0m {'loss': 0.1585, 'learning_rate': 4.673980815375941e-06, 'epoch': 4.01}\n",
"\u001b[2m\u001b[36m(train pid=86211)\u001b[0m {'eval_loss': 0.26288139820098877, 'eval_automl_metric': 0.06422018348623848, 'eval_runtime': 0.9914, 'eval_samples_per_second': 879.599, 'eval_steps_per_second': 55.479, 'epoch': 4.01}\n"
]
},
{
@@ -494,31 +496,31 @@
"name": "stdout",
"output_type": "stream",
"text": [
"\u001B[2m\u001B[36m(train pid=86255)\u001B[0m {'eval_loss': 0.37855082750320435, 'eval_automl_metric': 0.06536697247706424, 'eval_runtime': 0.9622, 'eval_samples_per_second': 906.215, 'eval_steps_per_second': 57.158, 'epoch': 5.0}\n",
"\u001B[2m\u001B[36m(train pid=86255)\u001B[0m {'loss': 0.0323, 'learning_rate': 2.937790280335705e-06, 'epoch': 5.01}\n",
"\u001B[2m\u001B[36m(train pid=86255)\u001B[0m {'eval_loss': 0.38091975450515747, 'eval_automl_metric': 0.06536697247706424, 'eval_runtime': 0.9618, 'eval_samples_per_second': 906.667, 'eval_steps_per_second': 57.187, 'epoch': 5.01}\n",
"\u001B[2m\u001B[36m(train pid=86255)\u001B[0m {'eval_loss': 0.4038548171520233, 'eval_automl_metric': 0.06995412844036697, 'eval_runtime': 0.9613, 'eval_samples_per_second': 907.15, 'eval_steps_per_second': 57.217, 'epoch': 5.35}\n",
"\u001B[2m\u001B[36m(train pid=86255)\u001B[0m {'train_runtime': 240.3679, 'train_samples_per_second': 222.334, 'train_steps_per_second': 13.9, 'train_loss': 0.1605183750667532, 'epoch': 5.35}\n"
"\u001b[2m\u001b[36m(train pid=86255)\u001b[0m {'eval_loss': 0.37855082750320435, 'eval_automl_metric': 0.06536697247706424, 'eval_runtime': 0.9622, 'eval_samples_per_second': 906.215, 'eval_steps_per_second': 57.158, 'epoch': 5.0}\n",
"\u001b[2m\u001b[36m(train pid=86255)\u001b[0m {'loss': 0.0323, 'learning_rate': 2.937790280335705e-06, 'epoch': 5.01}\n",
"\u001b[2m\u001b[36m(train pid=86255)\u001b[0m {'eval_loss': 0.38091975450515747, 'eval_automl_metric': 0.06536697247706424, 'eval_runtime': 0.9618, 'eval_samples_per_second': 906.667, 'eval_steps_per_second': 57.187, 'epoch': 5.01}\n",
"\u001b[2m\u001b[36m(train pid=86255)\u001b[0m {'eval_loss': 0.4038548171520233, 'eval_automl_metric': 0.06995412844036697, 'eval_runtime': 0.9613, 'eval_samples_per_second': 907.15, 'eval_steps_per_second': 57.217, 'epoch': 5.35}\n",
"\u001b[2m\u001b[36m(train pid=86255)\u001b[0m {'train_runtime': 240.3679, 'train_samples_per_second': 222.334, 'train_steps_per_second': 13.9, 'train_loss': 0.1605183750667532, 'epoch': 5.35}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001B[2m\u001B[36m(train pid=86255)\u001B[0m Using amp half precision backend\n",
"\u001B[2m\u001B[36m(train pid=86255)\u001B[0m ***** Running Prediction *****\n",
"\u001B[2m\u001B[36m(train pid=86255)\u001B[0m Num examples = 872\n",
"\u001B[2m\u001B[36m(train pid=86255)\u001B[0m Batch size = 16\n"
"\u001b[2m\u001b[36m(train pid=86255)\u001b[0m Using amp half precision backend\n",
"\u001b[2m\u001b[36m(train pid=86255)\u001b[0m ***** Running Prediction *****\n",
"\u001b[2m\u001b[36m(train pid=86255)\u001b[0m Num examples = 872\n",
"\u001b[2m\u001b[36m(train pid=86255)\u001b[0m Batch size = 16\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001B[2m\u001B[36m(train pid=86211)\u001B[0m {'loss': 0.1288, 'learning_rate': 3.893737617929184e-06, 'epoch': 5.01}\n",
"\u001B[2m\u001B[36m(train pid=86211)\u001B[0m {'eval_loss': 0.2783028781414032, 'eval_automl_metric': 0.06536697247706424, 'eval_runtime': 0.9904, 'eval_samples_per_second': 880.481, 'eval_steps_per_second': 55.535, 'epoch': 5.01}\n",
"\u001B[2m\u001B[36m(train pid=86211)\u001B[0m {'loss': 0.0921, 'learning_rate': 3.114740815366527e-06, 'epoch': 6.01}\n",
"\u001B[2m\u001B[36m(train pid=86211)\u001B[0m {'eval_loss': 0.2939322590827942, 'eval_automl_metric': 0.06422018348623848, 'eval_runtime': 0.9917, 'eval_samples_per_second': 879.284, 'eval_steps_per_second': 55.459, 'epoch': 6.01}\n"
"\u001b[2m\u001b[36m(train pid=86211)\u001b[0m {'loss': 0.1288, 'learning_rate': 3.893737617929184e-06, 'epoch': 5.01}\n",
"\u001b[2m\u001b[36m(train pid=86211)\u001b[0m {'eval_loss': 0.2783028781414032, 'eval_automl_metric': 0.06536697247706424, 'eval_runtime': 0.9904, 'eval_samples_per_second': 880.481, 'eval_steps_per_second': 55.535, 'epoch': 5.01}\n",
"\u001b[2m\u001b[36m(train pid=86211)\u001b[0m {'loss': 0.0921, 'learning_rate': 3.114740815366527e-06, 'epoch': 6.01}\n",
"\u001b[2m\u001b[36m(train pid=86211)\u001b[0m {'eval_loss': 0.2939322590827942, 'eval_automl_metric': 0.06422018348623848, 'eval_runtime': 0.9917, 'eval_samples_per_second': 879.284, 'eval_steps_per_second': 55.459, 'epoch': 6.01}\n"
]
},
{
@@ -532,8 +534,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"\u001B[2m\u001B[36m(train pid=86211)\u001B[0m {'loss': 0.0749, 'learning_rate': 2.3357440128038706e-06, 'epoch': 7.01}\n",
"\u001B[2m\u001B[36m(train pid=86211)\u001B[0m {'eval_loss': 0.3375593423843384, 'eval_automl_metric': 0.0665137614678899, 'eval_runtime': 1.0158, 'eval_samples_per_second': 858.42, 'eval_steps_per_second': 54.143, 'epoch': 7.01}\n"
"\u001b[2m\u001b[36m(train pid=86211)\u001b[0m {'loss': 0.0749, 'learning_rate': 2.3357440128038706e-06, 'epoch': 7.01}\n",
"\u001b[2m\u001b[36m(train pid=86211)\u001b[0m {'eval_loss': 0.3375593423843384, 'eval_automl_metric': 0.0665137614678899, 'eval_runtime': 1.0158, 'eval_samples_per_second': 858.42, 'eval_steps_per_second': 54.143, 'epoch': 7.01}\n"
]
},
{
@@ -547,8 +549,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"\u001B[2m\u001B[36m(train pid=86211)\u001B[0m {'loss': 0.06, 'learning_rate': 1.5555008153571132e-06, 'epoch': 8.01}\n",
"\u001B[2m\u001B[36m(train pid=86211)\u001B[0m {'eval_loss': 0.33065176010131836, 'eval_automl_metric': 0.06307339449541283, 'eval_runtime': 0.9921, 'eval_samples_per_second': 878.905, 'eval_steps_per_second': 55.436, 'epoch': 8.01}\n"
"\u001b[2m\u001b[36m(train pid=86211)\u001b[0m {'loss': 0.06, 'learning_rate': 1.5555008153571132e-06, 'epoch': 8.01}\n",
"\u001b[2m\u001b[36m(train pid=86211)\u001b[0m {'eval_loss': 0.33065176010131836, 'eval_automl_metric': 0.06307339449541283, 'eval_runtime': 0.9921, 'eval_samples_per_second': 878.905, 'eval_steps_per_second': 55.436, 'epoch': 8.01}\n"
]
},
{
@@ -562,9 +564,9 @@
"name": "stdout",
"output_type": "stream",
"text": [
"\u001B[2m\u001B[36m(train pid=86211)\u001B[0m {'eval_loss': 0.34692466259002686, 'eval_automl_metric': 0.06307339449541283, 'eval_runtime': 0.9923, 'eval_samples_per_second': 878.777, 'eval_steps_per_second': 55.427, 'epoch': 9.0}\n",
"\u001B[2m\u001B[36m(train pid=86211)\u001B[0m {'loss': 0.0547, 'learning_rate': 7.752576179103561e-07, 'epoch': 9.01}\n",
"\u001B[2m\u001B[36m(train pid=86211)\u001B[0m {'eval_loss': 0.3511528968811035, 'eval_automl_metric': 0.06307339449541283, 'eval_runtime': 1.0097, 'eval_samples_per_second': 863.616, 'eval_steps_per_second': 54.471, 'epoch': 9.01}\n"
"\u001b[2m\u001b[36m(train pid=86211)\u001b[0m {'eval_loss': 0.34692466259002686, 'eval_automl_metric': 0.06307339449541283, 'eval_runtime': 0.9923, 'eval_samples_per_second': 878.777, 'eval_steps_per_second': 55.427, 'epoch': 9.0}\n",
"\u001b[2m\u001b[36m(train pid=86211)\u001b[0m {'loss': 0.0547, 'learning_rate': 7.752576179103561e-07, 'epoch': 9.01}\n",
"\u001b[2m\u001b[36m(train pid=86211)\u001b[0m {'eval_loss': 0.3511528968811035, 'eval_automl_metric': 0.06307339449541283, 'eval_runtime': 1.0097, 'eval_samples_per_second': 863.616, 'eval_steps_per_second': 54.471, 'epoch': 9.01}\n"
]
},
{
@@ -578,18 +580,18 @@
"name": "stdout",
"output_type": "stream",
"text": [
"\u001B[2m\u001B[36m(train pid=86211)\u001B[0m {'eval_loss': 0.36141160130500793, 'eval_automl_metric': 0.06192660550458717, 'eval_runtime': 0.9929, 'eval_samples_per_second': 878.273, 'eval_steps_per_second': 55.396, 'epoch': 10.0}\n",
"\u001B[2m\u001B[36m(train pid=86211)\u001B[0m {'train_runtime': 482.4828, 'train_samples_per_second': 207.261, 'train_steps_per_second': 12.954, 'train_loss': 0.1848659490966797, 'epoch': 10.0}\n"
"\u001b[2m\u001b[36m(train pid=86211)\u001b[0m {'eval_loss': 0.36141160130500793, 'eval_automl_metric': 0.06192660550458717, 'eval_runtime': 0.9929, 'eval_samples_per_second': 878.273, 'eval_steps_per_second': 55.396, 'epoch': 10.0}\n",
"\u001b[2m\u001b[36m(train pid=86211)\u001b[0m {'train_runtime': 482.4828, 'train_samples_per_second': 207.261, 'train_steps_per_second': 12.954, 'train_loss': 0.1848659490966797, 'epoch': 10.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001B[2m\u001B[36m(train pid=86211)\u001B[0m Using amp half precision backend\n",
"\u001B[2m\u001B[36m(train pid=86211)\u001B[0m ***** Running Prediction *****\n",
"\u001B[2m\u001B[36m(train pid=86211)\u001B[0m Num examples = 872\n",
"\u001B[2m\u001B[36m(train pid=86211)\u001B[0m Batch size = 16\n",
"\u001b[2m\u001b[36m(train pid=86211)\u001b[0m Using amp half precision backend\n",
"\u001b[2m\u001b[36m(train pid=86211)\u001b[0m ***** Running Prediction *****\n",
"\u001b[2m\u001b[36m(train pid=86211)\u001b[0m Num examples = 872\n",
"\u001b[2m\u001b[36m(train pid=86211)\u001b[0m Batch size = 16\n",
"2022-03-19 14:14:22,379\tINFO tune.py:639 -- Total run time: 875.36 seconds (500.34 seconds for the tuning loop).\n",
"[flaml.automl: 03-19 14:14:25] {2837} INFO - selected model: None\n",
"/data/installation/anaconda3/envs/tmp/lib/python3.8/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
@@ -729,7 +731,7 @@
],
"source": [
"'''compute predictions of testing dataset''' \n",
"y_pred = automl.predict(X_test, **{\"hf_args\": {\"per_device_eval_batch_size\": 1}})\n",
"y_pred = automl.predict(X_test, **{\"per_device_eval_batch_size\": 1})\n",
"print('Predicted labels', y_pred)"
]
},
@@ -934,47 +936,47 @@
"name": "stdout",
"output_type": "stream",
"text": [
"\u001B[2m\u001B[36m(train pid=86157)\u001B[0m {'eval_loss': 0.6315866112709045, 'eval_automl_metric': 0.18779999999999997, 'eval_runtime': 15.4883, 'eval_samples_per_second': 645.648, 'eval_steps_per_second': 40.353, 'epoch': 1.66}\n",
"\u001B[2m\u001B[36m(train pid=86157)\u001B[0m {'train_runtime': 190.7625, 'train_samples_per_second': 87.254, 'train_steps_per_second': 10.909, 'train_loss': 0.5091343906738046, 'epoch': 1.66}\n",
"\u001B[2m\u001B[36m(train pid=86249)\u001B[0m {'eval_loss': 1.2118068933486938, 'eval_automl_metric': 0.2015, 'eval_runtime': 15.2585, 'eval_samples_per_second': 655.374, 'eval_steps_per_second': 40.961, 'epoch': 2.87}\n"
"\u001b[2m\u001b[36m(train pid=86157)\u001b[0m {'eval_loss': 0.6315866112709045, 'eval_automl_metric': 0.18779999999999997, 'eval_runtime': 15.4883, 'eval_samples_per_second': 645.648, 'eval_steps_per_second': 40.353, 'epoch': 1.66}\n",
"\u001b[2m\u001b[36m(train pid=86157)\u001b[0m {'train_runtime': 190.7625, 'train_samples_per_second': 87.254, 'train_steps_per_second': 10.909, 'train_loss': 0.5091343906738046, 'epoch': 1.66}\n",
"\u001b[2m\u001b[36m(train pid=86249)\u001b[0m {'eval_loss': 1.2118068933486938, 'eval_automl_metric': 0.2015, 'eval_runtime': 15.2585, 'eval_samples_per_second': 655.374, 'eval_steps_per_second': 40.961, 'epoch': 2.87}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001B[2m\u001B[36m(train pid=86157)\u001B[0m Using amp half precision backend\n",
"\u001B[2m\u001B[36m(train pid=86157)\u001B[0m The following columns in the test set don't have a corresponding argument in `RobertaForMultipleChoice.forward` and have been ignored: ending3, ending1, video-id, sent1, ending0, sent2, fold-ind, ending2, startphrase, gold-source. If ending3, ending1, video-id, sent1, ending0, sent2, fold-ind, ending2, startphrase, gold-source are not expected by `RobertaForMultipleChoice.forward`, you can safely ignore this message.\n",
"\u001B[2m\u001B[36m(train pid=86157)\u001B[0m ***** Running Prediction *****\n",
"\u001B[2m\u001B[36m(train pid=86157)\u001B[0m Num examples = 10000\n",
"\u001B[2m\u001B[36m(train pid=86157)\u001B[0m Batch size = 16\n"
"\u001b[2m\u001b[36m(train pid=86157)\u001b[0m Using amp half precision backend\n",
"\u001b[2m\u001b[36m(train pid=86157)\u001b[0m The following columns in the test set don't have a corresponding argument in `RobertaForMultipleChoice.forward` and have been ignored: ending3, ending1, video-id, sent1, ending0, sent2, fold-ind, ending2, startphrase, gold-source. If ending3, ending1, video-id, sent1, ending0, sent2, fold-ind, ending2, startphrase, gold-source are not expected by `RobertaForMultipleChoice.forward`, you can safely ignore this message.\n",
"\u001b[2m\u001b[36m(train pid=86157)\u001b[0m ***** Running Prediction *****\n",
"\u001b[2m\u001b[36m(train pid=86157)\u001b[0m Num examples = 10000\n",
"\u001b[2m\u001b[36m(train pid=86157)\u001b[0m Batch size = 16\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001B[2m\u001B[36m(train pid=86249)\u001B[0m {'eval_loss': 1.2118068933486938, 'eval_automl_metric': 0.2015, 'eval_runtime': 15.1369, 'eval_samples_per_second': 660.639, 'eval_steps_per_second': 41.29, 'epoch': 2.87}\n",
"\u001B[2m\u001B[36m(train pid=86249)\u001B[0m {'train_runtime': 546.3809, 'train_samples_per_second': 156.658, 'train_steps_per_second': 39.165, 'train_loss': 0.5030154804349909, 'epoch': 2.87}\n",
"\u001B[2m\u001B[36m(train pid=86195)\u001B[0m {'loss': 0.4854, 'learning_rate': 1.3592147782116173e-06, 'epoch': 2.0}\n"
"\u001b[2m\u001b[36m(train pid=86249)\u001b[0m {'eval_loss': 1.2118068933486938, 'eval_automl_metric': 0.2015, 'eval_runtime': 15.1369, 'eval_samples_per_second': 660.639, 'eval_steps_per_second': 41.29, 'epoch': 2.87}\n",
"\u001b[2m\u001b[36m(train pid=86249)\u001b[0m {'train_runtime': 546.3809, 'train_samples_per_second': 156.658, 'train_steps_per_second': 39.165, 'train_loss': 0.5030154804349909, 'epoch': 2.87}\n",
"\u001b[2m\u001b[36m(train pid=86195)\u001b[0m {'loss': 0.4854, 'learning_rate': 1.3592147782116173e-06, 'epoch': 2.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001B[2m\u001B[36m(train pid=86249)\u001B[0m Using amp half precision backend\n",
"\u001B[2m\u001B[36m(train pid=86249)\u001B[0m The following columns in the test set don't have a corresponding argument in `RobertaForMultipleChoice.forward` and have been ignored: fold-ind, sent2, gold-source, ending1, startphrase, sent1, ending0, video-id, ending2, ending3. If fold-ind, sent2, gold-source, ending1, startphrase, sent1, ending0, video-id, ending2, ending3 are not expected by `RobertaForMultipleChoice.forward`, you can safely ignore this message.\n",
"\u001B[2m\u001B[36m(train pid=86249)\u001B[0m ***** Running Prediction *****\n",
"\u001B[2m\u001B[36m(train pid=86249)\u001B[0m Num examples = 10000\n",
"\u001B[2m\u001B[36m(train pid=86249)\u001B[0m Batch size = 16\n"
"\u001b[2m\u001b[36m(train pid=86249)\u001b[0m Using amp half precision backend\n",
"\u001b[2m\u001b[36m(train pid=86249)\u001b[0m The following columns in the test set don't have a corresponding argument in `RobertaForMultipleChoice.forward` and have been ignored: fold-ind, sent2, gold-source, ending1, startphrase, sent1, ending0, video-id, ending2, ending3. If fold-ind, sent2, gold-source, ending1, startphrase, sent1, ending0, video-id, ending2, ending3 are not expected by `RobertaForMultipleChoice.forward`, you can safely ignore this message.\n",
"\u001b[2m\u001b[36m(train pid=86249)\u001b[0m ***** Running Prediction *****\n",
"\u001b[2m\u001b[36m(train pid=86249)\u001b[0m Num examples = 10000\n",
"\u001b[2m\u001b[36m(train pid=86249)\u001b[0m Batch size = 16\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001B[2m\u001B[36m(train pid=86195)\u001B[0m {'eval_loss': 0.49709731340408325, 'eval_automl_metric': 0.17600000000000005, 'eval_runtime': 15.4983, 'eval_samples_per_second': 645.232, 'eval_steps_per_second': 40.327, 'epoch': 2.0}\n"
"\u001b[2m\u001b[36m(train pid=86195)\u001b[0m {'eval_loss': 0.49709731340408325, 'eval_automl_metric': 0.17600000000000005, 'eval_runtime': 15.4983, 'eval_samples_per_second': 645.232, 'eval_steps_per_second': 40.327, 'epoch': 2.0}\n"
]
},
{
@@ -988,8 +990,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"\u001B[2m\u001B[36m(train pid=86195)\u001B[0m {'eval_loss': 0.5254333019256592, 'eval_automl_metric': 0.17800000000000005, 'eval_runtime': 15.45, 'eval_samples_per_second': 647.251, 'eval_steps_per_second': 40.453, 'epoch': 3.0}\n",
"\u001B[2m\u001B[36m(train pid=86195)\u001B[0m {'loss': 0.3989, 'learning_rate': 3.8051750127352887e-07, 'epoch': 3.0}\n"
"\u001b[2m\u001b[36m(train pid=86195)\u001b[0m {'eval_loss': 0.5254333019256592, 'eval_automl_metric': 0.17800000000000005, 'eval_runtime': 15.45, 'eval_samples_per_second': 647.251, 'eval_steps_per_second': 40.453, 'epoch': 3.0}\n",
"\u001b[2m\u001b[36m(train pid=86195)\u001b[0m {'loss': 0.3989, 'learning_rate': 3.8051750127352887e-07, 'epoch': 3.0}\n"
]
},
{
@@ -1003,9 +1005,9 @@
"name": "stdout",
"output_type": "stream",
"text": [
"\u001B[2m\u001B[36m(train pid=86195)\u001B[0m {'eval_loss': 0.5254867076873779, 'eval_automl_metric': 0.17789999999999995, 'eval_runtime': 15.424, 'eval_samples_per_second': 648.341, 'eval_steps_per_second': 40.521, 'epoch': 3.0}\n",
"\u001B[2m\u001B[36m(train pid=86195)\u001B[0m {'eval_loss': 0.5332269072532654, 'eval_automl_metric': 0.17830000000000001, 'eval_runtime': 15.4452, 'eval_samples_per_second': 647.45, 'eval_steps_per_second': 40.466, 'epoch': 3.39}\n",
"\u001B[2m\u001B[36m(train pid=86195)\u001B[0m {'train_runtime': 382.2827, 'train_samples_per_second': 88.597, 'train_steps_per_second': 11.076, 'train_loss': 0.5299136270370808, 'epoch': 3.39}\n"
"\u001b[2m\u001b[36m(train pid=86195)\u001b[0m {'eval_loss': 0.5254867076873779, 'eval_automl_metric': 0.17789999999999995, 'eval_runtime': 15.424, 'eval_samples_per_second': 648.341, 'eval_steps_per_second': 40.521, 'epoch': 3.0}\n",
"\u001b[2m\u001b[36m(train pid=86195)\u001b[0m {'eval_loss': 0.5332269072532654, 'eval_automl_metric': 0.17830000000000001, 'eval_runtime': 15.4452, 'eval_samples_per_second': 647.45, 'eval_steps_per_second': 40.466, 'epoch': 3.39}\n",
"\u001b[2m\u001b[36m(train pid=86195)\u001b[0m {'train_runtime': 382.2827, 'train_samples_per_second': 88.597, 'train_steps_per_second': 11.076, 'train_loss': 0.5299136270370808, 'epoch': 3.39}\n"
]
},
{
@@ -1013,11 +1015,11 @@
"output_type": "stream",
"text": [
"2022-03-19 14:43:56,739\tWARNING ray_trial_executor.py:146 -- Skipping cleanup - trainable.stop did not return in time. Consider making `stop` a faster operation.\n",
"\u001B[2m\u001B[36m(train pid=86195)\u001B[0m Using amp half precision backend\n",
"\u001B[2m\u001B[36m(train pid=86195)\u001B[0m The following columns in the test set don't have a corresponding argument in `RobertaForMultipleChoice.forward` and have been ignored: ending2, sent1, ending0, sent2, ending3, video-id, gold-source, ending1, startphrase, fold-ind. If ending2, sent1, ending0, sent2, ending3, video-id, gold-source, ending1, startphrase, fold-ind are not expected by `RobertaForMultipleChoice.forward`, you can safely ignore this message.\n",
"\u001B[2m\u001B[36m(train pid=86195)\u001B[0m ***** Running Prediction *****\n",
"\u001B[2m\u001B[36m(train pid=86195)\u001B[0m Num examples = 10000\n",
"\u001B[2m\u001B[36m(train pid=86195)\u001B[0m Batch size = 16\n",
"\u001b[2m\u001b[36m(train pid=86195)\u001b[0m Using amp half precision backend\n",
"\u001b[2m\u001b[36m(train pid=86195)\u001b[0m The following columns in the test set don't have a corresponding argument in `RobertaForMultipleChoice.forward` and have been ignored: ending2, sent1, ending0, sent2, ending3, video-id, gold-source, ending1, startphrase, fold-ind. If ending2, sent1, ending0, sent2, ending3, video-id, gold-source, ending1, startphrase, fold-ind are not expected by `RobertaForMultipleChoice.forward`, you can safely ignore this message.\n",
"\u001b[2m\u001b[36m(train pid=86195)\u001b[0m ***** Running Prediction *****\n",
"\u001b[2m\u001b[36m(train pid=86195)\u001b[0m Num examples = 10000\n",
"\u001b[2m\u001b[36m(train pid=86195)\u001b[0m Batch size = 16\n",
"2022-03-19 14:44:14,271\tINFO tune.py:639 -- Total run time: 795.18 seconds (504.18 seconds for the tuning loop).\n",
"[flaml.automl: 03-19 14:44:19] {2837} INFO - selected model: None\n",
"/data/installation/anaconda3/envs/tmp/lib/python3.8/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
@@ -1060,10 +1062,12 @@
"automl_settings = {\n",
" \"time_budget\": 500, # setting the time budget\n",
" \"task\": \"multichoice-classification\", # setting the task as multiplechoice-classification\n",
" \"hf_args\": {\n",
" \"output_dir\": \"data/output/\", # setting the output directory\n",
" \"ckpt_per_epoch\": 1, # setting the number of checkoints per epoch\n",
" \"per_device_eval_batch_size\": 16, # the batch size for validation (inference)\n",
" \"fit_kwargs_by_estimator\": { # if model_path is not set, the default model is facebook/muppet-roberta-base: https://huggingface.co/facebook/muppet-roberta-base\n",
" \"transformer\": {\n",
" \"output_dir\": \"data/output/\", # setting the output directory\n",
" \"ckpt_per_epoch\": 1, # setting the number of checkoints per epoch\n",
" \"per_device_eval_batch_size\": 16, # the batch size for validation (inference)\n",
" }\n",
" },\n",
" \"gpu_per_trial\": 1, # set to 0 if no GPU is available\n",
" \"log_file_name\": \"seqclass.log\", # set the file to save the log for HPO\n",
@@ -1213,37 +1217,37 @@
"name": "stderr",
"output_type": "stream",
"text": [
"\u001B[2m\u001B[36m(train pid=86232)\u001B[0m /data/installation/anaconda3/envs/tmp/lib/python3.8/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
"\u001B[2m\u001B[36m(train pid=86232)\u001B[0m warnings.warn(\n"
"\u001b[2m\u001b[36m(train pid=86232)\u001b[0m /data/installation/anaconda3/envs/tmp/lib/python3.8/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
"\u001b[2m\u001b[36m(train pid=86232)\u001b[0m warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001B[2m\u001B[36m(train pid=86232)\u001B[0m {'loss': 8.7635, 'learning_rate': 1.2308416834153697e-05, 'epoch': 0.11}\n"
"\u001b[2m\u001b[36m(train pid=86232)\u001b[0m {'loss': 8.7635, 'learning_rate': 1.2308416834153697e-05, 'epoch': 0.11}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001B[2m\u001B[36m(train pid=86184)\u001B[0m /data/installation/anaconda3/envs/tmp/lib/python3.8/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
"\u001B[2m\u001B[36m(train pid=86184)\u001B[0m warnings.warn(\n",
"\u001B[2m\u001B[36m(train pid=86225)\u001B[0m /data/installation/anaconda3/envs/tmp/lib/python3.8/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
"\u001B[2m\u001B[36m(train pid=86225)\u001B[0m warnings.warn(\n",
"\u001B[2m\u001B[36m(train pid=86160)\u001B[0m /data/installation/anaconda3/envs/tmp/lib/python3.8/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
"\u001B[2m\u001B[36m(train pid=86160)\u001B[0m warnings.warn(\n",
"\u001b[2m\u001b[36m(train pid=86184)\u001b[0m /data/installation/anaconda3/envs/tmp/lib/python3.8/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
"\u001b[2m\u001b[36m(train pid=86184)\u001b[0m warnings.warn(\n",
"\u001b[2m\u001b[36m(train pid=86225)\u001b[0m /data/installation/anaconda3/envs/tmp/lib/python3.8/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
"\u001b[2m\u001b[36m(train pid=86225)\u001b[0m warnings.warn(\n",
"\u001b[2m\u001b[36m(train pid=86160)\u001b[0m /data/installation/anaconda3/envs/tmp/lib/python3.8/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
"\u001b[2m\u001b[36m(train pid=86160)\u001b[0m warnings.warn(\n",
"2022-03-19 14:56:00,679\tWARNING ray_trial_executor.py:146 -- Skipping cleanup - trainable.stop did not return in time. Consider making `stop` a faster operation.\n",
"\u001B[2m\u001B[36m(train pid=86232)\u001B[0m [nltk_data] Downloading package punkt to /home/xliu127/nltk_data...\n",
"\u001B[2m\u001B[36m(train pid=86232)\u001B[0m [nltk_data] Package punkt is already up-to-date!\n"
"\u001b[2m\u001b[36m(train pid=86232)\u001b[0m [nltk_data] Downloading package punkt to /home/xliu127/nltk_data...\n",
"\u001b[2m\u001b[36m(train pid=86232)\u001b[0m [nltk_data] Package punkt is already up-to-date!\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001B[2m\u001B[36m(train pid=86232)\u001B[0m {'eval_loss': 6.893245697021484, 'eval_automl_metric': 0.8537338408275918, 'eval_runtime': 102.2734, 'eval_samples_per_second': 110.801, 'eval_steps_per_second': 6.932, 'epoch': 0.11}\n"
"\u001b[2m\u001b[36m(train pid=86232)\u001b[0m {'eval_loss': 6.893245697021484, 'eval_automl_metric': 0.8537338408275918, 'eval_runtime': 102.2734, 'eval_samples_per_second': 110.801, 'eval_steps_per_second': 6.932, 'epoch': 0.11}\n"
]
},
{
@@ -1251,32 +1255,32 @@
"output_type": "stream",
"text": [
"2022-03-19 14:57:00,687\tWARNING ray_trial_executor.py:146 -- Skipping cleanup - trainable.stop did not return in time. Consider making `stop` a faster operation.\n",
"\u001B[2m\u001B[36m(train pid=86184)\u001B[0m [nltk_data] Downloading package punkt to /home/xliu127/nltk_data...\n",
"\u001B[2m\u001B[36m(train pid=86184)\u001B[0m [nltk_data] Package punkt is already up-to-date!\n"
"\u001b[2m\u001b[36m(train pid=86184)\u001b[0m [nltk_data] Downloading package punkt to /home/xliu127/nltk_data...\n",
"\u001b[2m\u001b[36m(train pid=86184)\u001b[0m [nltk_data] Package punkt is already up-to-date!\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001B[2m\u001B[36m(train pid=86184)\u001B[0m {'eval_loss': 7.381210803985596, 'eval_automl_metric': 0.8475751825208984, 'eval_runtime': 107.4032, 'eval_samples_per_second': 105.509, 'eval_steps_per_second': 6.601, 'epoch': 0.16}\n"
"\u001b[2m\u001b[36m(train pid=86184)\u001b[0m {'eval_loss': 7.381210803985596, 'eval_automl_metric': 0.8475751825208984, 'eval_runtime': 107.4032, 'eval_samples_per_second': 105.509, 'eval_steps_per_second': 6.601, 'epoch': 0.16}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001B[2m\u001B[36m(train pid=86160)\u001B[0m [nltk_data] Downloading package punkt to /home/xliu127/nltk_data...\n",
"\u001B[2m\u001B[36m(train pid=86160)\u001B[0m [nltk_data] Package punkt is already up-to-date!\n",
"\u001B[2m\u001B[36m(train pid=86225)\u001B[0m [nltk_data] Downloading package punkt to /home/xliu127/nltk_data...\n",
"\u001B[2m\u001B[36m(train pid=86225)\u001B[0m [nltk_data] Package punkt is already up-to-date!\n"
"\u001b[2m\u001b[36m(train pid=86160)\u001b[0m [nltk_data] Downloading package punkt to /home/xliu127/nltk_data...\n",
"\u001b[2m\u001b[36m(train pid=86160)\u001b[0m [nltk_data] Package punkt is already up-to-date!\n",
"\u001b[2m\u001b[36m(train pid=86225)\u001b[0m [nltk_data] Downloading package punkt to /home/xliu127/nltk_data...\n",
"\u001b[2m\u001b[36m(train pid=86225)\u001b[0m [nltk_data] Package punkt is already up-to-date!\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001B[2m\u001B[36m(train pid=86160)\u001B[0m {'eval_loss': 10.150897979736328, 'eval_automl_metric': 0.8566791839938478, 'eval_runtime': 108.2143, 'eval_samples_per_second': 104.718, 'eval_steps_per_second': 6.552, 'epoch': 0.36}\n"
"\u001b[2m\u001b[36m(train pid=86160)\u001b[0m {'eval_loss': 10.150897979736328, 'eval_automl_metric': 0.8566791839938478, 'eval_runtime': 108.2143, 'eval_samples_per_second': 104.718, 'eval_steps_per_second': 6.552, 'epoch': 0.36}\n"
]
},
{
@@ -1290,23 +1294,23 @@
"name": "stdout",
"output_type": "stream",
"text": [
"\u001B[2m\u001B[36m(train pid=86225)\u001B[0m {'eval_loss': 11.665904998779297, 'eval_automl_metric': 0.858011676038827, 'eval_runtime': 109.4667, 'eval_samples_per_second': 103.52, 'eval_steps_per_second': 6.477, 'epoch': 0.38}\n"
"\u001b[2m\u001b[36m(train pid=86225)\u001b[0m {'eval_loss': 11.665904998779297, 'eval_automl_metric': 0.858011676038827, 'eval_runtime': 109.4667, 'eval_samples_per_second': 103.52, 'eval_steps_per_second': 6.477, 'epoch': 0.38}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001B[2m\u001B[36m(train pid=86232)\u001B[0m [nltk_data] Downloading package punkt to /home/xliu127/nltk_data...\n",
"\u001B[2m\u001B[36m(train pid=86232)\u001B[0m [nltk_data] Package punkt is already up-to-date!\n"
"\u001b[2m\u001b[36m(train pid=86232)\u001b[0m [nltk_data] Downloading package punkt to /home/xliu127/nltk_data...\n",
"\u001b[2m\u001b[36m(train pid=86232)\u001b[0m [nltk_data] Package punkt is already up-to-date!\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001B[2m\u001B[36m(train pid=86232)\u001B[0m {'eval_loss': 6.893245697021484, 'eval_automl_metric': 0.8537338408275918, 'eval_runtime': 110.7246, 'eval_samples_per_second': 102.344, 'eval_steps_per_second': 6.403, 'epoch': 0.11}\n",
"\u001B[2m\u001B[36m(train pid=86232)\u001B[0m {'train_runtime': 220.8946, 'train_samples_per_second': 4.648, 'train_steps_per_second': 0.149, 'train_loss': 8.763471198804451, 'epoch': 0.11}\n"
"\u001b[2m\u001b[36m(train pid=86232)\u001b[0m {'eval_loss': 6.893245697021484, 'eval_automl_metric': 0.8537338408275918, 'eval_runtime': 110.7246, 'eval_samples_per_second': 102.344, 'eval_steps_per_second': 6.403, 'epoch': 0.11}\n",
"\u001b[2m\u001b[36m(train pid=86232)\u001b[0m {'train_runtime': 220.8946, 'train_samples_per_second': 4.648, 'train_steps_per_second': 0.149, 'train_loss': 8.763471198804451, 'epoch': 0.11}\n"
]
},
{
@@ -1314,40 +1318,40 @@
"output_type": "stream",
"text": [
"2022-03-19 14:59:00,706\tWARNING ray_trial_executor.py:146 -- Skipping cleanup - trainable.stop did not return in time. Consider making `stop` a faster operation.\n",
"\u001B[2m\u001B[36m(train pid=86232)\u001B[0m Using amp half precision backend\n",
"\u001B[2m\u001B[36m(train pid=86232)\u001B[0m ***** Running Prediction *****\n",
"\u001B[2m\u001B[36m(train pid=86232)\u001B[0m Num examples = 11332\n",
"\u001B[2m\u001B[36m(train pid=86232)\u001B[0m Batch size = 16\n",
"\u001B[2m\u001B[36m(train pid=86184)\u001B[0m [nltk_data] Downloading package punkt to /home/xliu127/nltk_data...\n",
"\u001B[2m\u001B[36m(train pid=86184)\u001B[0m [nltk_data] Package punkt is already up-to-date!\n",
"\u001B[2m\u001B[36m(train pid=86160)\u001B[0m [nltk_data] Downloading package punkt to /home/xliu127/nltk_data...\n",
"\u001B[2m\u001B[36m(train pid=86160)\u001B[0m [nltk_data] Package punkt is already up-to-date!\n"
"\u001b[2m\u001b[36m(train pid=86232)\u001b[0m Using amp half precision backend\n",
"\u001b[2m\u001b[36m(train pid=86232)\u001b[0m ***** Running Prediction *****\n",
"\u001b[2m\u001b[36m(train pid=86232)\u001b[0m Num examples = 11332\n",
"\u001b[2m\u001b[36m(train pid=86232)\u001b[0m Batch size = 16\n",
"\u001b[2m\u001b[36m(train pid=86184)\u001b[0m [nltk_data] Downloading package punkt to /home/xliu127/nltk_data...\n",
"\u001b[2m\u001b[36m(train pid=86184)\u001b[0m [nltk_data] Package punkt is already up-to-date!\n",
"\u001b[2m\u001b[36m(train pid=86160)\u001b[0m [nltk_data] Downloading package punkt to /home/xliu127/nltk_data...\n",
"\u001b[2m\u001b[36m(train pid=86160)\u001b[0m [nltk_data] Package punkt is already up-to-date!\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001B[2m\u001B[36m(train pid=86184)\u001B[0m {'eval_loss': 7.381210803985596, 'eval_automl_metric': 0.8475751825208984, 'eval_runtime': 109.1975, 'eval_samples_per_second': 103.775, 'eval_steps_per_second': 6.493, 'epoch': 0.16}\n"
"\u001b[2m\u001b[36m(train pid=86184)\u001b[0m {'eval_loss': 7.381210803985596, 'eval_automl_metric': 0.8475751825208984, 'eval_runtime': 109.1975, 'eval_samples_per_second': 103.775, 'eval_steps_per_second': 6.493, 'epoch': 0.16}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001B[2m\u001B[36m(train pid=86225)\u001B[0m [nltk_data] Downloading package punkt to /home/xliu127/nltk_data...\n",
"\u001B[2m\u001B[36m(train pid=86225)\u001B[0m [nltk_data] Package punkt is already up-to-date!\n"
"\u001b[2m\u001b[36m(train pid=86225)\u001b[0m [nltk_data] Downloading package punkt to /home/xliu127/nltk_data...\n",
"\u001b[2m\u001b[36m(train pid=86225)\u001b[0m [nltk_data] Package punkt is already up-to-date!\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001B[2m\u001B[36m(train pid=86184)\u001B[0m {'train_runtime': 232.9303, 'train_samples_per_second': 10.067, 'train_steps_per_second': 1.262, 'train_loss': 9.880440506280637, 'epoch': 0.16}\n",
"\u001B[2m\u001B[36m(train pid=86160)\u001B[0m {'eval_loss': 10.150897979736328, 'eval_automl_metric': 0.8566791839938478, 'eval_runtime': 108.3182, 'eval_samples_per_second': 104.618, 'eval_steps_per_second': 6.546, 'epoch': 0.36}\n",
"\u001B[2m\u001B[36m(train pid=86160)\u001B[0m {'train_runtime': 232.4568, 'train_samples_per_second': 92.218, 'train_steps_per_second': 2.887, 'train_loss': 11.215172903878349, 'epoch': 0.36}\n",
"\u001B[2m\u001B[36m(train pid=86225)\u001B[0m {'eval_loss': 11.665904998779297, 'eval_automl_metric': 0.858011676038827, 'eval_runtime': 110.526, 'eval_samples_per_second': 102.528, 'eval_steps_per_second': 6.415, 'epoch': 0.38}\n",
"\u001B[2m\u001B[36m(train pid=86225)\u001B[0m {'train_runtime': 236.6253, 'train_samples_per_second': 19.714, 'train_steps_per_second': 0.621, 'train_loss': 11.549961930614407, 'epoch': 0.38}\n"
"\u001b[2m\u001b[36m(train pid=86184)\u001b[0m {'train_runtime': 232.9303, 'train_samples_per_second': 10.067, 'train_steps_per_second': 1.262, 'train_loss': 9.880440506280637, 'epoch': 0.16}\n",
"\u001b[2m\u001b[36m(train pid=86160)\u001b[0m {'eval_loss': 10.150897979736328, 'eval_automl_metric': 0.8566791839938478, 'eval_runtime': 108.3182, 'eval_samples_per_second': 104.618, 'eval_steps_per_second': 6.546, 'epoch': 0.36}\n",
"\u001b[2m\u001b[36m(train pid=86160)\u001b[0m {'train_runtime': 232.4568, 'train_samples_per_second': 92.218, 'train_steps_per_second': 2.887, 'train_loss': 11.215172903878349, 'epoch': 0.36}\n",
"\u001b[2m\u001b[36m(train pid=86225)\u001b[0m {'eval_loss': 11.665904998779297, 'eval_automl_metric': 0.858011676038827, 'eval_runtime': 110.526, 'eval_samples_per_second': 102.528, 'eval_steps_per_second': 6.415, 'epoch': 0.38}\n",
"\u001b[2m\u001b[36m(train pid=86225)\u001b[0m {'train_runtime': 236.6253, 'train_samples_per_second': 19.714, 'train_steps_per_second': 0.621, 'train_loss': 11.549961930614407, 'epoch': 0.38}\n"
]
},
{
@@ -1355,18 +1359,18 @@
"output_type": "stream",
"text": [
"2022-03-19 15:00:00,942\tWARNING ray_trial_executor.py:146 -- Skipping cleanup - trainable.stop did not return in time. Consider making `stop` a faster operation.\n",
"\u001B[2m\u001B[36m(train pid=86184)\u001B[0m Using amp half precision backend\n",
"\u001B[2m\u001B[36m(train pid=86184)\u001B[0m ***** Running Prediction *****\n",
"\u001B[2m\u001B[36m(train pid=86184)\u001B[0m Num examples = 11332\n",
"\u001B[2m\u001B[36m(train pid=86184)\u001B[0m Batch size = 16\n",
"\u001B[2m\u001B[36m(train pid=86160)\u001B[0m Using amp half precision backend\n",
"\u001B[2m\u001B[36m(train pid=86160)\u001B[0m ***** Running Prediction *****\n",
"\u001B[2m\u001B[36m(train pid=86160)\u001B[0m Num examples = 11332\n",
"\u001B[2m\u001B[36m(train pid=86160)\u001B[0m Batch size = 16\n",
"\u001B[2m\u001B[36m(train pid=86225)\u001B[0m Using amp half precision backend\n",
"\u001B[2m\u001B[36m(train pid=86225)\u001B[0m ***** Running Prediction *****\n",
"\u001B[2m\u001B[36m(train pid=86225)\u001B[0m Num examples = 11332\n",
"\u001B[2m\u001B[36m(train pid=86225)\u001B[0m Batch size = 16\n",
"\u001b[2m\u001b[36m(train pid=86184)\u001b[0m Using amp half precision backend\n",
"\u001b[2m\u001b[36m(train pid=86184)\u001b[0m ***** Running Prediction *****\n",
"\u001b[2m\u001b[36m(train pid=86184)\u001b[0m Num examples = 11332\n",
"\u001b[2m\u001b[36m(train pid=86184)\u001b[0m Batch size = 16\n",
"\u001b[2m\u001b[36m(train pid=86160)\u001b[0m Using amp half precision backend\n",
"\u001b[2m\u001b[36m(train pid=86160)\u001b[0m ***** Running Prediction *****\n",
"\u001b[2m\u001b[36m(train pid=86160)\u001b[0m Num examples = 11332\n",
"\u001b[2m\u001b[36m(train pid=86160)\u001b[0m Batch size = 16\n",
"\u001b[2m\u001b[36m(train pid=86225)\u001b[0m Using amp half precision backend\n",
"\u001b[2m\u001b[36m(train pid=86225)\u001b[0m ***** Running Prediction *****\n",
"\u001b[2m\u001b[36m(train pid=86225)\u001b[0m Num examples = 11332\n",
"\u001b[2m\u001b[36m(train pid=86225)\u001b[0m Batch size = 16\n",
"2022-03-19 15:01:00,948\tWARNING ray_trial_executor.py:146 -- Skipping cleanup - trainable.stop did not return in time. Consider making `stop` a faster operation.\n",
"2022-03-19 15:02:20,150\tINFO tune.py:639 -- Total run time: 950.87 seconds (500.36 seconds for the tuning loop).\n",
"[flaml.automl: 03-19 15:02:25] {2837} INFO - selected model: None\n",
@@ -1399,11 +1403,13 @@
"automl_settings = {\n",
" \"time_budget\": 500, # setting the time budget\n",
" \"task\": \"summarization\", # setting the task as summarization\n",
" \"hf_args\": {\n",
" \"output_dir\": \"data/output/\", # setting the output directory\n",
" \"ckpt_per_epoch\": 1, # setting the number of checkoints per epoch\n",
" \"model_path\": \"t5-small\",\n",
" \"per_device_eval_batch_size\": 16, # the batch size for validation (inference)\n",
" \"fit_kwargs_by_estimator\": { # if model_path is not set, the default model is t5-small: https://huggingface.co/t5-small\n",
" \"transformer\": {\n",
" \"output_dir\": \"data/output/\", # setting the output directory\n",
" \"ckpt_per_epoch\": 1, # setting the number of checkoints per epoch\n",
" \"model_path\": \"t5-small\",\n",
" \"per_device_eval_batch_size\": 16, # the batch size for validation (inference)\n",
" }\n",
" },\n",
" \"gpu_per_trial\": 1, # set to 0 if no GPU is available\n",
" \"log_file_name\": \"seqclass.log\", # set the file to save the log for HPO\n",
@@ -1519,4 +1525,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}

View File

@@ -1,7 +1,7 @@
def test_load_args_sub():
from flaml.nlp.utils import HFArgs
from flaml.nlp.utils import TrainingArgumentsForAuto
HFArgs.load_args_from_console()
TrainingArgumentsForAuto.load_args_from_console()
if __name__ == "__main__":

View File

View File

@@ -0,0 +1,6 @@
Dataset,NumberOfInstances
glue-rte-,2500
glue-mrpc-,3700
glue-cola-,8500
glue-qnli-,105000
glue-sst2-,67000
1 Dataset NumberOfInstances
2 glue-rte- 2500
3 glue-mrpc- 3700
4 glue-cola- 8500
5 glue-qnli- 105000
6 glue-sst2- 67000

View File

@@ -0,0 +1,6 @@
Dataset,NonExisting
glue-rte-,2500
glue-mrpc-,3700
glue-cola-,8500
glue-qnli-,105000
glue-sst2-,67000
1 Dataset NonExisting
2 glue-rte- 2500
3 glue-mrpc- 3700
4 glue-cola- 8500
5 glue-qnli- 105000
6 glue-sst2- 67000

View File

@@ -0,0 +1,5 @@
{"class": "transformer_ms",
"hyperparameters": {"learning_rate": 1e-5, "num_train_epochs": 1.0, "per_device_train_batch_size": 8,
"warmup_ratio": 0.0, "weight_decay": 0.0, "adam_epsilon": 1e-6, "seed": 44, "global_max_steps": 101,
"model_path": "google/electra-base-discriminator"}
}

View File

@@ -0,0 +1,5 @@
{"class": "transformer_ms",
"hyperparameters": {"learning_rate": 1e-5, "num_train_epochs": 1.0, "per_device_train_batch_size": 8,
"warmup_ratio": 0.0, "weight_decay": 0.0, "adam_epsilon": 1e-6, "seed": 43, "global_max_steps": 100,
"model_path": "google/electra-base-discriminator"}
}

View File

@@ -0,0 +1,5 @@
{"class": "transformer_ms",
"hyperparameters": {"learning_rate": 1e-5, "num_train_epochs": 1.0, "per_device_train_batch_size": 8,
"warmup_ratio": 0.0, "weight_decay": 0.0, "adam_epsilon": 1e-6, "seed": 41, "global_max_steps": 102,
"model_path": "google/electra-base-discriminator" }
}

View File

@@ -0,0 +1,5 @@
{"class": "transformer_ms",
"hyperparameters": {"learning_rate": 1e-5, "num_train_epochs": 1.0, "per_device_train_batch_size": 8,
"warmup_ratio": 0.0, "weight_decay": 0.0, "adam_epsilon": 1e-6, "seed": 42, "global_max_steps": 103,
"model_path": "google/electra-base-discriminator" }
}

View File

@@ -0,0 +1,5 @@
{"class": "transformer_ms",
"hyperparameters": {"learning_rate": 1e-5, "num_train_epochs": 1.0, "per_device_train_batch_size": 8,
"warmup_ratio": 0.0, "weight_decay": 0.0, "adam_epsilon": 1e-6, "seed": 40, "global_max_steps": 105,
"model_path": "google/electra-base-discriminator"}
}

View File

@@ -0,0 +1,26 @@
task,fold,type,result,params
glue-rte-,0,seq-classification,0.946366,{'_modeljson': 'transformer_ms/glue-rte-.json'}
glue-rte-,0,seq-classification,0.957774,{'_modeljson': 'transformer_ms/glue-mrpc-.json'}
glue-rte-,0,seq-classification,0.901643,{'_modeljson': 'transformer_ms/glue-cola-.json'}
glue-rte-,0,seq-classification,0.915098,{'_modeljson': 'transformer_ms/glue-qnli-.json'}
glue-rte-,0,seq-classification,0.302328,{'_modeljson': 'transformer_ms/glue-sst2-.json'}
glue-mrpc-,0,seq-classification,0.937203,{'_modeljson': 'transformer_ms/glue-rte-.json'}
glue-mrpc-,0,seq-classification,0.932072,{'_modeljson': 'transformer_ms/glue-mrpc-.json'}
glue-mrpc-,0,seq-classification,0.926563,{'_modeljson': 'transformer_ms/glue-cola-.json'}
glue-mrpc-,0,seq-classification,0.928604,{'_modeljson': 'transformer_ms/glue-qnli-.json'}
glue-mrpc-,0,seq-classification,0.911171,{'_modeljson': 'transformer_ms/glue-sst2-.json'}
glue-cola-,0,seq-classification,0.705404,{'_modeljson': 'transformer_ms/glue-rte-.json'}
glue-cola-,0,seq-classification,0.714521,{'_modeljson': 'transformer_ms/glue-mrpc-.json'}
glue-cola-,0,seq-classification,0.732288,{'_modeljson': 'transformer_ms/glue-cola-.json'}
glue-cola-,0,seq-classification,0.710273,{'_modeljson': 'transformer_ms/glue-qnli-.json'}
glue-cola-,0,seq-classification,0.707107,{'_modeljson': 'transformer_ms/glue-sst2-.json'}
glue-qnli-,0,seq-classification,0.744825,{'_modeljson': 'transformer_ms/glue-rte-.json'}
glue-qnli-,0,seq-classification,0.758979,{'_modeljson': 'transformer_ms/glue-mrpc-.json'}
glue-qnli-,0,seq-classification,0.758364,{'_modeljson': 'transformer_ms/glue-cola-.json'}
glue-qnli-,0,seq-classification,0.770923,{'_modeljson': 'transformer_ms/glue-qnli-.json'}
glue-qnli-,0,seq-classification,0.745091,{'_modeljson': 'transformer_ms/glue-sst2-.json'}
glue-sst2-,0,seq-regression,0.754523,{'_modeljson': 'transformer_ms/glue-rte-.json'}
glue-sst2-,0,seq-regression,0.759939,{'_modeljson': 'transformer_ms/glue-mrpc-.json'}
glue-sst2-,0,seq-regression,0.765119,{'_modeljson': 'transformer_ms/glue-cola-.json'}
glue-sst2-,0,seq-regression,0.745067,{'_modeljson': 'transformer_ms/glue-qnli-.json'}
glue-sst2-,0,seq-regression,0.762311,{'_modeljson': 'transformer_ms/glue-sst2-.json'}
1 task fold type result params
2 glue-rte- 0 seq-classification 0.946366 {'_modeljson': 'transformer_ms/glue-rte-.json'}
3 glue-rte- 0 seq-classification 0.957774 {'_modeljson': 'transformer_ms/glue-mrpc-.json'}
4 glue-rte- 0 seq-classification 0.901643 {'_modeljson': 'transformer_ms/glue-cola-.json'}
5 glue-rte- 0 seq-classification 0.915098 {'_modeljson': 'transformer_ms/glue-qnli-.json'}
6 glue-rte- 0 seq-classification 0.302328 {'_modeljson': 'transformer_ms/glue-sst2-.json'}
7 glue-mrpc- 0 seq-classification 0.937203 {'_modeljson': 'transformer_ms/glue-rte-.json'}
8 glue-mrpc- 0 seq-classification 0.932072 {'_modeljson': 'transformer_ms/glue-mrpc-.json'}
9 glue-mrpc- 0 seq-classification 0.926563 {'_modeljson': 'transformer_ms/glue-cola-.json'}
10 glue-mrpc- 0 seq-classification 0.928604 {'_modeljson': 'transformer_ms/glue-qnli-.json'}
11 glue-mrpc- 0 seq-classification 0.911171 {'_modeljson': 'transformer_ms/glue-sst2-.json'}
12 glue-cola- 0 seq-classification 0.705404 {'_modeljson': 'transformer_ms/glue-rte-.json'}
13 glue-cola- 0 seq-classification 0.714521 {'_modeljson': 'transformer_ms/glue-mrpc-.json'}
14 glue-cola- 0 seq-classification 0.732288 {'_modeljson': 'transformer_ms/glue-cola-.json'}
15 glue-cola- 0 seq-classification 0.710273 {'_modeljson': 'transformer_ms/glue-qnli-.json'}
16 glue-cola- 0 seq-classification 0.707107 {'_modeljson': 'transformer_ms/glue-sst2-.json'}
17 glue-qnli- 0 seq-classification 0.744825 {'_modeljson': 'transformer_ms/glue-rte-.json'}
18 glue-qnli- 0 seq-classification 0.758979 {'_modeljson': 'transformer_ms/glue-mrpc-.json'}
19 glue-qnli- 0 seq-classification 0.758364 {'_modeljson': 'transformer_ms/glue-cola-.json'}
20 glue-qnli- 0 seq-classification 0.770923 {'_modeljson': 'transformer_ms/glue-qnli-.json'}
21 glue-qnli- 0 seq-classification 0.745091 {'_modeljson': 'transformer_ms/glue-sst2-.json'}
22 glue-sst2- 0 seq-regression 0.754523 {'_modeljson': 'transformer_ms/glue-rte-.json'}
23 glue-sst2- 0 seq-regression 0.759939 {'_modeljson': 'transformer_ms/glue-mrpc-.json'}
24 glue-sst2- 0 seq-regression 0.765119 {'_modeljson': 'transformer_ms/glue-cola-.json'}
25 glue-sst2- 0 seq-regression 0.745067 {'_modeljson': 'transformer_ms/glue-qnli-.json'}
26 glue-sst2- 0 seq-regression 0.762311 {'_modeljson': 'transformer_ms/glue-sst2-.json'}

View File

@@ -1,97 +1,18 @@
import sys
import pytest
import pickle
import requests
from utils import get_toy_data_seqclassification, get_automl_settings
@pytest.mark.skipif(sys.platform == "darwin", reason="do not run on mac os")
def test_hf_data():
from flaml import AutoML
import pandas as pd
train_data = {
"sentence1": [
'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
"Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .",
"They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .",
"Around 0335 GMT , Tab shares were up 19 cents , or 4.4 % , at A $ 4.56 , having earlier set a record high of A $ 4.57 .",
],
"sentence2": [
'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
"Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 .",
"On June 10 , the ship 's owners had published an advertisement on the Internet , offering the explosives for sale .",
"Tab shares jumped 20 cents , or 4.6 % , to set a record closing high at A $ 4.57 .",
],
"label": [1, 0, 1, 0],
"idx": [0, 1, 2, 3],
}
train_dataset = pd.DataFrame(train_data)
dev_data = {
"sentence1": [
"The stock rose $ 2.11 , or about 11 percent , to close Friday at $ 21.51 on the New York Stock Exchange .",
"Revenue in the first quarter of the year dropped 15 percent from the same period a year earlier .",
"The Nasdaq had a weekly gain of 17.27 , or 1.2 percent , closing at 1,520.15 on Friday .",
"The DVD-CCA then appealed to the state Supreme Court .",
],
"sentence2": [
"PG & E Corp. shares jumped $ 1.63 or 8 percent to $ 21.03 on the New York Stock Exchange on Friday .",
"With the scandal hanging over Stewart 's company , revenue the first quarter of the year dropped 15 percent from the same period a year earlier .",
"The tech-laced Nasdaq Composite .IXIC rallied 30.46 points , or 2.04 percent , to 1,520.15 .",
"The DVD CCA appealed that decision to the U.S. Supreme Court .",
],
"label": [1, 1, 0, 1],
"idx": [4, 5, 6, 7],
}
dev_dataset = pd.DataFrame(dev_data)
test_data = {
"sentence1": [
"That compared with $ 35.18 million , or 24 cents per share , in the year-ago period .",
"Shares of Genentech , a much larger company with several products on the market , rose more than 2 percent .",
"Legislation making it harder for consumers to erase their debts in bankruptcy court won overwhelming House approval in March .",
"The Nasdaq composite index increased 10.73 , or 0.7 percent , to 1,514.77 .",
],
"sentence2": [
"Earnings were affected by a non-recurring $ 8 million tax benefit in the year-ago period .",
"Shares of Xoma fell 16 percent in early trade , while shares of Genentech , a much larger company with several products on the market , were up 2 percent .",
"Legislation making it harder for consumers to erase their debts in bankruptcy court won speedy , House approval in March and was endorsed by the White House .",
"The Nasdaq Composite index , full of technology stocks , was lately up around 18 points .",
],
"label": [0, 0, 0, 0],
"idx": [8, 10, 11, 12],
}
test_dataset = pd.DataFrame(test_data)
custom_sent_keys = ["sentence1", "sentence2"]
label_key = "label"
X_train = train_dataset[custom_sent_keys]
y_train = train_dataset[label_key]
X_val = dev_dataset[custom_sent_keys]
y_val = dev_dataset[label_key]
X_test = test_dataset[custom_sent_keys]
X_train, y_train, X_val, y_val, X_test = get_toy_data_seqclassification()
automl = AutoML()
automl_settings = {
"gpu_per_trial": 0,
"max_iter": 3,
"time_budget": 10,
"task": "seq-classification",
"metric": "accuracy",
"log_file_name": "seqclass.log",
"use_ray": False,
}
automl_settings["hf_args"] = {
"model_path": "google/electra-small-discriminator",
"output_dir": "test/data/output/",
"ckpt_per_epoch": 5,
"fp16": False,
}
automl_settings = get_automl_settings()
try:
automl.fit(
@@ -107,6 +28,11 @@ def test_hf_data():
return
automl = AutoML()
automl_settings.pop("max_iter", None)
automl_settings.pop("use_ray", None)
automl_settings.pop("estimator_list", None)
automl.retrain_from_log(
X_train=X_train,
y_train=y_train,
@@ -128,66 +54,5 @@ def test_hf_data():
print(automl.classes_)
def _test_custom_data():
from flaml import AutoML
import requests
import pandas as pd
try:
train_dataset = pd.read_csv("data/input/train.tsv", delimiter="\t", quoting=3)
dev_dataset = pd.read_csv("data/input/dev.tsv", delimiter="\t", quoting=3)
test_dataset = pd.read_csv("data/input/test.tsv", delimiter="\t", quoting=3)
except requests.exceptions.HTTPError:
return
custom_sent_keys = ["#1 String", "#2 String"]
label_key = "Quality"
X_train = train_dataset[custom_sent_keys]
y_train = train_dataset[label_key]
X_val = dev_dataset[custom_sent_keys]
y_val = dev_dataset[label_key]
X_test = test_dataset[custom_sent_keys]
automl = AutoML()
automl_settings = {
"gpu_per_trial": 0,
"max_iter": 3,
"time_budget": 5,
"task": "seq-classification",
"metric": "accuracy",
}
automl_settings["hf_args"] = {
"model_path": "google/electra-small-discriminator",
"output_dir": "data/output/",
"ckpt_per_epoch": 1,
}
automl.fit(
X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings
)
automl.predict(X_test)
automl.predict(["test test"])
automl.predict(
[
["test test", "test test"],
["test test", "test test"],
["test test", "test test"],
]
)
automl.pickle("automl.pkl")
with open("automl.pkl", "rb") as f:
automl = pickle.load(f)
config = automl.best_config.copy()
config["learner"] = automl.best_estimator
automl.trainable(config)
if __name__ == "__main__":
test_hf_data()

View File

@@ -1,63 +1,14 @@
from utils import get_toy_data_multiclassclassification, get_automl_settings
def test_classification_head():
from flaml import AutoML
import pandas as pd
import requests
train_data = {
"text": [
"i didnt feel humiliated",
"i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake",
"im grabbing a minute to post i feel greedy wrong",
"i am ever feeling nostalgic about the fireplace i will know that it is still on the property",
"i am feeling grouchy",
"ive been feeling a little burdened lately wasnt sure why that was",
"ive been taking or milligrams or times recommended amount and ive fallen asleep a lot faster but i also feel like so funny",
"i feel as confused about life as a teenager or as jaded as a year old man",
"i have been with petronas for years i feel that petronas has performed well and made a huge profit",
"i feel romantic too",
"i feel like i have to make the suffering i m seeing mean something",
"i do feel that running is a divine experience and that i can expect to have some type of spiritual encounter",
],
"label": [0, 0, 3, 2, 3, 0, 5, 4, 1, 2, 0, 1],
}
train_dataset = pd.DataFrame(train_data)
dev_data = {
"text": [
"i think it s the easiest time of year to feel dissatisfied",
"i feel low energy i m just thirsty",
"i have immense sympathy with the general point but as a possible proto writer trying to find time to write in the corners of life and with no sign of an agent let alone a publishing contract this feels a little precious",
"i do not feel reassured anxiety is on each side",
],
"label": [3, 0, 1, 1],
}
dev_dataset = pd.DataFrame(dev_data)
custom_sent_keys = ["text"]
label_key = "label"
X_train = train_dataset[custom_sent_keys]
y_train = train_dataset[label_key]
X_val = dev_dataset[custom_sent_keys]
y_val = dev_dataset[label_key]
X_train, y_train, X_val, y_val = get_toy_data_multiclassclassification()
automl = AutoML()
automl_settings = {
"gpu_per_trial": 0,
"max_iter": 3,
"time_budget": 5,
"task": "seq-classification",
"metric": "accuracy",
}
automl_settings["hf_args"] = {
"model_path": "google/electra-small-discriminator",
"output_dir": "test/data/output/",
"ckpt_per_epoch": 1,
"fp16": False,
}
automl_settings = get_automl_settings()
try:
automl.fit(

View File

@@ -1,5 +1,6 @@
import sys
import pytest
from utils import get_toy_data_seqclassification, get_automl_settings
def custom_metric(
@@ -19,7 +20,7 @@ def custom_metric(
from flaml.model import TransformersEstimator
if estimator._trainer is None:
trainer, _ = estimator._init_model_for_predict()
trainer = estimator._init_model_for_predict()
estimator._trainer = None
else:
trainer = estimator._trainer
@@ -41,58 +42,11 @@ def custom_metric(
@pytest.mark.skipif(sys.platform == "darwin", reason="do not run on mac os")
def test_custom_metric():
from flaml import AutoML
import pandas as pd
import requests
train_data = {
"sentence1": [
'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
"Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .",
"They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .",
"Around 0335 GMT , Tab shares were up 19 cents , or 4.4 % , at A $ 4.56 , having earlier set a record high of A $ 4.57 .",
],
"sentence2": [
'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
"Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 .",
"On June 10 , the ship 's owners had published an advertisement on the Internet , offering the explosives for sale .",
"Tab shares jumped 20 cents , or 4.6 % , to set a record closing high at A $ 4.57 .",
],
"label": [1, 0, 1, 0],
"idx": [0, 1, 2, 3],
}
train_dataset = pd.DataFrame(train_data)
dev_data = {
"sentence1": [
"The stock rose $ 2.11 , or about 11 percent , to close Friday at $ 21.51 on the New York Stock Exchange .",
"Revenue in the first quarter of the year dropped 15 percent from the same period a year earlier .",
"The Nasdaq had a weekly gain of 17.27 , or 1.2 percent , closing at 1,520.15 on Friday .",
"The DVD-CCA then appealed to the state Supreme Court .",
],
"sentence2": [
"PG & E Corp. shares jumped $ 1.63 or 8 percent to $ 21.03 on the New York Stock Exchange on Friday .",
"With the scandal hanging over Stewart 's company , revenue the first quarter of the year dropped 15 percent from the same period a year earlier .",
"The tech-laced Nasdaq Composite .IXIC rallied 30.46 points , or 2.04 percent , to 1,520.15 .",
"The DVD CCA appealed that decision to the U.S. Supreme Court .",
],
"label": [1, 1, 0, 1],
"idx": [4, 5, 6, 7],
}
dev_dataset = pd.DataFrame(dev_data)
custom_sent_keys = ["sentence1", "sentence2"]
label_key = "label"
X_train = train_dataset[custom_sent_keys]
y_train = train_dataset[label_key]
X_val = dev_dataset[custom_sent_keys]
y_val = dev_dataset[label_key]
X_train, y_train, X_val, y_val, X_test = get_toy_data_seqclassification()
automl = AutoML()
# testing when max_iter=1 and do retrain only without hpo
try:
import ray
@@ -101,22 +55,9 @@ def test_custom_metric():
except ImportError:
return
automl_settings = {
"gpu_per_trial": 0,
"max_iter": 1,
"time_budget": 5,
"task": "seq-classification",
"metric": custom_metric,
"log_file_name": "seqclass.log",
"use_ray": {"local_dir": "data/outut/"},
}
automl_settings["hf_args"] = {
"model_path": "google/electra-small-discriminator",
"output_dir": "data/output/",
"ckpt_per_epoch": 1,
"fp16": False,
}
automl_settings = get_automl_settings()
automl_settings["metric"] = custom_metric
automl_settings["use_ray"] = {"local_dir": "data/output/"}
try:
automl.fit(

View File

@@ -1,54 +1,18 @@
import sys
import pytest
from utils import get_toy_data_seqclassification, get_automl_settings
@pytest.mark.skipif(sys.platform == "darwin", reason="do not run on mac os")
def test_cv():
from flaml import AutoML
import pandas as pd
import requests
train_data = {
"sentence1": [
'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
"Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .",
"They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .",
"Around 0335 GMT , Tab shares were up 19 cents , or 4.4 % , at A $ 4.56 , having earlier set a record high of A $ 4.57 .",
],
"sentence2": [
'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
"Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 .",
"On June 10 , the ship 's owners had published an advertisement on the Internet , offering the explosives for sale .",
"Tab shares jumped 20 cents , or 4.6 % , to set a record closing high at A $ 4.57 .",
],
"label": [1, 0, 1, 0],
"idx": [0, 1, 2, 3],
}
train_dataset = pd.DataFrame(train_data)
custom_sent_keys = ["sentence1", "sentence2"]
label_key = "label"
X_train = train_dataset[custom_sent_keys]
y_train = train_dataset[label_key]
X_train, y_train, X_val, y_val, X_test = get_toy_data_seqclassification()
automl = AutoML()
automl_settings = {
"gpu_per_trial": 0,
"max_iter": 3,
"time_budget": 5,
"task": "seq-classification",
"metric": "accuracy",
"n_splits": 3,
}
automl_settings["hf_args"] = {
"model_path": "google/electra-small-discriminator",
"output_dir": "test/data/output/",
"ckpt_per_epoch": 1,
"fp16": False,
}
automl_settings = get_automl_settings()
automl_settings["n_splits"] = 3
try:
automl.fit(X_train=X_train, y_train=y_train, **automl_settings)

View File

@@ -1,227 +1,26 @@
import sys
import pytest
from utils import get_toy_data_multiplechoiceclassification, get_automl_settings
@pytest.mark.skipif(sys.platform == "darwin", reason="do not run on mac os")
def test_mcc():
from flaml import AutoML
import requests
import pandas as pd
train_data = {
"video-id": [
"anetv_fruimvo90vA",
"anetv_fruimvo90vA",
"anetv_fruimvo90vA",
"anetv_MldEr60j33M",
"lsmdc0049_Hannah_and_her_sisters-69438",
],
"fold-ind": ["10030", "10030", "10030", "5488", "17405"],
"startphrase": [
"A woman is seen running down a long track and jumping into a pit. The camera",
"A woman is seen running down a long track and jumping into a pit. The camera",
"A woman is seen running down a long track and jumping into a pit. The camera",
"A man in a white shirt bends over and picks up a large weight. He",
"Someone furiously shakes someone away. He",
],
"sent1": [
"A woman is seen running down a long track and jumping into a pit.",
"A woman is seen running down a long track and jumping into a pit.",
"A woman is seen running down a long track and jumping into a pit.",
"A man in a white shirt bends over and picks up a large weight.",
"Someone furiously shakes someone away.",
],
"sent2": ["The camera", "The camera", "The camera", "He", "He"],
"gold-source": ["gen", "gen", "gold", "gen", "gold"],
"ending0": [
"captures her as well as lifting weights down in place.",
"follows her spinning her body around and ends by walking down a lane.",
"watches her as she walks away and sticks her tongue out to another person.",
"lifts the weights over his head.",
"runs to a woman standing waiting.",
],
"ending1": [
"pans up to show another woman running down the track.",
"pans around the two.",
"captures her as well as lifting weights down in place.",
"also lifts it onto his chest before hanging it back out again.",
"tackles him into the passenger seat.",
],
"ending2": [
"follows her movements as the group members follow her instructions.",
"captures her as well as lifting weights down in place.",
"follows her spinning her body around and ends by walking down a lane.",
"spins around and lifts a barbell onto the floor.",
"pounds his fist against a cupboard.",
],
"ending3": [
"follows her spinning her body around and ends by walking down a lane.",
"follows her movements as the group members follow her instructions.",
"pans around the two.",
"bends down and lifts the weight over his head.",
"offers someone the cup on his elbow and strides out.",
],
"label": [1, 3, 0, 0, 2],
}
dev_data = {
"video-id": [
"lsmdc3001_21_JUMP_STREET-422",
"lsmdc0001_American_Beauty-45991",
"lsmdc0001_American_Beauty-45991",
"lsmdc0001_American_Beauty-45991",
],
"fold-ind": ["11783", "10977", "10970", "10968"],
"startphrase": [
"Firing wildly he shoots holes through the tanker. He",
"He puts his spatula down. The Mercedes",
"He stands and looks around, his eyes finally landing on: "
"The digicam and a stack of cassettes on a shelf. Someone",
"He starts going through someone's bureau. He opens the drawer "
"in which we know someone keeps his marijuana, but he",
],
"sent1": [
"Firing wildly he shoots holes through the tanker.",
"He puts his spatula down.",
"He stands and looks around, his eyes finally landing on: "
"The digicam and a stack of cassettes on a shelf.",
"He starts going through someone's bureau.",
],
"sent2": [
"He",
"The Mercedes",
"Someone",
"He opens the drawer in which we know someone keeps his marijuana, but he",
],
"gold-source": ["gold", "gold", "gold", "gold"],
"ending0": [
"overtakes the rig and falls off his bike.",
"fly open and drinks.",
"looks at someone's papers.",
"stops one down and rubs a piece of the gift out.",
],
"ending1": [
"squeezes relentlessly on the peanut jelly as well.",
"walks off followed driveway again.",
"feels around it and falls in the seat once more.",
"cuts the mangled parts.",
],
"ending2": [
"scrambles behind himself and comes in other directions.",
"slots them into a separate green.",
"sprints back from the wreck and drops onto his back.",
"hides it under his hat to watch.",
],
"ending3": [
"sweeps a explodes and knocks someone off.",
"pulls around to the drive - thru window.",
"sits at the kitchen table, staring off into space.",
"does n't discover its false bottom.",
],
"label": [0, 3, 3, 3],
}
test_data = {
"video-id": [
"lsmdc0001_American_Beauty-45991",
"lsmdc0001_American_Beauty-45991",
"lsmdc0001_American_Beauty-45991",
"lsmdc0001_American_Beauty-45991",
],
"fold-ind": ["10980", "10976", "10978", "10969"],
"startphrase": [
"Someone leans out of the drive - thru window, "
"grinning at her, holding bags filled with fast food. The Counter Girl",
"Someone looks up suddenly when he hears. He",
"Someone drives; someone sits beside her. They",
"He opens the drawer in which we know someone "
"keeps his marijuana, but he does n't discover"
" its false bottom. He stands and looks around, his eyes",
],
"sent1": [
"Someone leans out of the drive - thru "
"window, grinning at her, holding bags filled with fast food.",
"Someone looks up suddenly when he hears.",
"Someone drives; someone sits beside her.",
"He opens the drawer in which we know"
" someone keeps his marijuana, but he does n't discover its false bottom.",
],
"sent2": [
"The Counter Girl",
"He",
"They",
"He stands and looks around, his eyes",
],
"gold-source": ["gold", "gold", "gold", "gold"],
"ending0": [
"stands next to him, staring blankly.",
"puts his spatula down.",
"rise someone's feet up.",
"moving to the side, the houses rapidly stained.",
],
"ending1": [
"with auditorium, filmed, singers the club.",
"bumps into a revolver and drops surreptitiously into his weapon.",
"lift her and they are alarmed.",
"focused as the sight of someone making his way down a trail.",
],
"ending2": [
"attempts to block her ransacked.",
"talks using the phone and walks away for a few seconds.",
"are too involved with each other to "
"notice someone watching them from the drive - thru window.",
"finally landing on: the digicam and a stack of cassettes on a shelf.",
],
"ending3": [
"is eating solid and stinky.",
"bundles the flaxen powder beneath the car.",
"sit at a table with a beer from a table.",
"deep and continuing, its bleed - length sideburns pressing on him.",
],
"label": [0, 0, 2, 2],
}
train_dataset = pd.DataFrame(train_data)
dev_dataset = pd.DataFrame(dev_data)
test_dataset = pd.DataFrame(test_data)
custom_sent_keys = [
"sent1",
"sent2",
"ending0",
"ending1",
"ending2",
"ending3",
"gold-source",
"video-id",
"startphrase",
"fold-ind",
]
label_key = "label"
X_train = train_dataset[custom_sent_keys]
y_train = train_dataset[label_key]
X_val = dev_dataset[custom_sent_keys]
y_val = dev_dataset[label_key]
X_test = test_dataset[custom_sent_keys]
X_true = test_dataset[label_key]
(
X_train,
y_train,
X_val,
y_val,
X_test,
y_test,
) = get_toy_data_multiplechoiceclassification()
automl = AutoML()
automl_settings = {
"gpu_per_trial": 0,
"max_iter": 2,
"time_budget": 5,
"task": "multichoice-classification",
"metric": "accuracy",
"log_file_name": "seqclass.log",
}
automl_settings["hf_args"] = {
"model_path": "google/electra-small-discriminator",
"output_dir": "test/data/output/",
"ckpt_per_epoch": 1,
"fp16": False,
}
automl_settings = get_automl_settings()
automl_settings["task"] = "multichoice-classification"
automl_settings["metric"] = "accuracy"
try:
automl.fit(
@@ -238,10 +37,10 @@ def test_mcc():
proba = automl.predict_proba(X_test)
print(str(len(automl.classes_)) + " classes")
print(y_pred)
print(X_true)
print(y_test)
print(proba)
true_count = 0
for i, v in X_true.items():
for i, v in y_test.items():
if y_pred[i] == v:
true_count += 1
accuracy = round(true_count / len(y_pred), 5)

View File

@@ -1,5 +1,6 @@
import sys
import pytest
from utils import get_toy_data_seqregression, get_automl_settings
@pytest.mark.skipif(sys.platform == "darwin", reason="do not run on mac os")
@@ -12,71 +13,16 @@ def test_regression():
except ImportError:
return
from flaml import AutoML
import pandas as pd
train_data = {
"sentence1": [
"A plane is taking off.",
"A man is playing a large flute.",
"A man is spreading shreded cheese on a pizza.",
"Three men are playing chess.",
],
"sentence2": [
"An air plane is taking off.",
"A man is playing a flute.",
"A man is spreading shredded cheese on an uncooked pizza.",
"Two men are playing chess.",
],
"label": [5.0, 3.799999952316284, 3.799999952316284, 2.5999999046325684],
"idx": [0, 1, 2, 3],
}
train_dataset = pd.DataFrame(train_data)
dev_data = {
"sentence1": [
"A man is playing the cello.",
"Some men are fighting.",
"A man is smoking.",
"The man is playing the piano.",
],
"sentence2": [
"A man seated is playing the cello.",
"Two men are fighting.",
"A man is skating.",
"The man is playing the guitar.",
],
"label": [4.25, 4.25, 0.5, 1.600000023841858],
"idx": [4, 5, 6, 7],
}
dev_dataset = pd.DataFrame(dev_data)
custom_sent_keys = ["sentence1", "sentence2"]
label_key = "label"
X_train = train_dataset[custom_sent_keys]
y_train = train_dataset[label_key]
X_val = dev_dataset[custom_sent_keys]
y_val = dev_dataset[label_key]
X_train, y_train, X_val, y_val = get_toy_data_seqregression()
automl = AutoML()
automl_settings = get_automl_settings()
automl_settings = {
"gpu_per_trial": 0,
"max_iter": 2,
"time_budget": 5,
"task": "seq-regression",
"metric": "pearsonr",
"starting_points": {"transformer": {"num_train_epochs": 1}},
"use_ray": {"local_dir": "data/outut/"},
}
automl_settings["hf_args"] = {
"model_path": "google/electra-small-discriminator",
"output_dir": "test/data/output/",
"ckpt_per_epoch": 1,
"fp16": False,
}
automl_settings["task"] = "seq-regression"
automl_settings["metric"] = "pearsonr"
automl_settings["starting_points"] = {"transformer": {"num_train_epochs": 1}}
automl_settings["use_ray"] = {"local_dir": "data/outut/"}
ray.shutdown()
ray.init()

View File

@@ -1,69 +1,24 @@
import sys
import pytest
import requests
from utils import get_toy_data_summarization, get_automl_settings
@pytest.mark.skipif(sys.platform == "darwin", reason="do not run on mac os")
def test_summarization():
from flaml import AutoML
from pandas import DataFrame
train_dataset = DataFrame(
[
("The cat is alive", "The cat is dead"),
("The cat is alive", "The cat is dead"),
("The cat is alive", "The cat is dead"),
("The cat is alive", "The cat is dead"),
]
)
dev_dataset = DataFrame(
[
("The old woman is beautiful", "The old woman is ugly"),
("The old woman is beautiful", "The old woman is ugly"),
("The old woman is beautiful", "The old woman is ugly"),
("The old woman is beautiful", "The old woman is ugly"),
]
)
test_dataset = DataFrame(
[
("The purse is cheap", "The purse is expensive"),
("The purse is cheap", "The purse is expensive"),
("The purse is cheap", "The purse is expensive"),
("The purse is cheap", "The purse is expensive"),
]
)
for each_dataset in [train_dataset, dev_dataset, test_dataset]:
each_dataset.columns = ["document", "summary"]
custom_sent_keys = ["document"]
label_key = "summary"
X_train = train_dataset[custom_sent_keys]
y_train = train_dataset[label_key]
X_val = dev_dataset[custom_sent_keys]
y_val = dev_dataset[label_key]
X_test = test_dataset[custom_sent_keys]
X_train, y_train, X_val, y_val, X_test = get_toy_data_summarization()
automl = AutoML()
automl_settings = get_automl_settings()
automl_settings = {
"gpu_per_trial": 0,
"max_iter": 3,
"time_budget": 20,
"task": "summarization",
"metric": "rouge1",
"log_file_name": "seqclass.log",
}
automl_settings["hf_args"] = {
"model_path": "patrickvonplaten/t5-tiny-random",
"output_dir": "test/data/output/",
"ckpt_per_epoch": 1,
"fp16": False,
}
automl_settings["task"] = "summarization"
automl_settings["metric"] = "rouge1"
automl_settings["time_budget"] = 2 * automl_settings["time_budget"]
automl_settings["fit_kwargs_by_estimator"]["transformer"][
"model_path"
] = "patrickvonplaten/t5-tiny-random"
try:
automl.fit(
@@ -75,7 +30,11 @@ def test_summarization():
)
except requests.exceptions.HTTPError:
return
automl = AutoML()
automl_settings.pop("max_iter", None)
automl_settings.pop("use_ray", None)
automl_settings.pop("estimator_list", None)
automl.retrain_from_log(
X_train=X_train,
y_train=y_train,

View File

@@ -1,737 +1,19 @@
import sys
import pytest
import requests
from utils import get_toy_data_tokenclassification, get_automl_settings
@pytest.mark.skipif(sys.platform == "darwin", reason="do not run on mac os")
def test_tokenclassification():
from flaml import AutoML
import pandas as pd
train_data = {
"chunk_tags": [
[11, 21, 11, 12, 21, 22, 11, 12, 0],
[11, 12],
[11, 12],
[
11,
12,
12,
21,
13,
11,
11,
21,
13,
11,
12,
13,
11,
21,
22,
11,
12,
17,
11,
21,
17,
11,
12,
12,
21,
22,
22,
13,
11,
0,
],
],
"id": ["0", "1", "2", "3"],
"ner_tags": [
[3, 0, 7, 0, 0, 0, 7, 0, 0],
[1, 2],
[5, 0],
[
0,
3,
4,
0,
0,
0,
0,
0,
0,
7,
0,
0,
0,
0,
0,
7,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
],
],
"pos_tags": [
[22, 42, 16, 21, 35, 37, 16, 21, 7],
[22, 22],
[22, 11],
[
12,
22,
22,
38,
15,
22,
28,
38,
15,
16,
21,
35,
24,
35,
37,
16,
21,
15,
24,
41,
15,
16,
21,
21,
20,
37,
40,
35,
21,
7,
],
],
"tokens": [
[
"EU",
"rejects",
"German",
"call",
"to",
"boycott",
"British",
"lamb",
".",
],
["Peter", "Blackburn"],
["BRUSSELS", "1996-08-22"],
[
"The",
"European",
"Commission",
"said",
"on",
"Thursday",
"it",
"disagreed",
"with",
"German",
"advice",
"to",
"consumers",
"to",
"shun",
"British",
"lamb",
"until",
"scientists",
"determine",
"whether",
"mad",
"cow",
"disease",
"can",
"be",
"transmitted",
"to",
"sheep",
".",
],
],
}
dev_data = {
"chunk_tags": [
[
11,
11,
12,
13,
11,
12,
12,
11,
12,
12,
12,
12,
21,
13,
11,
12,
21,
22,
11,
13,
11,
1,
13,
11,
17,
11,
12,
12,
21,
1,
0,
],
[
0,
11,
21,
22,
22,
11,
12,
12,
17,
11,
21,
22,
22,
11,
12,
13,
11,
0,
0,
11,
12,
11,
12,
12,
12,
12,
12,
12,
21,
11,
12,
12,
0,
],
[
11,
21,
11,
12,
12,
21,
22,
0,
17,
11,
21,
22,
17,
11,
21,
22,
11,
21,
22,
22,
13,
11,
12,
12,
0,
],
[
11,
21,
11,
12,
11,
12,
13,
11,
12,
12,
12,
12,
21,
22,
11,
12,
0,
11,
0,
11,
12,
13,
11,
12,
12,
12,
12,
12,
21,
11,
12,
1,
2,
2,
11,
21,
22,
11,
12,
0,
],
],
"id": ["4", "5", "6", "7"],
"ner_tags": [
[
5,
0,
0,
0,
0,
3,
4,
0,
0,
0,
1,
2,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
5,
0,
0,
0,
0,
0,
0,
0,
],
[
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
3,
0,
0,
0,
1,
2,
2,
2,
0,
0,
0,
0,
0,
],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0],
[
0,
0,
0,
0,
0,
0,
0,
3,
0,
0,
1,
2,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
],
],
"pos_tags": [
[
22,
27,
21,
35,
12,
22,
22,
27,
16,
21,
22,
22,
38,
15,
22,
24,
20,
37,
21,
15,
24,
16,
15,
22,
15,
12,
16,
21,
38,
17,
7,
],
[
0,
28,
41,
30,
37,
12,
16,
21,
15,
28,
41,
30,
37,
12,
24,
15,
28,
6,
0,
12,
22,
27,
16,
21,
22,
22,
14,
22,
38,
12,
21,
21,
7,
],
[
28,
38,
16,
16,
21,
38,
40,
10,
15,
28,
38,
40,
15,
21,
38,
40,
28,
20,
37,
40,
15,
12,
22,
22,
7,
],
[
28,
38,
12,
21,
16,
21,
15,
22,
22,
22,
22,
22,
35,
37,
21,
24,
6,
24,
10,
16,
24,
15,
12,
21,
10,
21,
21,
24,
38,
12,
30,
16,
10,
16,
21,
35,
37,
16,
21,
7,
],
],
"tokens": [
[
"Germany",
"'s",
"representative",
"to",
"the",
"European",
"Union",
"'s",
"veterinary",
"committee",
"Werner",
"Zwingmann",
"said",
"on",
"Wednesday",
"consumers",
"should",
"buy",
"sheepmeat",
"from",
"countries",
"other",
"than",
"Britain",
"until",
"the",
"scientific",
"advice",
"was",
"clearer",
".",
],
[
'"',
"We",
"do",
"n't",
"support",
"any",
"such",
"recommendation",
"because",
"we",
"do",
"n't",
"see",
"any",
"grounds",
"for",
"it",
",",
'"',
"the",
"Commission",
"'s",
"chief",
"spokesman",
"Nikolaus",
"van",
"der",
"Pas",
"told",
"a",
"news",
"briefing",
".",
],
[
"He",
"said",
"further",
"scientific",
"study",
"was",
"required",
"and",
"if",
"it",
"was",
"found",
"that",
"action",
"was",
"needed",
"it",
"should",
"be",
"taken",
"by",
"the",
"European",
"Union",
".",
],
[
"He",
"said",
"a",
"proposal",
"last",
"month",
"by",
"EU",
"Farm",
"Commissioner",
"Franz",
"Fischler",
"to",
"ban",
"sheep",
"brains",
",",
"spleens",
"and",
"spinal",
"cords",
"from",
"the",
"human",
"and",
"animal",
"food",
"chains",
"was",
"a",
"highly",
"specific",
"and",
"precautionary",
"move",
"to",
"protect",
"human",
"health",
".",
],
],
}
train_dataset = pd.DataFrame(train_data)
dev_dataset = pd.DataFrame(dev_data)
custom_sent_keys = ["tokens"]
label_key = "ner_tags"
X_train = train_dataset[custom_sent_keys]
y_train = train_dataset[label_key]
X_val = dev_dataset[custom_sent_keys]
y_val = dev_dataset[label_key]
X_train, y_train, X_val, y_val = get_toy_data_tokenclassification()
automl = AutoML()
automl_settings = {
"gpu_per_trial": 0,
"max_iter": 2,
"time_budget": 5,
"task": "token-classification",
"metric": "seqeval",
}
automl_settings["hf_args"] = {
"model_path": "bert-base-uncased",
"output_dir": "test/data/output/",
"ckpt_per_epoch": 1,
"fp16": False,
}
automl_settings = get_automl_settings()
automl_settings["task"] = "token-classification"
automl_settings["metric"] = "seqeval"
try:
automl.fit(

View File

@@ -0,0 +1,36 @@
import sys
import pytest
from utils import get_toy_data_seqclassification, get_automl_settings
@pytest.mark.skipif(sys.platform == "darwin", reason="do not run on mac os")
def test_custom_hp_nlp():
from flaml import AutoML
import flaml
X_train, y_train, X_val, y_val, X_test = get_toy_data_seqclassification()
automl = AutoML()
automl_settings = get_automl_settings()
automl_settings["custom_hp"] = None
automl_settings["custom_hp"] = {
"transformer": {
"model_path": {
"domain": flaml.tune.choice(["google/electra-small-discriminator"]),
},
"num_train_epochs": {"domain": 3},
}
}
automl_settings["fit_kwargs_by_estimator"] = {
"transformer": {
"output_dir": "test/data/output/",
"ckpt_per_epoch": 1,
"fp16": False,
}
}
automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
if __name__ == "__main__":
test_custom_hp_nlp()

155
test/nlp/test_default.py Normal file
View File

@@ -0,0 +1,155 @@
from utils import get_toy_data_seqclassification, get_automl_settings
import sys
from flaml.default import portfolio
def pop_args(fit_kwargs):
fit_kwargs.pop("max_iter", None)
fit_kwargs.pop("use_ray", None)
fit_kwargs.pop("estimator_list", None)
fit_kwargs.pop("time_budget", None)
fit_kwargs.pop("log_file_name", None)
def test_build_portfolio(path="./test/nlp/default", strategy="greedy"):
sys.argv = f"portfolio.py --output {path} --input {path} --metafeatures {path}/all/metafeatures.csv --task seq-classification --estimator transformer_ms --strategy {strategy}".split()
portfolio.main()
def test_starting_point_not_in_search_space():
from flaml import AutoML
"""
test starting_points located outside of the search space, and custom_hp is not set
"""
this_estimator_name = "transformer"
X_train, y_train, X_val, y_val, _ = get_toy_data_seqclassification()
automl = AutoML()
automl_settings = get_automl_settings(estimator_name=this_estimator_name)
automl_settings["starting_points"] = {
this_estimator_name: [{"learning_rate": 2e-3}]
}
automl.fit(X_train, y_train, **automl_settings)
assert (
automl._search_states[this_estimator_name].init_config["learning_rate"] != 2e-3
)
"""
test starting_points located outside of the search space, and custom_hp is set
"""
from flaml import tune
X_train, y_train, X_val, y_val, _ = get_toy_data_seqclassification()
this_estimator_name = "transformer_ms"
automl = AutoML()
automl_settings = get_automl_settings(estimator_name=this_estimator_name)
automl_settings["custom_hp"] = {
this_estimator_name: {
"model_path": {
"domain": "albert-base-v2",
},
"learning_rate": {
"domain": tune.choice([1e-4, 1e-5]),
},
}
}
automl_settings["starting_points"] = "data:test/nlp/default/"
del automl_settings["fit_kwargs_by_estimator"][this_estimator_name]["model_path"]
automl.fit(X_train, y_train, **automl_settings)
assert (
len(automl._search_states[this_estimator_name].init_config) == 0
) # check that init config is not updated, but search space is updated
assert (
automl._search_states[this_estimator_name].search_space["model_path"]
== "albert-base-v2"
)
def test_points_to_evaluate():
from flaml import AutoML
X_train, y_train, X_val, y_val, _ = get_toy_data_seqclassification()
automl = AutoML()
automl_settings = get_automl_settings(estimator_name="transformer_ms")
automl_settings["estimator_list"] = ["transformer_ms"]
automl_settings["starting_points"] = "data"
del automl_settings["fit_kwargs_by_estimator"]["transformer_ms"]["model_path"]
automl.fit(X_train, y_train, **automl_settings)
# TODO: implement _test_zero_shot_model
def test_zero_shot_nomodel():
from flaml.default import preprocess_and_suggest_hyperparams
estimator_name = "transformer_ms"
location = "test/nlp/default"
X_train, y_train, X_val, y_val, X_test = get_toy_data_seqclassification()
automl_settings = get_automl_settings(estimator_name)
del automl_settings["fit_kwargs_by_estimator"][estimator_name]["model_path"]
(
hyperparams,
estimator_class,
X_train,
y_train,
_,
_,
) = preprocess_and_suggest_hyperparams(
"seq-classification", X_train, y_train, estimator_name, location=location
)
model = estimator_class(
**hyperparams
) # estimator_class is TransformersEstimatorModelSelection
fit_kwargs = automl_settings.pop("fit_kwargs_by_estimator", {}).get(estimator_name)
fit_kwargs.update(automl_settings)
pop_args(fit_kwargs)
model.fit(X_train, y_train, **fit_kwargs)
def test_build_error_portfolio(path="./test/nlp/default", strategy="greedy"):
import os
os.remove("./test/nlp/default/transformer_ms/seq-classification.json")
sys.argv = f"portfolio.py --output {path} --input {path} --metafeatures {path}/all/metafeatures_err.csv --task seq-classification --estimator transformer_ms --strategy {strategy}".split()
portfolio.main()
from flaml.default import preprocess_and_suggest_hyperparams
estimator_name = "transformer_ms"
location = "test/nlp/default"
X_train, y_train, X_val, y_val, X_test = get_toy_data_seqclassification()
automl_settings = get_automl_settings(estimator_name)
del automl_settings["fit_kwargs_by_estimator"][estimator_name]["model_path"]
try:
(
hyperparams,
estimator_class,
X_train,
y_train,
_,
_,
) = preprocess_and_suggest_hyperparams(
"seq-classification", X_train, y_train, estimator_name, location=location
)
except ValueError:
print("Feature not implemented")

1141
test/nlp/utils.py Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -81,11 +81,13 @@ def _test_hf_data():
"use_ray": True,
}
automl_settings["hf_args"] = {
"model_path": "facebook/muppet-roberta-base",
"output_dir": "test/data/output/",
"ckpt_per_epoch": 5,
"fp16": True,
automl_settings["fit_kwargs_by_estimator"] = {
"transformer": {
"model_path": "facebook/muppet-roberta-base",
"output_dir": "test/data/output/",
"ckpt_per_epoch": 5,
"fp16": True,
}
}
automl.fit(

View File

@@ -26,7 +26,12 @@ automl = AutoML()
automl_settings = {
"time_budget": 100,
"task": "seq-classification",
"hf_args": {"output_dir": "data/output/"}, # setting the huggingface arguments: output directory
"fit_kwargs_by_estimator": {
"transformer":
{
"output_dir": "data/output/" # if model_path is not set, the default model is facebook/muppet-roberta-base: https://huggingface.co/facebook/muppet-roberta-base
}
}, # setting the huggingface arguments: output directory
"gpu_per_trial": 1, # set to 0 if no GPU is available
}
automl.fit(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings)
@@ -77,11 +82,13 @@ automl_settings = {
"task": "seq-regression",
"metric": "rmse",
}
automl_settings["hf_args"] = { # setting the huggingface arguments
"model_path": "google/electra-small-discriminator", # setting the language model
"output_dir": "data/output/", # setting the output directory
"ckpt_per_epoch": 5, # setting the number of checkpoints per epoch
"fp16": False, # setting whether to use FP16
automl_settings["fit_kwargs_by_estimator"] = { # setting the huggingface arguments
"transformer": {
"model_path": "google/electra-small-discriminator", # if model_path is not set, the default model is facebook/muppet-roberta-base: https://huggingface.co/facebook/muppet-roberta-base
"output_dir": "data/output/", # setting the output directory
"ckpt_per_epoch": 5, # setting the number of checkpoints per epoch
"fp16": False,
} # setting whether to use FP16
}
automl.fit(
X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings
@@ -127,11 +134,13 @@ automl_settings = {
"task": "summarization",
"metric": "rouge1",
}
automl_settings["hf_args"] = { # setting the huggingface arguments
"model_path": "t5-small", # setting the language model
"output_dir": "data/output/", # setting the output directory
"ckpt_per_epoch": 5, # setting the number of checkpoints per epoch
"fp16": False, # setting whether to use FP16
automl_settings["fit_kwargs_by_estimator"] = { # setting the huggingface arguments
"transformer": {
"model_path": "t5-small", # if model_path is not set, the default model is t5-small: https://huggingface.co/t5-small
"output_dir": "data/output/", # setting the output directory
"ckpt_per_epoch": 5, # setting the number of checkpoints per epoch
"fp16": False,
} # setting whether to use FP16
}
automl.fit(
X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings