Merge pull request #1 from microsoft/main

update
This commit is contained in:
Z.sk
2022-01-11 08:32:50 +08:00
committed by GitHub
52 changed files with 3030 additions and 811 deletions

CITATION.cff (new file)

@@ -0,0 +1,18 @@
preferred-citation:
type: inproceedings
authors:
- family-names: "Wang"
given-names: "Chi"
affiliation: "Microsoft Research, Redmond WA USA"
- family-names: "Wu"
given-names: "Qingyun"
affiliation: "Microsoft Research, Redmond WA USA"
- family-names: "Weimer"
given-names: "Markus"
affiliation: "Microsoft Corporation, Redmond WA USA"
- family-names: "Zhu"
given-names: "Eric"
affiliation: "Microsoft Research, Redmond WA USA"
booktitle: "Proceedings of the 4th MLSys Conference"
title: "FLAML: A Fast and Lightweight AutoML Library"
year: 2021


@@ -1,6 +1,6 @@
NOTICES
This repository incorporates material as listed below or described in the code.
#
## Component. Ray.
@@ -11,7 +11,7 @@ https://github.com/ray-project/ray/blob/master/python/ray/tune/
## Open Source License/Copyright Notice.
Apache License
Version 2.0, January 2004


@@ -14,7 +14,7 @@ Instead, please report them to the Microsoft Security Response Center (MSRC) at
If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc).
You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc).
Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:


@@ -17,9 +17,9 @@
# -- Project information -----------------------------------------------------
project = 'FLAML'
copyright = '2020-2021, FLAML Team'
author = 'FLAML Team'
project = "FLAML"
copyright = "2020-2021, FLAML Team"
author = "FLAML Team"
# -- General configuration ---------------------------------------------------
@@ -28,23 +28,23 @@ author = 'FLAML Team'
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.napoleon',
'sphinx.ext.doctest',
'sphinx.ext.coverage',
'sphinx.ext.mathjax',
'sphinx.ext.viewcode',
'sphinx.ext.githubpages',
'sphinx_rtd_theme',
"sphinx.ext.autodoc",
"sphinx.ext.napoleon",
"sphinx.ext.doctest",
"sphinx.ext.coverage",
"sphinx.ext.mathjax",
"sphinx.ext.viewcode",
"sphinx.ext.githubpages",
"sphinx_rtd_theme",
]
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
templates_path = ["_templates"]
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
# -- Options for HTML output -------------------------------------------------
@@ -52,9 +52,9 @@ exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'
html_theme = "sphinx_rtd_theme"
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
html_static_path = ["_static"]


@@ -40,10 +40,12 @@ from .config import (
from .data import (
concat,
CLASSIFICATION,
TOKENCLASSIFICATION,
TS_FORECAST,
FORECAST,
REGRESSION,
_is_nlp_task,
NLG_TASKS,
)
from . import tune
from .training_log import training_log_reader, training_log_writer
@@ -71,7 +73,9 @@ class SearchState:
self.total_time_used - self.time_best_found,
)
def __init__(self, learner_class, data_size, task, starting_point=None):
def __init__(
self, learner_class, data_size, task, starting_point=None, period=None
):
self.init_eci = learner_class.cost_relative2lgbm()
self._search_space_domain = {}
self.init_config = {}
@@ -80,7 +84,12 @@ class SearchState:
self.data_size = data_size
self.ls_ever_converged = False
self.learner_class = learner_class
search_space = learner_class.search_space(data_size=data_size, task=task)
if task == TS_FORECAST:
search_space = learner_class.search_space(
data_size=data_size, task=task, pred_horizon=period
)
else:
search_space = learner_class.search_space(data_size=data_size, task=task)
for name, space in search_space.items():
assert (
"domain" in space
@@ -319,7 +328,10 @@ class AutoMLState:
if self.time_budget is None
else self.time_budget - self.time_from_start
)
if self.resources_per_trial.get("gpu", 0) > 0:
if (
hasattr(self, "resources_per_trial")
and self.resources_per_trial.get("gpu", 0) > 0
):
def _trainable_function_wrapper(config: dict):
@@ -352,7 +364,6 @@ class AutoMLState:
)
result = list(analysis.results.values())[0]
estimator, train_time = result["estimator"], result["train_time"]
else:
if _is_nlp_task(self.task):
use_ray = self.fit_kwargs.get("use_ray")
@@ -369,10 +380,10 @@ class AutoMLState:
fit_kwargs=self.fit_kwargs,
)
if _is_nlp_task(self.task):
if use_ray:
self.fit_kwargs["use_ray"] = use_ray
else:
if use_ray is None:
del self.fit_kwargs["use_ray"]
else:
self.fit_kwargs["use_ray"] = use_ray
if sampled_weight is not None:
self.fit_kwargs["sample_weight"] = weight
return estimator, train_time
@@ -431,10 +442,8 @@ class AutoML(BaseEstimator):
):
return metric_to_minimize, metrics_to_log
```
which returns a float number as the minimization objective,
and a dictionary as the metrics to log. E.g.,
```python
def custom_metric(
X_val, y_val, estimator, labels,
@@ -459,14 +468,13 @@ class AutoML(BaseEstimator):
```
task: A string of the task type, e.g.,
'classification', 'regression', 'ts_forecast', 'rank',
'seq-classification', 'seq-regression'.
'seq-classification', 'seq-regression', 'summarization'.
n_jobs: An integer of the number of threads for training.
gpu_per_trial: A float of the number of gpus per trial, only used by TransformersEstimator.
log_file_name: A string of the log file name. To disable logging,
set it to be an empty string "".
estimator_list: A list of strings for estimator names, or 'auto'
e.g., ```['lgbm', 'xgboost', 'xgb_limitdepth', 'catboost', 'rf', 'extra_tree']```
time_budget: A float number of the time budget in seconds.
Use -1 if no time limit.
max_iter: An integer of the maximal number of iterations.
@@ -500,12 +508,13 @@ class AutoML(BaseEstimator):
'budget' - do best effort to retrain without violating the time
budget.
split_type: str or splitter object, default="auto" | the data split type.
A valid splitter object is an instance of a derived class of scikit-learn KFold
(https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold)
* A valid splitter object is an instance of a derived class of scikit-learn
[KFold](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold)
and have ``split`` and ``get_n_splits`` methods with the same signatures.
Valid str options depend on different tasks.
For classification tasks, valid choices are [
"auto", 'stratified', 'uniform', 'time']. "auto" -> stratified.
Set eval_method to "cv" to use the splitter object.
* Valid str options depend on different tasks.
For classification tasks, valid choices are
["auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified.
For regression tasks, valid choices are ["auto", 'uniform', 'time'].
"auto" -> uniform.
For ts_forecast tasks, must be "auto" or 'time'.
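E.g., a minimal sketch of passing a splitter object (illustrative; assumes a classification dataset and `eval_method="cv"`, as the docstring requires for splitter objects):
```python
from sklearn.model_selection import KFold
from flaml import AutoML

automl = AutoML()
automl.fit(
    X_train,
    y_train,
    task="classification",
    eval_method="cv",  # required when passing a splitter object
    split_type=KFold(n_splits=5, shuffle=True, random_state=0),
    time_budget=60,
)
```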
@@ -528,7 +537,6 @@ class AutoML(BaseEstimator):
`automl` object and use them in the `new_automl` object.
e.g.,
```python
from flaml import AutoML
automl = AutoML()
@@ -540,7 +548,7 @@ class AutoML(BaseEstimator):
new_automl.fit(X_train, y_train, starting_points=starting_points)
```
seed: int or None, default=None | The random seed for np.random.
seed: int or None, default=None | The random seed for hpo.
n_concurrent_trials: [Experimental] int, default=1 | The number of
concurrent trials. For n_concurrent_trials > 1, installation of
ray is required: `pip install flaml[ray]`.
@@ -699,6 +707,10 @@ class AutoML(BaseEstimator):
return attr.classes_.tolist()
return None
@property
def n_features_in_(self):
return self._trained_estimator.n_features_in_
@property
def time_to_find_best_model(self) -> float:
"""Time taken to find best model in seconds."""
@@ -740,7 +752,11 @@ class AutoML(BaseEstimator):
return None
X_test = self._preprocess(X_test)
y_pred = estimator.predict(X_test)
if y_pred.ndim > 1 and isinstance(y_pred, np.ndarray):
if (
isinstance(y_pred, np.ndarray)
and y_pred.ndim > 1
):
y_pred = y_pred.flatten()
if self._label_transformer:
return self._label_transformer.inverse_transform(
@@ -799,6 +815,42 @@ class AutoML(BaseEstimator):
X = self._transformer.transform(X)
return X
def _validate_ts_data(
self,
dataframe,
y_train_all=None,
):
assert (
dataframe[dataframe.columns[0]].dtype.name == "datetime64[ns]"
), f"For '{TS_FORECAST}' task, the first column must contain timestamp values."
if y_train_all is not None:
y_df = (
pd.DataFrame(y_train_all)
if isinstance(y_train_all, pd.Series)
else pd.DataFrame(y_train_all, columns=["labels"])
)
dataframe = dataframe.join(y_df)
duplicates = dataframe.duplicated()
if any(duplicates):
logger.warning(
"Duplicate timestamp values found in timestamp column. "
f"\n{dataframe.loc[duplicates, dataframe][dataframe.columns[0]]}"
)
dataframe = dataframe.drop_duplicates()
logger.warning("Removed duplicate rows based on all columns")
assert (
not dataframe[dataframe.columns[0]].duplicated().any()
), "Duplicate timestamp values with different values for other columns."
ts_series = pd.to_datetime(dataframe[dataframe.columns[0]])
inferred_freq = pd.infer_freq(ts_series)
if inferred_freq is None:
logger.warning(
"Missing timestamps detected. To avoid error with estimators, set estimator list to ['prophet']. "
)
if y_train_all is not None:
return dataframe.iloc[:, :-1], dataframe.iloc[:, -1]
return dataframe
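For reference, a minimal sketch of input that passes these checks (illustrative values; first column of dtype datetime64[ns], unique timestamps, regular frequency):
```python
import pandas as pd

df = pd.DataFrame(
    {
        "ds": pd.date_range("2021-01-01", periods=6, freq="D"),  # datetime64[ns]
        "y": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
    }
)
assert df["ds"].dtype.name == "datetime64[ns]"
assert pd.infer_freq(df["ds"]) == "D"  # a missing timestamp would make infer_freq return None
```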
def _validate_data(
self,
X_train_all,
@@ -837,9 +889,9 @@ class AutoML(BaseEstimator):
self._nrow, self._ndim = X_train_all.shape
if self._state.task == TS_FORECAST:
X_train_all = pd.DataFrame(X_train_all)
assert (
X_train_all[X_train_all.columns[0]].dtype.name == "datetime64[ns]"
), f"For '{TS_FORECAST}' task, the first column must contain timestamp values."
X_train_all, y_train_all = self._validate_ts_data(
X_train_all, y_train_all
)
X, y = X_train_all, y_train_all
elif dataframe is not None and label is not None:
assert isinstance(
@@ -848,9 +900,7 @@ class AutoML(BaseEstimator):
assert label in dataframe.columns, "label must be a column name in dataframe"
self._df = True
if self._state.task == TS_FORECAST:
assert (
dataframe[dataframe.columns[0]].dtype.name == "datetime64[ns]"
), f"For '{TS_FORECAST}' task, the first column must contain timestamp values."
dataframe = self._validate_ts_data(dataframe)
X = dataframe.drop(columns=label)
self._nrow, self._ndim = X.shape
y = dataframe[label]
@@ -859,6 +909,8 @@ class AutoML(BaseEstimator):
# check the validity of input dimensions under the nlp mode
if _is_nlp_task(self._state.task):
from .nlp.utils import is_a_list_of_str
is_all_str = True
is_all_list = True
for column in X.columns:
@@ -867,17 +919,25 @@ class AutoML(BaseEstimator):
"string",
), "If the task is an NLP task, X can only contain text columns"
for each_cell in X[column]:
if each_cell:
if each_cell is not None:
is_str = isinstance(each_cell, str)
is_list_of_int = isinstance(each_cell, list) and all(
isinstance(x, int) for x in each_cell
)
assert is_str or is_list_of_int, (
"Each column of the input must either be str (untokenized) "
"or a list of integers (tokenized)"
)
is_list_of_str = is_a_list_of_str(each_cell)
if self._state.task == TOKENCLASSIFICATION:
assert is_list_of_str, (
"For the token-classification task, the input column needs to be a list of strings "
"instead of a single string, e.g., ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']. "
"For more examples, please refer to test/nlp/test_autohf_tokenclassification.py"
)
else:
assert is_str or is_list_of_int, (
"Each column of the input must either be str (untokenized) "
"or a list of integers (tokenized)"
)
is_all_str &= is_str
is_all_list &= is_list_of_int
is_all_list &= is_list_of_int or is_list_of_str
assert is_all_str or is_all_list, (
"Currently FLAML only supports two modes for NLP: either all columns of X are string (non-tokenized), "
"or all columns of X are integer ids (tokenized)"
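For illustration, the two accepted input modes (values are made up):
```python
import pandas as pd

# mode 1: every column holds untokenized text
X_str = pd.DataFrame({"sentence": ["hello world", "flaml is fast"]}).astype("string")

# mode 2: every column holds lists of integer token ids (already tokenized)
X_ids = pd.DataFrame({"input_ids": [[101, 7592, 102], [101, 2003, 102]]})
```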
@@ -920,6 +980,8 @@ class AutoML(BaseEstimator):
self._state.X_val = self._transformer.transform(X_val)
else:
self._state.X_val = X_val
# If it's NLG_TASKS, y_val is a pandas series containing the output sequence tokens,
# so we cannot use label_transformer.transform to process it
if self._label_transformer:
self._state.y_val = self._label_transformer.transform(y_val)
else:
@@ -954,6 +1016,7 @@ class AutoML(BaseEstimator):
and self._auto_augment
and self._state.fit_kwargs.get("sample_weight") is None
and self._split_type in ["stratified", "uniform"]
and self._state.task != TOKENCLASSIFICATION
):
# logger.info(f"label {pd.unique(y_train_all)}")
label_set, counts = np.unique(y_train_all, return_counts=True)
@@ -1273,18 +1336,19 @@ class AutoML(BaseEstimator):
time_budget: A float number of the time budget in seconds.
task: A string of the task type, e.g.,
'classification', 'regression', 'ts_forecast', 'rank',
'seq-classification', 'seq-regression'.
'seq-classification', 'seq-regression', 'summarization'.
eval_method: A string of resampling strategy, one of
['auto', 'cv', 'holdout'].
split_ratio: A float of the validation data percentage for holdout.
n_splits: An integer of the number of folds for cross-validation.
split_type: str or splitter object, default="auto" | the data split type.
A valid splitter object is an instance of a derived class of scikit-learn KFold
(https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold)
* A valid splitter object is an instance of a derived class of scikit-learn
[KFold](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold)
and have ``split`` and ``get_n_splits`` methods with the same signatures.
Valid str options depend on different tasks.
For classification tasks, valid choices are [
"auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified.
Set eval_method to "cv" to use the splitter object.
* Valid str options depend on different tasks.
For classification tasks, valid choices are
["auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified.
For regression tasks, valid choices are ["auto", 'uniform', 'time'].
"auto" -> uniform.
For ts_forecast tasks, must be "auto" or 'time'.
@@ -1431,6 +1495,9 @@ class AutoML(BaseEstimator):
), "groups must be specified for ranking task."
assert split_type in ["auto", "group"]
self._split_type = "group"
elif self._state.task in NLG_TASKS:
assert split_type in ["auto", "uniform", "time", "group"]
self._split_type = split_type if split_type != "auto" else "uniform"
def _decide_eval_method(self, time_budget):
if self._state.X_val is not None:
@@ -1704,7 +1771,6 @@ class AutoML(BaseEstimator):
'mape'. Default is 'auto'.
If passing a customized metric function, the function needs to
have the following signature:
```python
def custom_metric(
X_test, y_test, estimator, labels,
@@ -1713,36 +1779,33 @@ class AutoML(BaseEstimator):
):
return metric_to_minimize, metrics_to_log
```
which returns a float number as the minimization objective,
and a dictionary as the metrics to log. E.g.,
```python
def custom_metric(
X_val, y_val, estimator, labels,
X_train, y_train, weight_val=None, weight_train=None,
**args,
):
from sklearn.metrics import log_loss
import time
start = time.time()
y_pred = estimator.predict_proba(X_val)
pred_time = (time.time() - start) / len(X_val)
val_loss = log_loss(y_val, y_pred, labels=labels, sample_weight=weight_val)
y_pred = estimator.predict_proba(X_train)
train_loss = log_loss(y_train, y_pred, labels=labels, sample_weight=weight_train)
alpha = 0.5
return val_loss * (1 + alpha) - alpha * train_loss, {
"val_loss": val_loss,
"train_loss": train_loss,
"pred_time": pred_time,
}
```
task: A string of the task type, e.g.,
'classification', 'regression', 'ts_forecast', 'rank',
'seq-classification', 'seq-regression'.
'seq-classification', 'seq-regression', 'summarization'.
n_jobs: An integer of the number of threads for training.
gpu_per_trial: A float of the number of gpus per trial, only used by TransformersEstimator.
log_file_name: A string of the log file name. To disable logging,
@@ -1795,12 +1858,13 @@ class AutoML(BaseEstimator):
'budget' - do best effort to retrain without violating the time
budget.
split_type: str or splitter object, default="auto" | the data split type.
A valid splitter object is an instance of a derived class of scikit-learn KFold
(https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold)
* A valid splitter object is an instance of a derived class of scikit-learn
[KFold](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold)
and have ``split`` and ``get_n_splits`` methods with the same signatures.
Valid str options depend on different tasks.
For classification tasks, valid choices are [
"auto", 'stratified', 'uniform', 'time']. "auto" -> stratified.
Set eval_method to "cv" to use the splitter object.
* Valid str options depend on different tasks.
For classification tasks, valid choices are
["auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified.
For regression tasks, valid choices are ["auto", 'uniform', 'time'].
"auto" -> uniform.
For ts_forecast tasks, must be "auto" or 'time'.
@@ -1834,7 +1898,7 @@ class AutoML(BaseEstimator):
new_automl.fit(X_train, y_train, starting_points=starting_points)
```
seed: int or None, default=None | The random seed for np.random.
seed: int or None, default=None | The random seed for hpo.
n_concurrent_trials: [Experimental] int, default=1 | The number of
concurrent trials. For n_concurrent_trials > 1, installation of
ray is required: `pip install flaml[ray]`.
@@ -1938,13 +2002,10 @@ class AutoML(BaseEstimator):
)
self._search_states = {} # key: estimator name; value: SearchState
self._random = np.random.RandomState(RANDOM_SEED)
if seed is not None:
np.random.seed(seed)
self._seed = seed + 19823 if seed is not None else 20
self._seed = seed if seed is not None else 20
self._learner_selector = learner_selector
old_level = logger.getEffectiveLevel()
self.verbose = verbose
# if verbose == 0:
logger.setLevel(50 - verbose * 10)
if (not mlflow or not mlflow.active_run()) and not logger.handlers:
# Add the console handler.
@@ -1965,11 +2026,18 @@ class AutoML(BaseEstimator):
self._use_ray = use_ray or n_concurrent_trials > 1
# use the following condition if we have an estimation of average_trial_time and average_trial_overhead
# self._use_ray = use_ray or n_concurrent_trials > (average_trial_time + average_trial_overhead) / (average_trial_time)
self._state.resources_per_trial = (
{"cpu": int(os.cpu_count() / n_concurrent_trials), "gpu": gpu_per_trial}
if n_jobs < 0
else {"cpu": n_jobs, "gpu": gpu_per_trial}
)
if self._use_ray:
import ray
n_cpus = use_ray and ray.available_resources()["CPU"] or os.cpu_count()
self._state.resources_per_trial = (
# when using gpu, default cpu is 1 per job; otherwise, default cpu is n_cpus / n_concurrent_trials
{"cpu": max(int(n_cpus / n_concurrent_trials), 1), "gpu": gpu_per_trial}
if gpu_per_trial == 0
else {"cpu": 1, "gpu": gpu_per_trial}
if n_jobs < 0
else {"cpu": n_jobs, "gpu": gpu_per_trial}
)
self._retrain_in_budget = retrain_full == "budget" and (
eval_method == "holdout" and self._state.X_val is None
)
@@ -2052,14 +2120,7 @@ class AutoML(BaseEstimator):
logger.info(f"Minimizing error metric: {error_metric}")
if "auto" == estimator_list:
if self._state.task == TS_FORECAST:
try:
import prophet
estimator_list = ["prophet", "arima", "sarimax"]
except ImportError:
estimator_list = ["arima", "sarimax"]
elif self._state.task == "rank":
if self._state.task == "rank":
estimator_list = ["lgbm", "xgboost", "xgb_limitdepth"]
elif _is_nlp_task(self._state.task):
estimator_list = ["transformer"]
@@ -2083,8 +2144,18 @@ class AutoML(BaseEstimator):
"extra_tree",
"xgb_limitdepth",
]
if "regression" != self._state.task:
if TS_FORECAST == self._state.task:
# catboost is removed because it has a `name` parameter, making it incompatible with hcrystalball
estimator_list.remove("catboost")
try:
import prophet
estimator_list += ["prophet", "arima", "sarimax"]
except ImportError:
estimator_list += ["arima", "sarimax"]
elif "regression" != self._state.task:
estimator_list += ["lrl1"]
for estimator_name in estimator_list:
if estimator_name not in self._state.learner_classes:
self.add_learner(
@@ -2100,6 +2171,7 @@ class AutoML(BaseEstimator):
data_size=self._state.data_size,
task=self._state.task,
starting_point=starting_points.get(estimator_name),
period=self._state.fit_kwargs.get("period"),
)
logger.info("List of ML learners in AutoML Run: {}".format(estimator_list))
self.estimator_list = estimator_list
@@ -2157,7 +2229,6 @@ class AutoML(BaseEstimator):
del self._state.y_train, self._state.y_train_all, self._state.y_val
del self._sample_weight_full, self._state.fit_kwargs
del self._state.groups, self._state.groups_all, self._state.groups_val
# if verbose == 0:
logger.setLevel(old_level)
def _search_parallel(self):
@@ -2244,7 +2315,7 @@ class AutoML(BaseEstimator):
trial
for trial in analysis.trials
if trial.last_result
and trial.last_result["wall_clock_time"] is not None
and trial.last_result.get("wall_clock_time") is not None
),
key=lambda x: x.last_result["wall_clock_time"],
)
@@ -2256,8 +2327,9 @@ class AutoML(BaseEstimator):
estimator = config.get("ml", config)["learner"]
search_state = self._search_states[estimator]
search_state.update(result, 0)
if result["wall_clock_time"] is not None:
self._state.time_from_start = result["wall_clock_time"]
wall_time = result.get("wall_clock_time")
if wall_time is not None:
self._state.time_from_start = wall_time
if search_state.sample_size == self._state.data_size[0]:
self._iter_per_learner[estimator] += 1
if not self._fullsize_reached:
@@ -2275,17 +2347,34 @@ class AutoML(BaseEstimator):
self._time_taken_best_iter = self._state.time_from_start
better = True
self._search_states[estimator].best_config = config
if (better or self._log_type == "all") and self._training_log:
self._training_log.append(
self._iter_per_learner[estimator],
search_state.metric_for_logging,
search_state.trial_time,
self._state.time_from_start,
search_state.val_loss,
config,
estimator,
search_state.sample_size,
)
if better or self._log_type == "all":
self._log_trial(search_state, estimator)
def _log_trial(self, search_state, estimator):
if self._training_log:
self._training_log.append(
self._iter_per_learner[estimator],
search_state.metric_for_logging,
search_state.trial_time,
self._state.time_from_start,
search_state.val_loss,
search_state.config,
estimator,
search_state.sample_size,
)
if mlflow is not None and mlflow.active_run():
with mlflow.start_run(nested=True):
mlflow.log_metric("iter_counter", self._iter_per_learner[estimator])
mlflow.log_param("metric_for_logging", search_state.metric_for_logging)
mlflow.log_metric("trial_time", search_state.trial_time)
mlflow.log_metric("wall_clock_time", self._state.time_from_start)
mlflow.log_metric("validation_loss", search_state.val_loss)
mlflow.log_param("config", search_state.config)
mlflow.log_param("learner", estimator)
mlflow.log_param("sample_size", search_state.sample_size)
mlflow.log_metric("best_validation_loss", search_state.best_loss)
mlflow.log_param("best_config", search_state.best_config)
mlflow.log_param("best_learner", self._best_estimator)
def _search_sequential(self):
try:
@@ -2458,8 +2547,9 @@ class AutoML(BaseEstimator):
f"Estimated sufficient time budget={max_budget:.0f}s."
f" Estimated necessary time budget={min_budget:.0f}s."
)
if result["wall_clock_time"] is not None:
self._state.time_from_start = result["wall_clock_time"]
wall_time = result.get("wall_clock_time")
if wall_time is not None:
self._state.time_from_start = wall_time
# logger.info(f"{self._search_states[estimator].sample_size}, {data_size}")
if search_state.sample_size == self._state.data_size[0]:
self._iter_per_learner[estimator] += 1
@@ -2497,38 +2587,8 @@ class AutoML(BaseEstimator):
):
search_state.trained_estimator.cleanup()
if better or self._log_type == "all":
if self._training_log:
self._training_log.append(
self._iter_per_learner[estimator],
search_state.metric_for_logging,
search_state.trial_time,
self._state.time_from_start,
search_state.val_loss,
search_state.config,
estimator,
search_state.sample_size,
)
if mlflow is not None and mlflow.active_run():
with mlflow.start_run(nested=True):
mlflow.log_metric(
"iter_counter", self._iter_per_learner[estimator]
)
mlflow.log_param(
"metric_for_logging", search_state.metric_for_logging
)
mlflow.log_metric("trial_time", search_state.trial_time)
mlflow.log_metric(
"wall_clock_time", self._state.time_from_start
)
mlflow.log_metric("validation_loss", search_state.val_loss)
mlflow.log_param("config", search_state.config)
mlflow.log_param("learner", estimator)
mlflow.log_param("sample_size", search_state.sample_size)
mlflow.log_metric(
"best_validation_loss", search_state.best_loss
)
mlflow.log_param("best_config", search_state.best_config)
mlflow.log_param("best_learner", self._best_estimator)
self._log_trial(search_state, estimator)
logger.info(
" at {:.1f}s,\testimator {}'s best error={:.4f},\tbest estimator {}'s best error={:.4f}".format(
self._state.time_from_start,
@@ -2637,6 +2697,7 @@ class AutoML(BaseEstimator):
)
if self._trained_estimator:
logger.info(f"selected model: {self._trained_estimator.model}")
estimators = []
if self._ensemble and self._state.task in (
"binary",
"multi",
@@ -2670,8 +2731,7 @@ class AutoML(BaseEstimator):
if x[1].best_loss < 4 * self._selected.best_loss
]
logger.info(estimators)
if len(estimators) <= 1:
return
if len(estimators) > 1:
if self._state.task in CLASSIFICATION:
from sklearn.ensemble import StackingClassifier as Stacker
else:
@@ -2729,6 +2789,7 @@ class AutoML(BaseEstimator):
if (
self._state.task == TS_FORECAST
or self._trained_estimator is None
or self._trained_estimator.model is None
or (
self._state.time_budget - self._state.time_from_start
> self._selected.est_retrain_time(self.data_size_full)
@@ -2755,8 +2816,6 @@ class AutoML(BaseEstimator):
logger.info(f"retrained model: {self._trained_estimator.model}")
else:
logger.info("not retraining because the time budget is too small.")
if self.model and mlflow is not None and mlflow.active_run():
mlflow.sklearn.log_model(self.model, "best_model")
def __del__(self):
if (


@@ -1,7 +1,7 @@
'''!
"""!
* Copyright (c) 2020-2021 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License.
'''
"""
N_SPLITS = 5
RANDOM_SEED = 1


@@ -14,7 +14,16 @@ from typing import Dict, Union, List
# TODO: if your task is not specified in here, define your task as an all-capitalized word
SEQCLASSIFICATION = "seq-classification"
CLASSIFICATION = ("binary", "multi", "classification", SEQCLASSIFICATION)
MULTICHOICECLASSIFICATION = "multichoice-classification"
TOKENCLASSIFICATION = "token-classification"
CLASSIFICATION = (
"binary",
"multi",
"classification",
SEQCLASSIFICATION,
MULTICHOICECLASSIFICATION,
TOKENCLASSIFICATION,
)
SEQREGRESSION = "seq-regression"
REGRESSION = ("regression", SEQREGRESSION)
TS_FORECAST = "ts_forecast"
@@ -26,11 +35,13 @@ NLG_TASKS = (SUMMARIZATION,)
NLU_TASKS = (
SEQREGRESSION,
SEQCLASSIFICATION,
MULTICHOICECLASSIFICATION,
TOKENCLASSIFICATION,
)
def _is_nlp_task(task):
if task in NLU_TASKS + NLG_TASKS:
if task in NLU_TASKS or task in NLG_TASKS:
return True
else:
return False
@@ -346,8 +357,11 @@ class DataTransformer:
datetime_columns,
)
self._drop = drop
if task in CLASSIFICATION or not pd.api.types.is_numeric_dtype(y):
if (
(task in CLASSIFICATION or not pd.api.types.is_numeric_dtype(y))
and task not in NLG_TASKS
and task != TOKENCLASSIFICATION
):
from sklearn.preprocessing import LabelEncoder
self.label_transformer = LabelEncoder()


@@ -20,13 +20,18 @@ from sklearn.metrics import (
from sklearn.model_selection import RepeatedStratifiedKFold, GroupKFold, TimeSeriesSplit
from .model import (
XGBoostSklearnEstimator,
XGBoost_TS_Regressor,
XGBoostLimitDepthEstimator,
XGBoostLimitDepth_TS_Regressor,
RandomForestEstimator,
RF_TS_Regressor,
LGBMEstimator,
LGBM_TS_Regressor,
LRL1Classifier,
LRL2Classifier,
CatBoostEstimator,
ExtraTreesEstimator,
ExtraTrees_TS_Regressor,
KNeighborsEstimator,
Prophet,
ARIMA,
@@ -83,18 +88,19 @@ huggingface_metric_to_mode = {
"ter": "min",
"wer": "min",
}
huggingface_submetric_to_metric = {"rouge1": "rouge", "rouge2": "rouge"}
def get_estimator_class(task, estimator_name):
# when adding a new learner, need to add an elif branch
if "xgboost" == estimator_name:
estimator_class = XGBoostSklearnEstimator
estimator_class = XGBoost_TS_Regressor if TS_FORECAST == task else XGBoostSklearnEstimator
elif "xgb_limitdepth" == estimator_name:
estimator_class = XGBoostLimitDepthEstimator
estimator_class = XGBoostLimitDepth_TS_Regressor if TS_FORECAST == task else XGBoostLimitDepthEstimator
elif "rf" == estimator_name:
estimator_class = RandomForestEstimator
estimator_class = RF_TS_Regressor if TS_FORECAST == task else RandomForestEstimator
elif "lgbm" == estimator_name:
estimator_class = LGBMEstimator
estimator_class = LGBM_TS_Regressor if TS_FORECAST == task else LGBMEstimator
elif "lrl1" == estimator_name:
estimator_class = LRL1Classifier
elif "lrl2" == estimator_name:
@@ -102,7 +108,7 @@ def get_estimator_class(task, estimator_name):
elif "catboost" == estimator_name:
estimator_class = CatBoostEstimator
elif "extra_tree" == estimator_name:
estimator_class = ExtraTreesEstimator
estimator_class = ExtraTrees_TS_Regressor if TS_FORECAST == task else ExtraTreesEstimator
elif "kneighbor" == estimator_name:
estimator_class = KNeighborsEstimator
elif "prophet" in estimator_name:
@@ -153,11 +159,31 @@ def metric_loss_score(
try:
import datasets
metric = datasets.load_metric(metric_name)
metric_mode = huggingface_metric_to_mode[metric_name]
score = metric.compute(predictions=y_predict, references=y_true)[
metric_name
]
datasets_metric_name = huggingface_submetric_to_metric.get(
metric_name, metric_name
)
metric = datasets.load_metric(datasets_metric_name)
metric_mode = huggingface_metric_to_mode[datasets_metric_name]
if "rouge" in metric_name:
score = metric.compute(predictions=y_predict, references=y_true)[
metric_name
].mid.fmeasure
elif metric_name == "seqeval":
y_true = [
[x for x in each_y_true if x != -100] for each_y_true in y_true
]
y_pred = [
y_predict[each_idx][: len(y_true[each_idx])]
for each_idx in range(len(y_predict))
]
score = metric.compute(predictions=y_pred, references=y_true)[
"overall_accuracy"
]
else:
score = metric.compute(predictions=y_predict, references=y_true)[
metric_name
]
except ImportError:
raise Exception(
metric_name
@@ -215,6 +241,7 @@ def sklearn_metric_loss_score(
Returns:
score: A float number of the loss, the lower the better.
"""
metric_name = metric_name.lower()
if "r2" == metric_name:
@@ -419,10 +446,6 @@ def evaluate_model_CV(
groups = kf.groups
kf = kf.split(X_train_split, y_train_split, groups)
shuffle = False
elif isinstance(kf, TimeSeriesSplit) and task == TS_FORECAST:
y_train_all = pd.DataFrame(y_train_all, columns=[TS_VALUE_COL])
train = X_train_all.join(y_train_all)
kf = kf.split(train)
elif isinstance(kf, TimeSeriesSplit):
kf = kf.split(X_train_split, y_train_split)
else:


@@ -25,6 +25,10 @@ from .data import (
TS_VALUE_COL,
SEQCLASSIFICATION,
SEQREGRESSION,
TOKENCLASSIFICATION,
SUMMARIZATION,
NLG_TASKS,
MULTICHOICECLASSIFICATION,
)
import pandas as pd
@@ -113,7 +117,7 @@ class BaseEstimator:
@property
def n_features_in_(self):
return self.model.n_features_in_
return self._model.n_features_in_
@property
def model(self):
@@ -227,9 +231,8 @@ class BaseEstimator:
Each element at (i,j) is the probability for instance i to be in
class j.
"""
assert (
self._task in CLASSIFICATION
), "predict_prob() only for classification task."
assert self._task in CLASSIFICATION, "predict_proba() only for classification."
X_test = self._preprocess(X_test)
return self._model.predict_proba(X_test)
@@ -300,9 +303,16 @@ class TransformersEstimator(BaseEstimator):
import uuid
self.trial_id = str(uuid.uuid1().hex)[:8]
if task in NLG_TASKS:
from transformers import Seq2SeqTrainingArguments as TrainingArguments
else:
from transformers import TrainingArguments
self._TrainingArguments = TrainingArguments
def _join(self, X_train, y_train):
y_train = DataFrame(y_train, columns=["label"], index=X_train.index)
@staticmethod
def _join(X_train, y_train):
y_train = DataFrame(y_train, index=X_train.index)
y_train.columns = ["label"]
train_df = X_train.join(y_train)
return train_df
@@ -335,12 +345,16 @@ class TransformersEstimator(BaseEstimator):
"seed": {"domain": tune.choice(list(range(40, 45))), "init_value": 42},
"global_max_steps": {"domain": sys.maxsize, "init_value": sys.maxsize},
}
# TODO: if self._task == SUMMARIZATION, uncomment the code below, SET the search space for
# "num_beams" in search_space_dict using
# search_space_dict["num_beams"] = {...}
# if task in NLG_TASKS:
# search_space_dict["num_beams"] = {"domain": tune.choice(...)}
if task in NLG_TASKS:
search_space_dict["generation_num_beams"] = {
"domain": tune.randint(2, 5),
"init_value": 3,
}
search_space_dict["generation_max_length"] = {
"domain": tune.choice([16, 32, 64, 128]),
"init_value": 64,
}
return search_space_dict
@@ -357,25 +371,34 @@ class TransformersEstimator(BaseEstimator):
setattr(custom_hpo_args, key, val)
self.custom_hpo_args = custom_hpo_args
def _preprocess(self, X, task, **kwargs):
from .nlp.utils import tokenize_text
def _preprocess(self, X, y=None, **kwargs):
from .nlp.utils import tokenize_text, is_a_list_of_str
if X.dtypes[0] == "string":
return tokenize_text(X, task, self.custom_hpo_args)
is_str = str(X.dtypes[0]) in ("string", "str")
is_list_of_str = is_a_list_of_str(X[list(X.keys())[0]].to_list()[0])
if is_str or is_list_of_str:
return tokenize_text(
X=X, Y=y, task=self._task, custom_hpo_args=self.custom_hpo_args
)
else:
return X
return X, None
def _model_init(self, num_labels, per_model_config):
from .nlp.utils import load_model
return load_model(
checkpoint_path=self.custom_hpo_args.model_path,
task=self._task,
num_labels=num_labels,
per_model_config=per_model_config,
)
def fit(self, X_train: DataFrame, y_train: Series, budget=None, **kwargs):
from transformers import EarlyStoppingCallback
from transformers.trainer_utils import set_seed
from transformers import AutoTokenizer
# TODO: if self._task == SUMMARIZATION, uncomment the code below (add indentation before
# from transformers import TrainingArguments)
# if self._task in NLG_TASKS:
# from transformers import Seq2SeqTrainingArguments as TrainingArguments
# else:
from transformers import TrainingArguments
from transformers.data import DataCollatorWithPadding
import transformers
from datasets import Dataset
@@ -395,6 +418,7 @@ class TransformersEstimator(BaseEstimator):
# from .nlp.huggingface.trainer import Seq2SeqTrainerForAuto as TrainerForAuto
# else:
from .nlp.huggingface.trainer import TrainerForAuto
from .nlp.huggingface.data_collator import DataCollatorForAuto
this_params = self.params
@@ -429,18 +453,26 @@ class TransformersEstimator(BaseEstimator):
control.should_save = True
control.should_evaluate = True
set_seed(self.params.get("seed", TrainingArguments.seed))
set_seed(self.params.get("seed", self._TrainingArguments.seed))
self._init_hpo_args(kwargs)
self._metric_name = kwargs["metric"]
if hasattr(self, "use_ray") is False:
self.use_ray = kwargs["use_ray"]
self._metric = kwargs["metric"]
self.use_ray = kwargs.get("use_ray")
X_val = kwargs.get("X_val")
y_val = kwargs.get("y_val")
X_train = self._preprocess(X_train, self._task, **kwargs)
train_dataset = Dataset.from_pandas(self._join(X_train, y_train))
if (self._task not in NLG_TASKS) and (self._task != TOKENCLASSIFICATION):
self._X_train, _ = self._preprocess(X=X_train, **kwargs)
self._y_train = y_train
else:
self._X_train, self._y_train = self._preprocess(
X=X_train, y=y_train, **kwargs
)
train_dataset = Dataset.from_pandas(
TransformersEstimator._join(self._X_train, self._y_train)
)
# TODO: set a breakpoint here, observe the resulting train_dataset,
# compare it with the output of the tokenized results in your transformer example
@@ -449,33 +481,36 @@ class TransformersEstimator(BaseEstimator):
# make sure they are the same
if X_val is not None:
X_val = self._preprocess(X_val, self._task, **kwargs)
eval_dataset = Dataset.from_pandas(self._join(X_val, y_val))
if (self._task not in NLG_TASKS) and (self._task != TOKENCLASSIFICATION):
self._X_val, _ = self._preprocess(X=X_val, **kwargs)
self._y_val = y_val
else:
self._X_val, self._y_val = self._preprocess(X=X_val, y=y_val, **kwargs)
eval_dataset = Dataset.from_pandas(
TransformersEstimator._join(self._X_val, self._y_val)
)
else:
eval_dataset = None
tokenizer = AutoTokenizer.from_pretrained(
self.custom_hpo_args.model_path, use_fast=True
)
self._tokenizer = tokenizer
num_labels = get_num_labels(self._task, y_train)
num_labels = get_num_labels(self._task, self._y_train)
training_args_config, per_model_config = separate_config(self.params)
this_model = load_model(
checkpoint_path=self.custom_hpo_args.model_path,
task=self._task,
num_labels=num_labels,
per_model_config=per_model_config,
training_args_config, per_model_config = separate_config(
self.params, self._task
)
ckpt_freq = compute_checkpoint_freq(
train_data_size=len(X_train),
train_data_size=len(self._X_train),
custom_hpo_args=self.custom_hpo_args,
num_train_epochs=training_args_config.get(
"num_train_epochs", TrainingArguments.num_train_epochs
"num_train_epochs", self._TrainingArguments.num_train_epochs
),
batch_size=training_args_config.get(
"per_device_train_batch_size",
TrainingArguments.per_device_train_batch_size,
self._TrainingArguments.per_device_train_batch_size,
),
)
@@ -492,7 +527,7 @@ class TransformersEstimator(BaseEstimator):
trial_dir = ray.tune.get_trial_dir()
if transformers.__version__.startswith("3"):
training_args = TrainingArguments(
training_args = self._TrainingArguments(
report_to=[],
output_dir=trial_dir,
do_train=True,
@@ -508,7 +543,7 @@ class TransformersEstimator(BaseEstimator):
else:
from transformers import IntervalStrategy
training_args = TrainingArguments(
training_args = self._TrainingArguments(
report_to=[],
output_dir=trial_dir,
do_train=True,
@@ -523,36 +558,43 @@ class TransformersEstimator(BaseEstimator):
**training_args_config,
)
def _model_init():
return load_model(
checkpoint_path=self.custom_hpo_args.model_path,
task=self._task,
num_labels=num_labels,
per_model_config=per_model_config,
)
self._model = TrainerForAuto(
model=this_model,
self._trainer = TrainerForAuto(
args=training_args,
model_init=_model_init,
model_init=partial(self._model_init, num_labels, per_model_config),
train_dataset=train_dataset,
eval_dataset=eval_dataset,
tokenizer=tokenizer,
data_collator=DataCollatorForAuto(
tokenizer=tokenizer,
pad_to_multiple_of=8 if training_args.fp16 else None,
)
if self._task == MULTICHOICECLASSIFICATION
else None,
compute_metrics=self._compute_metrics_by_dataset_name,
callbacks=[EarlyStoppingCallbackForAuto],
)
setattr(self._model, "_use_ray", self.use_ray)
self._model.train()
setattr(self._trainer, "_use_ray", self.use_ray)
if self._task in NLG_TASKS:
setattr(self._trainer, "_is_seq2seq", True)
self._trainer.train()
self.params[self.ITER_HP] = self._model.state.global_step
self._checkpoint_path = self._select_checkpoint(self._model)
self.params[self.ITER_HP] = self._trainer.state.global_step
self._checkpoint_path = self._select_checkpoint(self._trainer)
self._kwargs = kwargs
self._num_labels = num_labels
self._per_model_config = per_model_config
self._training_args_config = training_args_config
self._ckpt_remains = list(self._model.ckpt_to_metric.keys())
self._ckpt_remains = list(self._trainer.ckpt_to_metric.keys())
self._model = load_model(
checkpoint_path=self._checkpoint_path,
task=self._task,
num_labels=self._num_labels,
per_model_config=self._per_model_config,
)
self._trainer = None
def _delete_one_ckpt(self, ckpt_location):
if self.use_ray is False:
@@ -572,7 +614,7 @@ class TransformersEstimator(BaseEstimator):
if trainer.ckpt_to_metric:
best_ckpt, _ = min(
trainer.ckpt_to_metric.items(), key=lambda x: x[1]["val_loss"]
trainer.ckpt_to_metric.items(), key=lambda x: x[1]["eval_loss"]
)
best_ckpt_global_step = trainer.ckpt_to_global_step[best_ckpt]
for each_ckpt in list(trainer.ckpt_to_metric):
@@ -592,77 +634,113 @@ class TransformersEstimator(BaseEstimator):
return best_ckpt
def _compute_metrics_by_dataset_name(self, eval_pred):
from .ml import metric_loss_score
if isinstance(self._metric, str):
from .ml import metric_loss_score
from .nlp.utils import postprocess_text
predictions, labels = eval_pred
predictions = (
np.squeeze(predictions)
if self._task == SEQREGRESSION
else np.argmax(predictions, axis=1)
)
return {
"val_loss": metric_loss_score(
metric_name=self._metric_name, y_predict=predictions, y_true=labels
predictions, labels = eval_pred
if self._task in NLG_TASKS:
if isinstance(predictions, tuple):
predictions = np.argmax(predictions[0], axis=2)
decoded_preds = self._tokenizer.batch_decode(
predictions, skip_special_tokens=True
)
labels = np.where(labels != -100, labels, self._tokenizer.pad_token_id)
decoded_labels = self._tokenizer.batch_decode(
labels, skip_special_tokens=True
)
predictions, labels = postprocess_text(decoded_preds, decoded_labels)
else:
predictions = (
np.squeeze(predictions)
if self._task == SEQREGRESSION
else np.argmax(predictions, axis=2)
if self._task == TOKENCLASSIFICATION
else np.argmax(predictions, axis=1)
)
return {
"val_loss": metric_loss_score(
metric_name=self._metric, y_predict=predictions, y_true=labels
)
}
else:
agg_metric, metric_dict = self._metric(
X_test=self._X_val,
y_test=self._y_val,
estimator=self,
labels=None,
X_train=self._X_train,
y_train=self._y_train,
)
}
return metric_dict
def predict_proba(self, X_test):
def _init_model_for_predict(self, X_test):
from datasets import Dataset
from transformers import AutoTokenizer
from .nlp.huggingface.trainer import TrainerForAuto
from transformers import TrainingArguments
from .nlp.utils import load_model
from .nlp.huggingface.data_collator import DataCollatorForPredict
assert (
self._task in CLASSIFICATION
), "predict_proba is only available in classification tasks"
X_test = self._preprocess(X_test, self._task, **self._kwargs)
X_test, _ = self._preprocess(X_test, **self._kwargs)
test_dataset = Dataset.from_pandas(X_test)
best_model = load_model(
checkpoint_path=self._checkpoint_path,
task=self._task,
num_labels=self._num_labels,
per_model_config=self._per_model_config,
)
training_args = TrainingArguments(
training_args = self._TrainingArguments(
per_device_eval_batch_size=1,
output_dir=self.custom_hpo_args.output_dir,
**self._training_args_config,
)
self._model = TrainerForAuto(model=best_model, args=training_args)
predictions = self._model.predict(test_dataset)
tokenizer = AutoTokenizer.from_pretrained(
self.custom_hpo_args.model_path, use_fast=True
)
self._trainer = TrainerForAuto(
model=self._model,
args=training_args,
data_collator=DataCollatorForPredict(
tokenizer=tokenizer,
pad_to_multiple_of=8 if training_args.fp16 else None,
)
if self._task == MULTICHOICECLASSIFICATION
else None,
)
return test_dataset, training_args
def predict_proba(self, X_test):
assert (
self._task in CLASSIFICATION
), "predict_proba() only for classification tasks."
test_dataset, _ = self._init_model_for_predict(X_test)
predictions = self._trainer.predict(test_dataset)
self._trainer = None
return predictions.predictions
def predict(self, X_test):
from datasets import Dataset
from transformers import TrainingArguments
from .nlp.utils import load_model
from .nlp.huggingface.trainer import TrainerForAuto
X_test = self._preprocess(X_test, self._task, **self._kwargs)
test_dataset = Dataset.from_pandas(X_test)
best_model = load_model(
checkpoint_path=self._checkpoint_path,
task=self._task,
num_labels=self._num_labels,
per_model_config=self._per_model_config,
)
training_args = TrainingArguments(
per_device_eval_batch_size=1,
output_dir=self.custom_hpo_args.output_dir,
)
self._model = TrainerForAuto(model=best_model, args=training_args)
predictions = self._model.predict(test_dataset)
test_dataset, training_args = self._init_model_for_predict(X_test)
if self._task not in NLG_TASKS:
predictions = self._trainer.predict(test_dataset)
else:
predictions = self._trainer.predict(
test_dataset,
max_length=training_args.generation_max_length,
num_beams=training_args.generation_num_beams,
)
self._trainer = None
if self._task == SEQCLASSIFICATION:
return np.argmax(predictions.predictions, axis=1)
elif self._task == SEQREGRESSION:
return predictions.predictions.reshape((len(predictions.predictions),))
elif self._task == TOKENCLASSIFICATION:
return np.argmax(predictions.predictions, axis=2)
# TODO: elif self._task == your task, return the corresponding prediction
# e.g., if your task == QUESTIONANSWERING, you need to return the answer instead
# of the index
elif self._task == SUMMARIZATION:
if isinstance(predictions.predictions, tuple):
predictions = np.argmax(predictions.predictions[0], axis=2)
decoded_preds = self._tokenizer.batch_decode(
predictions, skip_special_tokens=True
)
return decoded_preds
elif self._task == MULTICHOICECLASSIFICATION:
return np.argmax(predictions.predictions, axis=1)
def config2params(self, config: dict) -> dict:
params = config.copy()
@@ -1711,6 +1789,160 @@ class SARIMAX(ARIMA):
return train_time
class TS_SKLearn_Regressor(SKLearnEstimator):
"""The class for tuning SKLearn Regressors for time-series forecasting, using hcrystalball"""
base_class = SKLearnEstimator
@classmethod
def search_space(cls, data_size, pred_horizon, **params):
space = cls.base_class.search_space(data_size, **params)
space.update(
{
"optimize_for_horizon": {
"domain": tune.choice([True, False]),
"init_value": False,
"low_cost_init_value": False,
},
"lags": {
"domain": tune.randint(lower=1, upper=data_size[0] - pred_horizon),
"init_value": 3,
},
}
)
return space
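The `lags` hyperparameter above bounds how many past observations become features; a sketch of the lag transform that the hcrystalball wrapper applies internally (the helper name here is hypothetical, for illustration only):
```python
import numpy as np

def make_lag_matrix(y, lags):
    # hypothetical illustration: row i holds [y[i], ..., y[i+lags-1]], target is y[i+lags]
    X = np.array([y[i : i + lags] for i in range(len(y) - lags)])
    target = y[lags:]
    return X, target

X, target = make_lag_matrix(np.arange(10.0), lags=3)
# X[0] == [0., 1., 2.], target[0] == 3.0; hence the upper bound data_size[0] - pred_horizon
```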
def __init__(self, task=TS_FORECAST, **params):
super().__init__(task, **params)
self.hcrystaball_model = None
def transform_X(self, X):
cols = list(X)
if len(cols) == 1:
ds_col = cols[0]
X = pd.DataFrame(index=X[ds_col])
elif len(cols) > 1:
ds_col = cols[0]
exog_cols = cols[1:]
X = X[exog_cols].set_index(X[ds_col])
return X
def _fit(self, X_train, y_train, budget=None, **kwargs):
from hcrystalball.wrappers import get_sklearn_wrapper
X_train = self.transform_X(X_train)
X_train = self._preprocess(X_train)
params = self.params.copy()
lags = params.pop("lags")
optimize_for_horizon = params.pop("optimize_for_horizon")
estimator = self.base_class(task="regression", **params)
self.hcrystaball_model = get_sklearn_wrapper(estimator.estimator_class)
self.hcrystaball_model.lags = int(lags)
self.hcrystaball_model.fit(X_train, y_train)
if optimize_for_horizon:
# Direct Multi-step Forecast Strategy - fit a separate model for each horizon
model_list = []
for i in range(1, kwargs["period"] + 1):
(
X_fit,
y_fit,
) = self.hcrystaball_model._transform_data_to_tsmodel_input_format(
X_train, y_train, i
)
self.hcrystaball_model.model.set_params(**estimator.params)
model = self.hcrystaball_model.model.fit(X_fit, y_fit)
model_list.append(model)
self._model = model_list
else:
(
X_fit,
y_fit,
) = self.hcrystaball_model._transform_data_to_tsmodel_input_format(
X_train, y_train, kwargs["period"]
)
self.hcrystaball_model.model.set_params(**estimator.params)
model = self.hcrystaball_model.model.fit(X_fit, y_fit)
self._model = model
def fit(self, X_train, y_train, budget=None, **kwargs):
current_time = time.time()
self._fit(X_train, y_train, budget=budget, **kwargs)
train_time = time.time() - current_time
return train_time
def predict(self, X_test):
if self._model is not None:
X_test = self.transform_X(X_test)
X_test = self._preprocess(X_test)
if isinstance(self._model, list):
assert len(self._model) == len(
X_test
), "Model is optimized for horizon, length of X_test must be equal to `period`."
preds = []
for i in range(1, len(self._model) + 1):
(
X_pred,
_,
) = self.hcrystaball_model._transform_data_to_tsmodel_input_format(
X_test.iloc[:i, :]
)
preds.append(self._model[i - 1].predict(X_pred)[-1])
forecast = pd.DataFrame(
data=np.asarray(preds).reshape(-1, 1),
columns=[self.hcrystaball_model.name],
index=X_test.index,
)
else:
(
X_pred,
_,
) = self.hcrystaball_model._transform_data_to_tsmodel_input_format(
X_test
)
forecast = self._model.predict(X_pred)
return forecast
else:
logger.warning(
"Estimator is not fit yet. Please run fit() before predict()."
)
return np.ones(X_test.shape[0])
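E.g., a minimal sketch of tuning these wrappers through AutoML (illustrative; assumes a DataFrame whose first column holds datetime64[ns] timestamps and a label column "y"; `period` is the forecast horizon forwarded to the search space as `pred_horizon`):
```python
from flaml import AutoML

automl = AutoML()
automl.fit(
    dataframe=train_df,  # first column: timestamps
    label="y",
    task="ts_forecast",
    period=12,  # forecast horizon
    estimator_list=["lgbm", "rf", "xgboost"],
    time_budget=60,
)
y_pred = automl.predict(X_test)  # X_test: the next `period` timestamps (+ optional exogenous columns)
```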
class LGBM_TS_Regressor(TS_SKLearn_Regressor):
"""The class for tuning LGBM Regressor for time-series forecasting"""
base_class = LGBMEstimator
class XGBoost_TS_Regressor(TS_SKLearn_Regressor):
"""The class for tuning XGBoost Regressor for time-series forecasting"""
base_class = XGBoostSklearnEstimator
# catboost regressor is invalid because it has a `name` parameter, making it incompatible with hcrystalball
# class CatBoost_TS_Regressor(TS_Regressor):
# base_class = CatBoostEstimator
class RF_TS_Regressor(TS_SKLearn_Regressor):
"""The class for tuning Random Forest Regressor for time-series forecasting"""
base_class = RandomForestEstimator
class ExtraTrees_TS_Regressor(TS_SKLearn_Regressor):
"""The class for tuning Extra Trees Regressor for time-series forecasting"""
base_class = ExtraTreesEstimator
class XGBoostLimitDepth_TS_Regressor(TS_SKLearn_Regressor):
"""The class for tuning XGBoost Regressor with unlimited depth for time-series forecasting"""
base_class = XGBoostLimitDepthEstimator
class suppress_stdout_stderr(object):
def __init__(self):
# Open a pair of null files


@@ -0,0 +1,40 @@
from dataclasses import dataclass
from transformers.data.data_collator import DataCollatorWithPadding
@dataclass
class DataCollatorForAuto(DataCollatorWithPadding):
def __call__(self, features):
from itertools import chain
import torch
label_name = "label" if "label" in features[0].keys() else "labels"
labels = [feature.pop(label_name) for feature in features]
batch_size = len(features)
num_choices = len(features[0]["input_ids"])
flattened_features = [
[{k: v[i] for k, v in feature.items()} for i in range(num_choices)]
for feature in features
]
flattened_features = list(chain(*flattened_features))
batch = super(DataCollatorForAuto, self).__call__(flattened_features)
# Un-flatten
batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
# Add back labels
batch["labels"] = torch.tensor(labels, dtype=torch.int64)
return batch
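A shape walkthrough of the flatten/un-flatten round trip above (illustrative numbers only):
```python
import torch

# 2 multiple-choice examples x 4 candidate sequences are flattened to 8 plain
# sequences so the base DataCollatorWithPadding can pad them together (8, seq_len);
# the view() call then restores (batch_size, num_choices, seq_len).
flat = torch.arange(8 * 5).reshape(8, 5)  # stands in for a padded batch tensor
unflat = flat.view(2, 4, -1)
assert unflat.shape == (2, 4, 5)
```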
class DataCollatorForPredict(DataCollatorWithPadding):
def __call__(self, features):
from itertools import chain
batch_size = len(features)
num_choices = len(features[0]["input_ids"])
flattened_features = [
[{k: v[i] for k, v in feature.items()} for i in range(num_choices)]
for feature in features
]
flattened_features = list(chain(*flattened_features))
batch = super(DataCollatorForPredict, self).__call__(flattened_features)
# Un-flatten
batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
return batch


@@ -5,9 +5,14 @@ import transformers
if transformers.__version__.startswith("3"):
from transformers.modeling_electra import ElectraClassificationHead
from transformers.modeling_roberta import RobertaClassificationHead
from transformers.modeling_electra import ElectraForTokenClassification
from transformers.modeling_roberta import RobertaForTokenClassification
else:
from transformers.models.electra.modeling_electra import ElectraClassificationHead
from transformers.models.roberta.modeling_roberta import RobertaClassificationHead
from transformers.models.electra.modeling_electra import ElectraForTokenClassification
from transformers.models.roberta.modeling_roberta import RobertaForTokenClassification
MODEL_CLASSIFICATION_HEAD_MAPPING = OrderedDict(
[


@@ -1,19 +1,54 @@
import os
try:
from transformers import Trainer as TFTrainer
from transformers import Seq2SeqTrainer
except ImportError:
TFTrainer = object
Seq2SeqTrainer = object
class TrainerForAuto(TFTrainer):
class TrainerForAuto(Seq2SeqTrainer):
def predict(
self,
test_dataset,
ignore_keys=None,
metric_key_prefix=None,
max_length=None,
num_beams=None,
):
if getattr(self, "_is_seq2seq", None):
return super().predict(
test_dataset,
ignore_keys,
metric_key_prefix,
max_length,
num_beams,
)
else:
return super(Seq2SeqTrainer, self).predict(
test_dataset, ignore_keys, metric_key_prefix
)
def prediction_step(
self,
model,
inputs,
prediction_loss_only,
ignore_keys,
):
if getattr(self, "_is_seq2seq", None):
return super().prediction_step(
model, inputs, prediction_loss_only, ignore_keys
)
else:
return super(Seq2SeqTrainer, self).prediction_step(
model, inputs, prediction_loss_only, ignore_keys
)
def evaluate(
self,
eval_dataset=None,
ignore_keys=None,
metric_key_prefix="eval",
is_seq2seq=False,
):
"""Overriding transformers.Trainer.evaluate by saving metrics and checkpoint path."""
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
@@ -25,23 +60,24 @@ class TrainerForAuto(TFTrainer):
# TODO: if your task is seq2seq (i.e., SUMMARIZATION), uncomment the code below (add indentation before metrics = eval_dataset...
# if is_seq2seq:
# metrics = eval_dataset and super().evaluate(
# eval_dataset,
# ignore_keys,
# metric_key_prefix,
# num_beams=self.args.num_beams,
# )
# else:
metrics = eval_dataset and super().evaluate(
eval_dataset,
ignore_keys,
metric_key_prefix,
)
if metrics:
for key in list(metrics.keys()):
if key.startswith("eval_"):
metrics[key[5:]] = metrics.pop(key)
if getattr(self, "_is_seq2seq", None):
metrics = eval_dataset and super().evaluate(
eval_dataset,
ignore_keys,
metric_key_prefix,
max_length=self.args.generation_max_length,
num_beams=self.args.generation_num_beams,
)
else:
metrics = eval_dataset and super(Seq2SeqTrainer, self).evaluate(
eval_dataset,
ignore_keys,
metric_key_prefix,
)
# if metrics:
# for key in list(metrics.keys()):
# if key.startswith("eval_"):
# metrics[key[5:]] = metrics.pop(key)
if hasattr(self, "ckpt_to_global_step"):
self.ckpt_to_global_step[ckpt_dir] = self.state.global_step
if metrics:
@@ -49,7 +85,7 @@ class TrainerForAuto(TFTrainer):
else:
self.ckpt_to_global_step = {ckpt_dir: self.state.global_step}
self.ckpt_to_metric = {ckpt_dir: metrics} if metrics else {}
return metrics
# TODO: if your task is SUMMARIZATION, you need a different
# class Seq2SeqTrainerForAuto, uncomment the code below
@@ -58,12 +94,14 @@ class TrainerForAuto(TFTrainer):
# Seq2SeqTrainerForAuto to make sure it's correct
# class Seq2SeqTrainerForAuto(Seq2SeqTrainer, TrainerForAuto):
# class Seq2SeqTrainerForAuto(TrainerForAuto):
# def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_prefix="eval"):
# """Overriding transformers.Trainer.evaluate by saving metrics and checkpoint path"""
# super(TrainerForAuto).evaluate(
# eval_dataset, ignore_keys, metric_key_prefix, is_seq2seq=True
# )
# self._is_seq2seq = True
# TrainerForAuto.evaluate(self, eval_dataset, ignore_keys, metric_key_prefix)
# # super(TrainerForAuto, self).evaluate(
# # eval_dataset, ignore_keys, metric_key_prefix
# # )
# TODO: if your task is QUESTIONANSWERING, uncomment the code below

View File

@@ -1,17 +1,19 @@
import argparse
from dataclasses import dataclass, field
from itertools import chain
from typing import Dict, Any
from ..data import (
SUMMARIZATION,
SEQREGRESSION,
SEQCLASSIFICATION,
NLG_TASKS
MULTICHOICECLASSIFICATION,
TOKENCLASSIFICATION,
NLG_TASKS,
)
def load_default_huggingface_metric_for_task(task):
from ..data import SEQCLASSIFICATION, SEQREGRESSION
if task == SEQCLASSIFICATION:
return "accuracy"
@@ -19,74 +21,300 @@ def load_default_huggingface_metric_for_task(task):
return "rmse"
elif task == SUMMARIZATION:
return "rouge"
# TODO: elif task == your task, return the default metric name for your task,
# e.g., if task == MULTIPLECHOICE, return "accuracy"
# notice this metric name has to be in ['accuracy', 'bertscore', 'bleu', 'bleurt',
# 'cer', 'chrf', 'code_eval', 'comet', 'competition_math', 'coval', 'cuad',
# 'f1', 'gleu', 'glue', 'google_bleu', 'indic_glue', 'matthews_correlation',
# 'meteor', 'pearsonr', 'precision', 'recall', 'rouge', 'sacrebleu', 'sari',
# 'seqeval', 'spearmanr', 'squad', 'squad_v2', 'super_glue', 'ter', 'wer',
# 'wiki_split', 'xnli']
elif task == MULTICHOICECLASSIFICATION:
return "accuracy"
elif task == TOKENCLASSIFICATION:
return "seqeval"
global tokenized_column_names
def tokenize_text(X, task, custom_hpo_task):
from ..data import SEQCLASSIFICATION, SEQREGRESSION
def tokenize_text(X, Y=None, task=None, custom_hpo_args=None):
if task in (SEQCLASSIFICATION, SEQREGRESSION):
return tokenize_text_seqclassification(X, custom_hpo_task)
# TODO: elif task == your task, return the tokenized result
# for example, if your task == MULTIPLE CHOICE, you should
# create a function named tokenize_text_multiplechoice(X, custom_hpo_args)
# which does the same thing as preprocess_function at
# https://github.com/huggingface/transformers/blob/master/examples/pytorch/multiple-choice/run_swag.py#L329
X_tokenized, _ = tokenize_onedataframe(
X, this_tokenizer=None, task=task, custom_hpo_args=custom_hpo_args
)
return X_tokenized, None
elif task == TOKENCLASSIFICATION:
return tokenize_text_tokclassification(X, Y, custom_hpo_args)
elif task in NLG_TASKS:
return tokenize_seq2seq(X, Y, task=task, custom_hpo_args=custom_hpo_args)
elif task == MULTICHOICECLASSIFICATION:
return tokenize_text_multiplechoice(X, custom_hpo_args)
def tokenize_text_seqclassification(X, custom_hpo_args):
def tokenize_seq2seq(X, Y, task=None, custom_hpo_args=None):
model_inputs, tokenizer = tokenize_onedataframe(
X,
this_tokenizer=None,
task=task,
custom_hpo_args=custom_hpo_args,
)
labels = None
if Y is not None:
labels, _ = tokenize_onedataframe(
Y.to_frame(),
this_tokenizer=tokenizer,
task=task,
custom_hpo_args=custom_hpo_args,
)
labels["label"] = [
[(each_l if each_l != tokenizer.pad_token_id else -100) for each_l in label]
for label in labels["input_ids"]
]
labels = labels.drop(
columns=["attention_mask", "input_ids", "decoder_input_ids"]
)
return model_inputs, labels
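# Replacing pad_token_id with -100 above matters because -100 is the default
# ignore_index of torch.nn.CrossEntropyLoss, so padded label positions
# contribute nothing to the seq2seq training loss.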
def tokenize_and_align_labels(
examples, tokenizer, custom_hpo_args, X_sent_key, Y_sent_key=None
):
global tokenized_column_names
tokenized_inputs = tokenizer(
[list(examples[X_sent_key])],
padding="max_length",
truncation=True,
max_length=custom_hpo_args.max_seq_length,
# We use this argument because the texts in our dataset are lists of words (with a label for each word).
is_split_into_words=True,
)
if Y_sent_key is not None:
previous_word_idx = None
label_ids = []
import numbers
for word_idx in tokenized_inputs.word_ids(batch_index=0):
# Special tokens have a word id that is None. We set the label to -100 so they are automatically
# ignored in the loss function.
if word_idx is None:
label_ids.append(-100)
# We set the label for the first token of each word.
elif word_idx != previous_word_idx:
if isinstance(examples[Y_sent_key][word_idx], numbers.Number):
label_ids.append(examples[Y_sent_key][word_idx])
# else:
# label_ids.append(label_to_id[label[word_idx]])
# For the other tokens in a word, we set the label to either the current label or -100, depending on
# the label_all_tokens flag.
else:
if isinstance(examples[Y_sent_key][word_idx], numbers.Number):
label_ids.append(examples[Y_sent_key][word_idx])
# else:
# label_ids.append(b_to_i_label[label_to_id[label[word_idx]]])
previous_word_idx = word_idx
tokenized_inputs["label"] = label_ids
tokenized_column_names = sorted(tokenized_inputs.keys())
tokenized_input_and_labels = [tokenized_inputs[x] for x in tokenized_column_names]
for key_idx, each_key in enumerate(tokenized_column_names):
if each_key != "label":
tokenized_input_and_labels[key_idx] = tokenized_input_and_labels[key_idx][0]
return tokenized_input_and_labels
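# Worked example (hypothetical inputs): for the words ["New", "York"] with
# labels [3, 4], a subword tokenizer might yield word_ids [None, 0, 1, 1, None];
# the loop above then produces label_ids [-100, 3, 4, 4, -100], masking the
# special tokens while labeling every subword of each word.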
def tokenize_text_tokclassification(X, Y, custom_hpo_args):
from transformers import AutoTokenizer
import pandas as pd
global tokenized_column_names
this_tokenizer = AutoTokenizer.from_pretrained(
custom_hpo_args.model_path, use_fast=True
)
if Y is not None:
X_and_Y = pd.concat([X, Y.to_frame()], axis=1)
X_key = list(X.keys())[0]
Y_key = list(Y.to_frame().keys())[0]
X_and_Y_tokenized = X_and_Y.apply(
lambda x: tokenize_and_align_labels(
x,
tokenizer=this_tokenizer,
custom_hpo_args=custom_hpo_args,
X_sent_key=X_key,
Y_sent_key=Y_key,
),
axis=1,
result_type="expand",
)
label_idx = tokenized_column_names.index("label")
other_indices = sorted(
set(range(len(tokenized_column_names))).difference({label_idx})
)
other_column_names = [tokenized_column_names[x] for x in other_indices]
d = X_and_Y_tokenized.iloc[:, other_indices]
y_tokenized = X_and_Y_tokenized.iloc[:, label_idx]
else:
X_key = list(X.keys())[0]
d = X.apply(
lambda x: tokenize_and_align_labels(
x,
tokenizer=this_tokenizer,
custom_hpo_args=custom_hpo_args,
X_sent_key=X_key,
Y_sent_key=None,
),
axis=1,
result_type="expand",
)
other_column_names = tokenized_column_names
y_tokenized = None
X_tokenized = pd.DataFrame(columns=other_column_names)
X_tokenized[other_column_names] = d
return X_tokenized, y_tokenized
def tokenize_onedataframe(
X,
this_tokenizer=None,
task=None,
custom_hpo_args=None,
):
from transformers import AutoTokenizer
import pandas
global tokenized_column_names
this_tokenizer = AutoTokenizer.from_pretrained(
custom_hpo_args.model_path, use_fast=True
)
d = X.apply(
lambda x: tokenize_glue(x, this_tokenizer, custom_hpo_args),
axis=1,
result_type="expand",
)
if this_tokenizer:
with this_tokenizer.as_target_tokenizer():
d = X.apply(
lambda x: tokenize_row(
x,
this_tokenizer,
prefix=("",) if task is SUMMARIZATION else None,
task=task,
custom_hpo_args=custom_hpo_args,
),
axis=1,
result_type="expand",
)
else:
this_tokenizer = AutoTokenizer.from_pretrained(
custom_hpo_args.model_path, use_fast=True
)
d = X.apply(
lambda x: tokenize_row(
x,
this_tokenizer,
prefix=("summarize: ",) if task is SUMMARIZATION else None,
task=task,
custom_hpo_args=custom_hpo_args,
),
axis=1,
result_type="expand",
)
X_tokenized = pandas.DataFrame(columns=tokenized_column_names)
X_tokenized[tokenized_column_names] = d
return X_tokenized
return X_tokenized, this_tokenizer
def tokenize_glue(this_row, this_tokenizer, custom_hpo_args):
def postprocess_text(preds, labels):
import nltk
nltk.download("punkt")
preds = [pred.strip() for pred in preds]
labels = [label.strip() for label in labels]
# rougeLSum expects newline after each sentence
preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]
return preds, labels
def tokenize_row(
this_row, this_tokenizer, prefix=None, task=None, custom_hpo_args=None
):
global tokenized_column_names
assert (
"max_seq_length" in custom_hpo_args.__dict__
), "max_seq_length must be provided for glue"
if prefix:
this_row = tuple(["".join(x) for x in zip(prefix, this_row)])
tokenized_example = this_tokenizer(
*tuple(this_row),
padding="max_length",
max_length=custom_hpo_args.max_seq_length,
truncation=True,
)
if task in NLG_TASKS:
tokenized_example["decoder_input_ids"] = tokenized_example["input_ids"]
tokenized_column_names = sorted(tokenized_example.keys())
return [tokenized_example[x] for x in tokenized_column_names]
def separate_config(config):
from transformers import TrainingArguments
def tokenize_text_multiplechoice(X, custom_hpo_args):
from transformers import AutoTokenizer
import pandas
global tokenized_column_names
this_tokenizer = AutoTokenizer.from_pretrained(
custom_hpo_args.model_path, # 'roberta-base'
cache_dir=None,
use_fast=True,
revision="main",
use_auth_token=None,
)
t = X[["sent1", "sent2", "ending0", "ending1", "ending2", "ending3"]]
d = t.apply(
lambda x: tokenize_swag(x, this_tokenizer, custom_hpo_args),
axis=1,
result_type="expand",
)
X_tokenized = pandas.DataFrame(columns=tokenized_column_names)
X_tokenized[tokenized_column_names] = d
output = X_tokenized.join(X)
return output, None
def tokenize_swag(this_row, this_tokenizer, custom_hpo_args):
global tokenized_column_names
first_sentences = [[this_row["sent1"]] * 4]
# repeat the 1st sentence four times, once per candidate ending
question_headers = this_row["sent2"]
# sent2 is the noun part of the 2nd sentence
second_sentences = [
question_headers + " " + this_row[key]
for key in ["ending0", "ending1", "ending2", "ending3"]
]
# now the 2nd sentences are formed by combining the noun part with the 4 ending parts
# flatten out from a 2-dimensional to a 1-dimensional array
first_sentences = list(chain(*first_sentences))
tokenized_example = this_tokenizer(
*tuple([first_sentences, second_sentences]),
truncation=True,
max_length=custom_hpo_args.max_seq_length,
padding=False,
)
tokenized_column_names = sorted(tokenized_example.keys())
return [tokenized_example[x] for x in tokenized_column_names]
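# Worked example (hypothetical row): with sent1="A chef stirs a pot", sent2="He"
# and endings e0..e3, the tokenizer above receives four (first, second) pairs:
#   ("A chef stirs a pot", "He " + e0), ..., ("A chef stirs a pot", "He " + e3)
# matching the four-choice-per-row layout that AutoModelForMultipleChoice expects.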
def separate_config(config, task):
if task in NLG_TASKS:
from transformers import Seq2SeqTrainingArguments, TrainingArguments
trainargs_class_list = [Seq2SeqTrainingArguments, TrainingArguments]
else:
from transformers import TrainingArguments
trainargs_class_list = [TrainingArguments]
training_args_config = {}
per_model_config = {}
for key, val in config.items():
if key in TrainingArguments.__dict__:
is_in_training_args = any(key in x.__dict__ for x in trainargs_class_list)
if is_in_training_args:
training_args_config[key] = val
else:
per_model_config[key] = val
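# For example, "learning_rate" is an attribute of TrainingArguments and lands
# in training_args_config, while a (model-specific) key such as
# "hidden_dropout_prob" is unknown to the training-args classes and falls
# through to per_model_config.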
@@ -95,16 +323,22 @@ def separate_config(config):
def get_num_labels(task, y_train):
from ..data import SEQCLASSIFICATION, SEQREGRESSION
from ..data import SEQCLASSIFICATION, SEQREGRESSION, TOKENCLASSIFICATION
if task == SEQREGRESSION:
return 1
elif task == SEQCLASSIFICATION:
return len(set(y_train))
elif task == TOKENCLASSIFICATION:
return len(set([a for b in y_train.tolist() for a in b]))
else:
return None
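# E.g., for token classification, y_train = pd.Series([[0, 1], [1, 2]])
# flattens to the label set {0, 1, 2}, so get_num_labels returns 3.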
def is_a_list_of_str(this_obj):
return isinstance(this_obj, list) and all(isinstance(x, str) for x in this_obj)
def _clean_value(value: Any) -> str:
if isinstance(value, float):
return "{:.5}".format(value)
@@ -171,29 +405,40 @@ def load_model(checkpoint_path, task, num_labels, per_model_config=None):
AutoSeqClassificationHead,
MODEL_CLASSIFICATION_HEAD_MAPPING,
)
from ..data import SEQCLASSIFICATION, SEQREGRESSION
from ..data import SEQCLASSIFICATION, SEQREGRESSION, TOKENCLASSIFICATION
this_model_type = AutoConfig.from_pretrained(checkpoint_path).model_type
this_vocab_size = AutoConfig.from_pretrained(checkpoint_path).vocab_size
def get_this_model():
def get_this_model(task):
from transformers import AutoModelForSequenceClassification
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoModelForMultipleChoice
from transformers import AutoModelForTokenClassification
if task in (SEQCLASSIFICATION, SEQREGRESSION):
return AutoModelForSequenceClassification.from_pretrained(
checkpoint_path, config=model_config
)
# TODO: elif task == your task, fill in the line in your transformers example
# that loads the model, e.g., if task == MULTIPLE CHOICE, according to
# https://github.com/huggingface/transformers/blob/master/examples/pytorch/multiple-choice/run_swag.py#L298
# you can return AutoModelForMultipleChoice.from_pretrained(checkpoint_path, config=model_config)
elif task == TOKENCLASSIFICATION:
return AutoModelForTokenClassification.from_pretrained(
checkpoint_path, config=model_config
)
elif task in NLG_TASKS:
return AutoModelForSeq2SeqLM.from_pretrained(
checkpoint_path, config=model_config
)
elif task == MULTICHOICECLASSIFICATION:
return AutoModelForMultipleChoice.from_pretrained(
checkpoint_path, config=model_config
)
def is_pretrained_model_in_classification_head_list(model_type):
return model_type in MODEL_CLASSIFICATION_HEAD_MAPPING
def _set_model_config(checkpoint_path):
if task in (SEQCLASSIFICATION, SEQREGRESSION):
if per_model_config and len(per_model_config) > 0:
if task in (SEQCLASSIFICATION, SEQREGRESSION, TOKENCLASSIFICATION):
if per_model_config:
model_config = AutoConfig.from_pretrained(
checkpoint_path,
num_labels=model_config_num_labels,
@@ -204,18 +449,15 @@ def load_model(checkpoint_path, task, num_labels, per_model_config=None):
checkpoint_path, num_labels=model_config_num_labels
)
return model_config
# TODO: elif task == your task, uncomment the code below:
# else:
# if per_model_config and len(per_model_config) > 0:
# model_config = AutoConfig.from_pretrained(
# checkpoint_path,
# **per_model_config,
# )
# else:
# model_config = AutoConfig.from_pretrained(
# checkpoint_path
# )
# return model_config
else:
if per_model_config:
model_config = AutoConfig.from_pretrained(
checkpoint_path,
**per_model_config,
)
else:
model_config = AutoConfig.from_pretrained(checkpoint_path)
return model_config
if task == SEQCLASSIFICATION:
num_labels_old = AutoConfig.from_pretrained(checkpoint_path).num_labels
@@ -227,7 +469,7 @@ def load_model(checkpoint_path, task, num_labels, per_model_config=None):
if is_pretrained_model_in_classification_head_list(this_model_type):
if num_labels != num_labels_old:
this_model = get_this_model()
this_model = get_this_model(task)
model_config.num_labels = num_labels
this_model.num_labels = num_labels
this_model.classifier = (
@@ -236,16 +478,18 @@ def load_model(checkpoint_path, task, num_labels, per_model_config=None):
)
)
else:
this_model = get_this_model()
this_model = get_this_model(task)
else:
this_model = get_this_model()
this_model = get_this_model(task)
this_model.resize_token_embeddings(this_vocab_size)
return this_model
else:
if task == SEQREGRESSION:
model_config_num_labels = 1
elif task == TOKENCLASSIFICATION:
model_config_num_labels = num_labels
model_config = _set_model_config(checkpoint_path)
this_model = get_this_model()
this_model = get_this_model(task)
return this_model
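# A hedged usage sketch (the checkpoint name and label count below are
# illustrative only):
#
#     model = load_model(
#         "google/electra-small-discriminator",
#         task=SEQCLASSIFICATION,
#         num_labels=2,
#     )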
@@ -270,7 +514,6 @@ def compute_checkpoint_freq(
@dataclass
class HPOArgs:
"""The HPO setting.
Args:
output_dir (str): the data root directory for outputting the log, etc.
model_path (str, optional, defaults to "facebook/muppet-roberta-base"): A string,
@@ -279,7 +522,6 @@ class HPOArgs:
fp16 (bool, optional, defaults to "False"): A bool, whether to use FP16.
max_seq_length (int, optional, defaults to 128): An integer, the max length of the sequence.
ckpt_per_epoch (int, optional, defaults to 1): An integer, the number of checkpoints per epoch.
"""
output_dir: str = field(
@@ -295,6 +537,15 @@ class HPOArgs:
max_seq_length: int = field(default=128, metadata={"help": "max seq length"})
pad_to_max_length: bool = field(
default=True,
metadata={
"help": "Whether to pad all samples to model maximum sentence length. "
"If False, will pad the samples dynamically when batching to the maximum length in the batch. More "
"efficient on GPU but very bad for TPU."
},
)
ckpt_per_epoch: int = field(default=1, metadata={"help": "checkpoint per epoch"})
@staticmethod

View File

@@ -1,2 +1,6 @@
from .trial_scheduler import TrialScheduler
from .online_scheduler import OnlineScheduler, OnlineSuccessiveDoublingScheduler, ChaChaScheduler
from .online_scheduler import (
OnlineScheduler,
OnlineSuccessiveDoublingScheduler,
ChaChaScheduler,
)

View File

@@ -113,7 +113,7 @@ class BlendSearch(Searcher):
"For cost-frugal search, "
"consider providing low-cost values for cost-related hps via "
"'low_cost_partial_config'. More info can be found at "
"https://github.com/microsoft/FLAML/wiki/About-%60low_cost_partial_config%60"
"https://microsoft.github.io/FLAML/docs/FAQ#about-low_cost_partial_config-in-tune"
)
if evaluated_rewards and mode:
self._points_to_evaluate = []

View File

@@ -2,27 +2,28 @@
# * Copyright (c) Microsoft Corporation. All rights reserved.
# * Licensed under the MIT License. See LICENSE file in the
# * project root for license information.
from flaml.tune.sample import Domain
from typing import Dict, Optional, Tuple
import numpy as np
import logging
try:
from ray import __version__ as ray_version
assert ray_version >= "1.0.0"
from ray.tune.suggest import Searcher
from ray.tune.suggest.variant_generator import generate_variants
from ray.tune import sample
from ray.tune.utils.util import flatten_dict, unflatten_dict
except (ImportError, AssertionError):
from .suggestion import Searcher
from .variant_generator import generate_variants
from ..tune import sample
from ..tune.trial import flatten_dict, unflatten_dict
from ..tune.space import complete_config, denormalize, normalize
import logging
from flaml.tune.sample import _BackwardsCompatibleNumpyRng
from ..tune.space import (
complete_config,
denormalize,
normalize,
generate_variants_compatible,
)
logger = logging.getLogger(__name__)
@@ -84,6 +85,7 @@ class FLOW2(Searcher):
self.space = space or {}
self._space = flatten_dict(self.space, prevent_delimiter=True)
self._random = np.random.RandomState(seed)
self.rs_random = _BackwardsCompatibleNumpyRng(seed + 19823)
self.seed = seed
self.init_config = init_config
self.best_config = flatten_dict(init_config)
@@ -464,8 +466,8 @@ class FLOW2(Searcher):
# random
for i, key in enumerate(self._tunable_keys):
if self._direction_tried[i] != 0:
for _, generated in generate_variants(
{"config": {key: self._space[key]}}
for _, generated in generate_variants_compatible(
{"config": {key: self._space[key]}}, random_state=self.rs_random
):
if generated["config"][key] != best_config[key]:
config[key] = generated["config"][key]

View File

@@ -178,7 +178,7 @@ class ConcurrencyLimiter(Searcher):
batch (bool): Whether to wait for all concurrent samples
to finish before updating the underlying searcher.
Example:
```python
```python
from ray.tune.suggest import ConcurrencyLimiter
search_alg = HyperOptSearch(metric="accuracy")
search_alg = ConcurrencyLimiter(search_alg, max_concurrent=2)
@@ -366,81 +366,81 @@ class _OptunaTrialSuggestCaptor:
class OptunaSearch(Searcher):
"""A wrapper around Optuna to provide trial suggestions.
[Optuna](https://optuna.org/)
is a hyperparameter optimization library.
In contrast to other libraries, it employs define-by-run style
hyperparameter definitions.
This Searcher is a thin wrapper around Optuna's search algorithms.
You can pass any Optuna sampler, which will be used to generate
hyperparameter suggestions.
Args:
space (dict|Callable): Hyperparameter search space definition for
Optuna's sampler. This can be either a class `dict` with
parameter names as keys and ``optuna.distributions`` as values,
or a Callable - in which case, it should be a define-by-run
function using ``optuna.trial`` to obtain the hyperparameter
values. The function should return either a class `dict` of
constant values with names as keys, or None.
For more information, see
[tutorial](https://optuna.readthedocs.io/en/stable/tutorial/10_key_features/002_configurations.html).
Warning - No actual computation should take place in the define-by-run
function. Instead, put the training logic inside the function
or class trainable passed to tune.run.
metric (str): The training result objective value attribute. If None
but a mode was passed, the anonymous metric `_metric` will be used
per default.
mode (str): One of {min, max}. Determines whether objective is
minimizing or maximizing the metric attribute.
points_to_evaluate (list): Initial parameter suggestions to be run
first. This is for when you already have some good parameters
you want to run first to help the algorithm make better suggestions
for future parameters. Needs to be a list of dicts containing the
configurations.
sampler (optuna.samplers.BaseSampler): Optuna sampler used to
draw hyperparameter configurations. Defaults to ``TPESampler``.
seed (int): Seed to initialize sampler with. This parameter is only
used when ``sampler=None``. In all other cases, the sampler
you pass should be initialized with the seed already.
evaluated_rewards (list): If you have previously evaluated the
parameters passed in as points_to_evaluate you can avoid
re-running those trials by passing in the reward attributes
as a list so the optimiser can be told the results without
needing to re-compute the trial. Must be the same length as
points_to_evaluate.
Tune automatically converts search spaces to Optuna's format:
```python
from ray.tune.suggest.optuna import OptunaSearch
config = { "a": tune.uniform(6, 8),
"b": tune.loguniform(1e-4, 1e-2)}
optuna_search = OptunaSearch(metric="loss", mode="min")
tune.run(trainable, config=config, search_alg=optuna_search)
```
If you would like to pass the search space manually, the code would
look like this:
```python
from ray.tune.suggest.optuna import OptunaSearch
import optuna
config = { "a": optuna.distributions.UniformDistribution(6, 8),
"b": optuna.distributions.LogUniformDistribution(1e-4, 1e-2)}
optuna_search = OptunaSearch(space,metric="loss",mode="min")
tune.run(trainable, search_alg=optuna_search)
# Equivalent Optuna define-by-run function approach:
def define_search_space(trial: optuna.Trial):
trial.suggest_float("a", 6, 8)
trial.suggest_float("b", 1e-4, 1e-2, log=True)
# training logic goes into trainable, this is just
# for search space definition
optuna_search = OptunaSearch(
define_search_space,
metric="loss",
mode="min")
tune.run(trainable, search_alg=optuna_search)
.. versionadded:: 0.8.8
```
"""

View File

@@ -18,11 +18,9 @@
import copy
import logging
from typing import Any, Dict, Generator, List, Tuple
import numpy
import random
from ..tune.sample import Categorical, Domain
from ..tune.sample import Categorical, Domain, RandomState
logger = logging.getLogger(__name__)
@@ -35,6 +33,8 @@ class TuneError(Exception):
def generate_variants(
unresolved_spec: Dict,
constant_grid_search: bool = False,
random_state: "RandomState" = None,
) -> Generator[Tuple[Dict, Dict], None, None]:
"""Generates variants from a spec (dict) with unresolved values.
There are two types of unresolved values:
@@ -43,14 +43,25 @@ def generate_variants(
variants in combination:
"activation": grid_search(["relu", "tanh"])
"learning_rate": grid_search([1e-3, 1e-4, 1e-5])
Lambda functions: These are evaluated to produce a concrete value, and
can express dependencies or conditional distributions between values.
They can also be used to express random search (e.g., by calling
into the `random` or `np` module).
"cpu": lambda spec: spec.config.num_workers
"batch_size": lambda spec: random.uniform(1, 1000)
Finally, to support defining specs in plain JSON / YAML, grid search
can also be defined alternatively as follows:
and lambda functions can also be defined alternatively as follows:
"activation": {"grid_search": ["relu", "tanh"]}
"cpu": {"eval": "spec.config.num_workers"}
Use `format_vars` to format the returned dict of hyperparameters.
Yields:
(Dict of resolved variables, Spec object)
"""
for resolved_vars, spec in _generate_variants(unresolved_spec):
for resolved_vars, spec in _generate_variants(
unresolved_spec,
constant_grid_search=constant_grid_search,
random_state=random_state,
):
assert not _unresolved_values(spec)
yield resolved_vars, spec
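# A minimal sketch of resolving a spec (the key and domain are illustrative;
# assumes this module's generate_variants and flaml.tune are in scope):
#
#     from flaml import tune
#     spec = {"config": {"lr": tune.loguniform(1e-4, 1e-1)}}
#     for resolved_vars, resolved_spec in generate_variants(spec, random_state=0):
#         print(resolved_vars)  # {("config", "lr"): <sampled float>}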
@@ -93,7 +104,9 @@ def parse_spec_vars(
return resolved_vars, domain_vars, grid_vars
def _generate_variants(spec: Dict) -> Tuple[Dict, Dict]:
def _generate_variants(
spec: Dict, constant_grid_search: bool = False, random_state: "RandomState" = None
) -> Tuple[Dict, Dict]:
spec = copy.deepcopy(spec)
_, domain_vars, grid_vars = parse_spec_vars(spec)
@@ -101,10 +114,34 @@ def _generate_variants(spec: Dict) -> Tuple[Dict, Dict]:
yield {}, spec
return
# Variables to resolve
to_resolve = domain_vars
all_resolved = True
if constant_grid_search:
# In this path, we first sample random variables and keep them constant
# for grid search.
# `_resolve_domain_vars` will alter `spec` directly
all_resolved, resolved_vars = _resolve_domain_vars(
spec, domain_vars, allow_fail=True, random_state=random_state
)
if not all_resolved:
# Not all variables have been resolved, but remove those that have
# from the `to_resolve` list.
to_resolve = [(r, d) for r, d in to_resolve if r not in resolved_vars]
grid_search = _grid_search_generator(spec, grid_vars)
for resolved_spec in grid_search:
resolved_vars = _resolve_domain_vars(resolved_spec, domain_vars)
for resolved, spec in _generate_variants(resolved_spec):
if not constant_grid_search or not all_resolved:
# In this path, we sample the remaining random variables
_, resolved_vars = _resolve_domain_vars(
resolved_spec, to_resolve, random_state=random_state
)
for resolved, spec in _generate_variants(
resolved_spec,
constant_grid_search=constant_grid_search,
random_state=random_state,
):
for path, value in grid_vars:
resolved_vars[path] = _get_value(spec, path)
for k, v in resolved.items():
@@ -134,7 +171,12 @@ def _get_value(spec: Dict, path: Tuple) -> Any:
return spec
def _resolve_domain_vars(spec: Dict, domain_vars: List[Tuple[Tuple, Domain]]) -> Dict:
def _resolve_domain_vars(
spec: Dict,
domain_vars: List[Tuple[Tuple, Domain]],
allow_fail: bool = False,
random_state: "RandomState" = None,
) -> Tuple[bool, Dict]:
resolved = {}
error = True
num_passes = 0
@@ -145,7 +187,9 @@ def _resolve_domain_vars(spec: Dict, domain_vars: List[Tuple[Tuple, Domain]]) ->
if path in resolved:
continue
try:
value = domain.sample(_UnresolvedAccessGuard(spec))
value = domain.sample(
_UnresolvedAccessGuard(spec), random_state=random_state
)
except RecursiveDependencyError as e:
error = e
except Exception:
@@ -156,8 +200,11 @@ def _resolve_domain_vars(spec: Dict, domain_vars: List[Tuple[Tuple, Domain]]) ->
assign_value(spec, path, value)
resolved[path] = value
if error:
raise error
return resolved
if not allow_fail:
raise error
else:
return False, resolved
return True, resolved
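# _resolve_domain_vars now returns (all_resolved, resolved_vars) instead of a
# bare dict, so the constant_grid_search path in _generate_variants can detect
# partially resolved lambda/conditional values and resample only the remainder
# for each grid-search variant.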
def _grid_search_generator(

View File

@@ -1,8 +1,8 @@
{
"Registrations": [
"Registrations": [
{
"Component": {
"Type": "pip",
"Component": {
"Type": "pip",
"pip": {"Name": "ray[tune]", "Version": "1.5.1" }
},
"DevelopmentDependency": false

View File

@@ -12,21 +12,76 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# This source file is included here because ray does not fully support Windows.
# This source file is adapted here because ray does not fully support Windows.
# Copyright (c) Microsoft Corporation.
import logging
import random
from copy import copy
from inspect import signature
from math import isclose
from typing import Any, Callable, Dict, List, Optional, Sequence, Union
from typing import Any, Dict, List, Optional, Sequence, Union
import numpy as np
# Backwards compatibility
try:
# Added in numpy>=1.17 but we require numpy>=1.16
np_random_generator = np.random.Generator
LEGACY_RNG = False
except AttributeError:
class np_random_generator:
pass
LEGACY_RNG = True
logger = logging.getLogger(__name__)
class _BackwardsCompatibleNumpyRng:
"""Thin wrapper to ensure backwards compatibility between
new and old numpy randomness generators.
"""
_rng = None
def __init__(
self,
generator_or_seed: Optional[
Union["np_random_generator", np.random.RandomState, int]
] = None,
):
if generator_or_seed is None or isinstance(
generator_or_seed, (np.random.RandomState, np_random_generator)
):
self._rng = generator_or_seed
elif LEGACY_RNG:
self._rng = np.random.RandomState(generator_or_seed)
else:
self._rng = np.random.default_rng(generator_or_seed)
@property
def legacy_rng(self) -> bool:
return not isinstance(self._rng, np_random_generator)
@property
def rng(self):
# don't set self._rng to np.random to avoid pickling issues
return self._rng if self._rng is not None else np.random
def __getattr__(self, name: str) -> Any:
# https://numpy.org/doc/stable/reference/random/new-or-different.html
if self.legacy_rng:
if name == "integers":
name = "randint"
elif name == "random":
name = "rand"
return getattr(self.rng, name)
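# E.g., _BackwardsCompatibleNumpyRng(42).integers(0, 10, size=2) maps to
# np.random.RandomState(42).randint(0, 10, size=2) on legacy numpy, and to
# np.random.default_rng(42).integers(0, 10, size=2) on numpy>=1.17.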
RandomState = Union[
None, _BackwardsCompatibleNumpyRng, np_random_generator, np.random.RandomState, int
]
class Domain:
"""Base class to specify a type and valid range to sample parameters from.
This base class is implemented by parameter spaces, like float ranges
@@ -61,9 +116,16 @@ class Domain:
sampler = self.default_sampler_cls()
return sampler
def sample(self, spec=None, size=1):
def sample(
self,
spec: Optional[Union[List[Dict], Dict]] = None,
size: int = 1,
random_state: "RandomState" = None,
):
if not isinstance(random_state, _BackwardsCompatibleNumpyRng):
random_state = _BackwardsCompatibleNumpyRng(random_state)
sampler = self.get_sampler()
return sampler.sample(self, spec=spec, size=size)
return sampler.sample(self, spec=spec, size=size, random_state=random_state)
def is_grid(self):
return isinstance(self.sampler, Grid)
@@ -86,6 +148,7 @@ class Sampler:
domain: Domain,
spec: Optional[Union[List[Dict], Dict]] = None,
size: int = 1,
random_state: "RandomState" = None,
):
raise NotImplementedError
@@ -128,6 +191,7 @@ class Grid(Sampler):
domain: Domain,
spec: Optional[Union[List[Dict], Dict]] = None,
size: int = 1,
random_state: "RandomState" = None,
):
return RuntimeError("Do not call `sample()` on grid.")
@@ -139,10 +203,13 @@ class Float(Domain):
domain: "Float",
spec: Optional[Union[List[Dict], Dict]] = None,
size: int = 1,
random_state: "RandomState" = None,
):
if not isinstance(random_state, _BackwardsCompatibleNumpyRng):
random_state = _BackwardsCompatibleNumpyRng(random_state)
assert domain.lower > float("-inf"), "Uniform needs a lower bound"
assert domain.upper < float("inf"), "Uniform needs an upper bound"
items = np.random.uniform(domain.lower, domain.upper, size=size)
items = random_state.uniform(domain.lower, domain.upper, size=size)
return items if len(items) > 1 else domain.cast(items[0])
class _LogUniform(LogUniform):
@@ -151,7 +218,10 @@ class Float(Domain):
domain: "Float",
spec: Optional[Union[List[Dict], Dict]] = None,
size: int = 1,
random_state: "RandomState" = None,
):
if not isinstance(random_state, _BackwardsCompatibleNumpyRng):
random_state = _BackwardsCompatibleNumpyRng(random_state)
assert domain.lower > 0, "LogUniform needs a lower bound greater than 0"
assert (
0 < domain.upper < float("inf")
@@ -159,7 +229,7 @@ class Float(Domain):
logmin = np.log(domain.lower) / np.log(self.base)
logmax = np.log(domain.upper) / np.log(self.base)
items = self.base ** (np.random.uniform(logmin, logmax, size=size))
items = self.base ** (random_state.uniform(logmin, logmax, size=size))
return items if len(items) > 1 else domain.cast(items[0])
class _Normal(Normal):
@@ -168,14 +238,17 @@ class Float(Domain):
domain: "Float",
spec: Optional[Union[List[Dict], Dict]] = None,
size: int = 1,
random_state: "RandomState" = None,
):
if not isinstance(random_state, _BackwardsCompatibleNumpyRng):
random_state = _BackwardsCompatibleNumpyRng(random_state)
assert not domain.lower or domain.lower == float(
"-inf"
), "Normal sampling does not allow a lower value bound."
assert not domain.upper or domain.upper == float(
"inf"
), "Normal sampling does not allow a upper value bound."
items = np.random.normal(self.mean, self.sd, size=size)
items = random_state.normal(self.mean, self.sd, size=size)
return items if len(items) > 1 else domain.cast(items[0])
default_sampler_cls = _Uniform
@@ -262,8 +335,11 @@ class Integer(Domain):
domain: "Integer",
spec: Optional[Union[List[Dict], Dict]] = None,
size: int = 1,
random_state: "RandomState" = None,
):
items = np.random.randint(domain.lower, domain.upper, size=size)
if not isinstance(random_state, _BackwardsCompatibleNumpyRng):
random_state = _BackwardsCompatibleNumpyRng(random_state)
items = random_state.integers(domain.lower, domain.upper, size=size)
return items if len(items) > 1 else domain.cast(items[0])
class _LogUniform(LogUniform):
@@ -272,7 +348,10 @@ class Integer(Domain):
domain: "Integer",
spec: Optional[Union[List[Dict], Dict]] = None,
size: int = 1,
random_state: "RandomState" = None,
):
if not isinstance(random_state, _BackwardsCompatibleNumpyRng):
random_state = _BackwardsCompatibleNumpyRng(random_state)
assert domain.lower > 0, "LogUniform needs a lower bound greater than 0"
assert (
0 < domain.upper < float("inf")
@@ -280,8 +359,8 @@ class Integer(Domain):
logmin = np.log(domain.lower) / np.log(self.base)
logmax = np.log(domain.upper) / np.log(self.base)
items = self.base ** (np.random.uniform(logmin, logmax, size=size))
items = np.round(items).astype(int)
items = self.base ** (random_state.uniform(logmin, logmax, size=size))
items = np.floor(items).astype(int)
return items if len(items) > 1 else domain.cast(items[0])
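# np.floor (rather than np.round) prevents samples just under the exclusive
# upper bound from rounding up to it, presumably to match the half-open
# [lower, upper) convention of the uniform integers/randint sampler above.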
default_sampler_cls = _Uniform
@@ -337,9 +416,11 @@ class Categorical(Domain):
domain: "Categorical",
spec: Optional[Union[List[Dict], Dict]] = None,
size: int = 1,
random_state: "RandomState" = None,
):
items = random.choices(domain.categories, k=size)
if not isinstance(random_state, _BackwardsCompatibleNumpyRng):
random_state = _BackwardsCompatibleNumpyRng(random_state)
items = random_state.choice(domain.categories, size=size).tolist()
return items if len(items) > 1 else domain.cast(items[0])
default_sampler_cls = _Uniform
@@ -352,6 +433,11 @@ class Categorical(Domain):
new.set_sampler(self._Uniform())
return new
def grid(self):
new = copy(self)
new.set_sampler(Grid())
return new
def __len__(self):
return len(self.categories)
@@ -381,8 +467,11 @@ class Quantized(Sampler):
domain: Domain,
spec: Optional[Union[List[Dict], Dict]] = None,
size: int = 1,
random_state: "RandomState" = None,
):
values = self.sampler.sample(domain, spec, size)
if not isinstance(random_state, _BackwardsCompatibleNumpyRng):
random_state = _BackwardsCompatibleNumpyRng(random_state)
values = self.sampler.sample(domain, spec, size, random_state=random_state)
quantized = np.round(np.divide(values, self.q)) * self.q
if not isinstance(quantized, np.ndarray):
return domain.cast(quantized)
@@ -462,10 +551,10 @@ def qloguniform(lower: float, upper: float, q: float, base: float = 10):
return Float(lower, upper).loguniform(base).quantized(q)
def choice(categories: List):
def choice(categories: Sequence):
"""Sample a categorical value.
Sampling from ``tune.choice([1, 2])`` is equivalent to sampling from
``random.choice([1, 2])``
``np.random.choice([1, 2])``
"""
return Categorical(categories).uniform()
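# E.g., choice(["relu", "tanh"]) builds a Categorical domain; a seeded draw
# (a sketch, using the sample signature defined above) looks like:
#
#     domain = choice(["relu", "tanh"])
#     value = domain.sample(random_state=0)  # deterministic given the seed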

View File

@@ -7,13 +7,22 @@ try:
except (ImportError, AssertionError):
from . import sample
from ..searcher.variant_generator import generate_variants
from typing import Dict, Optional, Any, Tuple
from typing import Dict, Optional, Any, Tuple, Generator
import numpy as np
import logging
logger = logging.getLogger(__name__)
def generate_variants_compatible(
unresolved_spec: Dict, constant_grid_search: bool = False, random_state=None
) -> Generator[Tuple[Dict, Dict], None, None]:
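    # Ray's own generate_variants does not accept random_state; the TypeError
    # fallback keeps this wrapper working with both the local (seeded) and the
    # ray (unseeded) implementations.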
try:
return generate_variants(unresolved_spec, constant_grid_search, random_state)
except TypeError:
return generate_variants(unresolved_spec, constant_grid_search)
def define_by_run_func(trial, space: Dict, path: str = "") -> Optional[Dict[str, Any]]:
"""Define-by-run function to create the search space.
@@ -417,7 +426,6 @@ def indexof(domain: Dict, config: Dict) -> int:
return index
if config in domain.categories:
return domain.categories.index(config)
# print(config)
for i, cat in enumerate(domain.categories):
if not isinstance(cat, dict):
continue
@@ -491,7 +499,9 @@ def complete_config(
for key, value in space.items():
if key not in config:
config[key] = value
for _, generated in generate_variants({"config": config}):
for _, generated in generate_variants_compatible(
{"config": config}, random_state=flow2.rs_random
):
config = generated["config"]
break
subspace = {}

View File

@@ -1 +1 @@
__version__ = "0.9.1"
__version__ = "0.9.4"

View File

@@ -2,6 +2,11 @@
"cells": [
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"Copyright (c) 2020-2021 Microsoft Corporation. All rights reserved. \n",
"\n",
@@ -22,105 +27,106 @@
"\n",
"In this notebook, we use one real data example (binary classification) to showcase how to use FLAML library together with AzureML.\n",
"\n",
"FLAML requires `Python>=3.6`. To run this notebook example, please install flaml with the `notebook` and `azureml` option:\n",
"FLAML requires `Python>=3.6`. To run this notebook example, please install flaml with the [azureml] option:\n",
"```bash\n",
"pip install flaml[notebook,azureml]\n",
"pip install flaml[azureml]\n",
"```"
],
"metadata": {
"slideshow": {
"slide_type": "slide"
}
}
]
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"!pip install flaml[notebook,azureml]"
],
"metadata": {},
"outputs": [],
"metadata": {}
"source": [
"!pip install flaml[azureml]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Enable mlflow in AzureML workspace"
],
"metadata": {}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import mlflow\n",
"from azureml.core import Workspace\n",
"\n",
"ws = Workspace.from_config()\n",
"mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())"
],
"outputs": [],
"metadata": {}
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"## 2. Classification Example\n",
"### Load data and preprocess\n",
"\n",
"Download [Airlines dataset](https://www.openml.org/d/1169) from OpenML. The task is to predict whether a given flight will be delayed, given the information of the scheduled departure."
],
"metadata": {
"slideshow": {
"slide_type": "slide"
}
}
]
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"from flaml.data import load_openml_dataset\n",
"X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=1169, data_dir='./')"
],
"outputs": [],
"metadata": {
"slideshow": {
"slide_type": "subslide"
},
"tags": []
}
},
"outputs": [],
"source": [
"from flaml.data import load_openml_dataset\n",
"X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=1169, data_dir='./')"
]
},
{
"cell_type": "markdown",
"source": [
"### Run FLAML\n",
"In the FLAML automl run configuration, users can specify the task type, time budget, error metric, learner list, whether to subsample, resampling strategy type, and so on. All these arguments have default values which will be used if users do not provide them. For example, the default ML learners of FLAML are `['lgbm', 'xgboost', 'catboost', 'rf', 'extra_tree', 'lrl1']`. "
],
"metadata": {
"slideshow": {
"slide_type": "slide"
}
}
},
"source": [
"### Run FLAML\n",
"In the FLAML automl run configuration, users can specify the task type, time budget, error metric, learner list, whether to subsample, resampling strategy type, and so on. All these arguments have default values which will be used if users do not provide them. For example, the default ML learners of FLAML are `['lgbm', 'xgboost', 'catboost', 'rf', 'extra_tree', 'lrl1']`. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [],
"source": [
"''' import AutoML class from flaml package '''\n",
"from flaml import AutoML\n",
"automl = AutoML()"
],
"outputs": [],
"metadata": {
"slideshow": {
"slide_type": "slide"
}
}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [],
"source": [
"settings = {\n",
" \"time_budget\": 60, # total running time in seconds\n",
@@ -131,181 +137,77 @@
" \"sample\": False, # whether to subsample training data\n",
" \"log_file_name\": 'airlines_experiment.log', # flaml log file\n",
"}"
],
"outputs": [],
"metadata": {
"slideshow": {
"slide_type": "slide"
}
}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"slideshow": {
"slide_type": "slide"
},
"tags": []
},
"outputs": [],
"source": [
"mlflow.set_experiment(\"flaml\")\n",
"experiment = mlflow.set_experiment(\"flaml\")\n",
"with mlflow.start_run() as run:\n",
" '''The main flaml automl API'''\n",
" automl.fit(X_train=X_train, y_train=y_train, **settings)"
],
"outputs": [],
"metadata": {
"slideshow": {
"slide_type": "slide"
},
"tags": []
}
" automl.fit(X_train=X_train, y_train=y_train, **settings)\n",
" # log the model\n",
" mlflow.sklearn.log_model(automl, \"automl\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Best model and metric"
],
"metadata": {
"slideshow": {
"slide_type": "slide"
}
}
"### Load the model"
]
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"''' retrieve best config and best learner'''\n",
"print('Best ML leaner:', automl.best_estimator)\n",
"print('Best hyperparmeter config:', automl.best_config)\n",
"print('Best accuracy on validation data: {0:.4g}'.format(1 - automl.best_loss))\n",
"print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))"
],
"metadata": {},
"outputs": [],
"metadata": {
"slideshow": {
"slide_type": "slide"
},
"tags": []
}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"automl.model"
],
"outputs": [],
"metadata": {
"slideshow": {
"slide_type": "slide"
}
}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"''' pickle and save the automl object '''\n",
"import pickle\n",
"with open('automl.pkl', 'wb') as f:\n",
" pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)"
],
"outputs": [],
"metadata": {
"slideshow": {
"slide_type": "slide"
}
}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"''' compute predictions of testing dataset ''' \n",
"y_pred = automl.predict(X_test)\n",
"print('Predicted labels', y_pred)\n",
"print('True labels', y_test)\n",
"y_pred_proba = automl.predict_proba(X_test)[:,1]"
],
"outputs": [],
"metadata": {
"slideshow": {
"slide_type": "slide"
},
"tags": []
}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"''' compute different metric values on testing dataset'''\n",
"from flaml.ml import sklearn_metric_loss_score\n",
"print('accuracy', '=', 1 - sklearn_metric_loss_score('accuracy', y_pred, y_test))\n",
"print('roc_auc', '=', 1 - sklearn_metric_loss_score('roc_auc', y_pred_proba, y_test))\n",
"print('log_loss', '=', sklearn_metric_loss_score('log_loss', y_pred_proba, y_test))"
],
"outputs": [],
"metadata": {
"slideshow": {
"slide_type": "slide"
},
"tags": []
}
"automl = mlflow.sklearn.load_model(f\"{run.info.artifact_uri}/automl\")\n",
"print(automl.predict_proba(X_test))\n",
"print(automl.predict(X_test))"
]
},
{
"cell_type": "markdown",
"source": [
"### Log history"
],
"metadata": {
"slideshow": {
"slide_type": "slide"
}
}
},
"source": [
"### Retrieve logs"
]
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"from flaml.data import get_output_from_log\n",
"time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \\\n",
" get_output_from_log(filename = settings['log_file_name'], time_budget = 60)\n",
"\n",
"for config in config_history:\n",
" print(config)"
],
"outputs": [],
"metadata": {
"slideshow": {
"slide_type": "subslide"
},
"tags": []
}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"\n",
"plt.title('Learning Curve')\n",
"plt.xlabel('Wall Clock Time (s)')\n",
"plt.ylabel('Validation Accuracy')\n",
"plt.scatter(time_history, 1 - np.array(valid_loss_history))\n",
"plt.step(time_history, 1 - np.array(best_valid_loss_history), where='post')\n",
"plt.show()"
],
},
"outputs": [],
"metadata": {
"slideshow": {
"slide_type": "slide"
}
}
"source": [
"mlflow.search_runs(experiment_ids=[experiment.experiment_id], filter_string=\"params.learner = 'xgboost'\")"
]
}
],
"metadata": {
"interpreter": {
"hash": "0cfea3304185a9579d09e0953576b57c8581e46e6ebc6dfeb681bc5a511f7544"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3.8.0 64-bit ('blend': conda)"
"display_name": "Python 3.8.0 64-bit ('blend': conda)",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
@@ -317,12 +219,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
},
"interpreter": {
"hash": "0cfea3304185a9579d09e0953576b57c8581e46e6ebc6dfeb681bc5a511f7544"
"version": "3.9.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
}

View File

@@ -1,5 +1,5 @@
[pytest]
addopts = -m "not conda"
markers =
markers =
conda: test related to conda forge distribution

View File

@@ -3,7 +3,7 @@ import os
here = os.path.abspath(os.path.dirname(__file__))
with open("README.md", "r") as fh:
with open("README.md", "r", encoding="UTF-8") as fh:
long_description = fh.read()
@@ -55,9 +55,13 @@ setuptools.setup(
"statsmodels>=0.12.2",
"psutil==5.8.0",
"dataclasses",
"transformers",
"datasets==1.4.1",
"transformers>=4.14",
"datasets",
"torch",
"nltk",
"rouge_score",
"hcrystalball==0.1.10",
"seqeval",
],
"catboost": ["catboost>=0.26"],
"blendsearch": ["optuna==2.8.0"],
@@ -74,9 +78,16 @@ setuptools.setup(
"vw": [
"vowpalwabbit",
],
"nlp": ["transformers", "datasets==1.4.1", "torch"],
"ts_forecast": ["prophet>=1.0.1", "statsmodels>=0.12.2"],
"forecast": ["prophet>=1.0.1", "statsmodels>=0.12.2"],
"nlp": [
"transformers>=4.14",
"datasets",
"torch",
"seqeval",
"nltk",
"rouge_score",
],
"ts_forecast": ["prophet>=1.0.1", "statsmodels>=0.12.2", "hcrystalball==0.1.10"],
"forecast": ["prophet>=1.0.1", "statsmodels>=0.12.2", "hcrystalball==0.1.10"],
"benchmark": ["catboost>=0.26", "psutil==5.8.0", "xgboost==1.3.3"],
},
classifiers=[

View File

@@ -2,6 +2,7 @@ import unittest
import numpy as np
import scipy.sparse
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import pandas as pd
from datetime import datetime
from flaml import AutoML
@@ -98,6 +99,7 @@ class TestClassification(unittest.TestCase):
"ensemble": True,
}
automl.fit(X, y, **automl_settings)
assert automl.model is not None
automl = AutoML()
automl_settings = {
@@ -221,14 +223,28 @@ class TestClassification(unittest.TestCase):
print(automl_experiment.best_estimator)
def test_ray_classification(self):
from sklearn.datasets import make_classification
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
X, y = make_classification(1000, 10)
automl = AutoML()
try:
automl.fit(X, y, time_budget=10, task="classification", use_ray=True)
automl.fit(
X, y, time_budget=10, task="classification", n_concurrent_trials=2
X_train,
y_train,
X_val=X_test,
y_val=y_test,
time_budget=10,
task="classification",
use_ray=True,
)
automl.fit(
X_train,
y_train,
X_val=X_test,
y_val=y_test,
time_budget=10,
task="classification",
n_concurrent_trials=2,
)
except ImportError:
return

View File

@@ -105,6 +105,7 @@ def test_numpy():
task="ts_forecast",
time_budget=3, # time budget in seconds
log_file_name="test/ts_forecast.log",
n_splits=3, # number of splits
)
print(automl.predict(X_train[72:]))
except ImportError:
@@ -280,7 +281,6 @@ def load_multi_dataset_cat(time_horizon):
def test_multivariate_forecast_cat(budget=5):
time_horizon = 180
train_df, test_df = load_multi_dataset_cat(time_horizon)
print(train_df)
X_test = test_df[
["timeStamp", "season", "above_monthly_avg"]
] # test dataframe must contain values for the regressors / multivariate variables
@@ -290,7 +290,7 @@ def test_multivariate_forecast_cat(budget=5):
"time_budget": budget, # total running time in seconds
"metric": "mape", # primary metric
"task": "ts_forecast", # task type
"log_file_name": "test/energy_forecast_numerical.log", # flaml log file
"log_file_name": "test/energy_forecast_categorical.log", # flaml log file
"eval_method": "holdout",
"log_type": "all",
"label": "demand",
@@ -360,3 +360,4 @@ if __name__ == "__main__":
test_forecast_automl(60)
test_multivariate_forecast_num(60)
test_multivariate_forecast_cat(60)
test_numpy()

View File

@@ -101,13 +101,25 @@ def test_mlflow():
"log_file_name": "adult.log", # flaml log file
}
mlflow.set_experiment("flaml")
with mlflow.start_run():
"""The main flaml automl API"""
with mlflow.start_run() as run:
automl.fit(X_train=X_train, y_train=y_train, **settings)
# subprocess.check_call([sys.executable, "-m", "pip", "uninstall", "mlflow"])
mlflow.sklearn.log_model(automl, "automl")
loaded_model = mlflow.pyfunc.load_model(f"{run.info.artifact_uri}/automl")
print(loaded_model.predict(X_test))
automl._mem_thres = 0
print(automl.trainable(automl.points_to_evaluate[0]))
settings["use_ray"] = True
try:
with mlflow.start_run() as run:
automl.fit(X_train=X_train, y_train=y_train, **settings)
mlflow.sklearn.log_model(automl, "automl")
automl = mlflow.sklearn.load_model(f"{run.info.artifact_uri}/automl")
print(automl.predict_proba(X_test))
except ImportError:
pass
# subprocess.check_call([sys.executable, "-m", "pip", "uninstall", "mlflow"])
if __name__ == "__main__":
test_automl(120)

View File

@@ -214,15 +214,14 @@ def test_multioutput():
# predict
print(model.predict(X_test))
#train the model
# train the model
model = RegressorChain(AutoML(task="regression", time_budget=1))
model.fit(X_train, y_train)
# predict
print(model.predict(X_test))
if __name__ == "__main__":
unittest.main()

View File

@@ -38,7 +38,7 @@ class TestWarmStart(unittest.TestCase):
starting_points = automl_experiment.best_config_per_estimator
print("starting_points", starting_points)
print("loss of the starting_points", automl_experiment.best_loss_per_estimator)
starting_point = starting_points['lgbm']
starting_point = starting_points["lgbm"]
hps_to_freeze = ["colsample_bytree", "reg_alpha", "reg_lambda", "log_max_bin"]
# 2. Construct a new class:
@@ -55,17 +55,13 @@ class TestWarmStart(unittest.TestCase):
# if an hp is specified to be frozen, use the value provided in the starting_point
# otherwise use the setting from the original search space
if hp_name in starting_point:
space[hp_name] = {
"domain": starting_point[hp_name]
}
space[hp_name] = {"domain": starting_point[hp_name]}
# (3.1) Configure the search space for hps that are in the original search space
# but you want to change something, for example the range.
revised_hps_to_search = {
"n_estimators": {
"domain": tune.lograndint(lower=10, upper=32768),
"init_value": starting_point.get(
"n_estimators"
)
"init_value": starting_point.get("n_estimators")
or space["n_estimators"].get("init_value", 10),
"low_cost_init_value": space["n_estimators"].get(
"low_cost_init_value", 10
@@ -73,9 +69,7 @@ class TestWarmStart(unittest.TestCase):
},
"num_leaves": {
"domain": tune.lograndint(lower=10, upper=3276),
"init_value": starting_point.get(
"num_leaves"
)
"init_value": starting_point.get("num_leaves")
or space["num_leaves"].get("init_value", 10),
"low_cost_init_value": space["num_leaves"].get(
"low_cost_init_value", 10
@@ -95,7 +89,7 @@ class TestWarmStart(unittest.TestCase):
new_automl_experiment.add_learner(
learner_name=new_estimator_name, learner_class=MyPartiallyFreezedLargeLGBM
)
automl_settings_resume = {
"time_budget": 3,
"metric": "accuracy",

View File

@@ -1,5 +1,7 @@
import sys
import pytest
import pickle
import shutil
@pytest.mark.skipif(sys.platform == "darwin", reason="do not run on mac os")
@@ -53,6 +55,7 @@ def test_hf_data():
automl.fit(
X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings
)
automl = AutoML()
automl.retrain_from_log(
X_train=X_train,
@@ -61,7 +64,11 @@ def test_hf_data():
record_id=0,
**automl_settings
)
with open("automl.pkl", "wb") as f:
pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)
with open("automl.pkl", "rb") as f:
automl = pickle.load(f)
shutil.rmtree("test/data/output/")
automl.predict(X_test)
automl.predict(["test test", "test test"])
automl.predict(
@@ -103,8 +110,8 @@ def _test_custom_data():
automl_settings = {
"gpu_per_trial": 0,
"max_iter": 10,
"time_budget": 300,
"max_iter": 3,
"time_budget": 5,
"task": "seq-classification",
"metric": "accuracy",
}

View File

@@ -0,0 +1,100 @@
import sys
import pytest
def custom_metric(
X_test,
y_test,
estimator,
labels,
X_train,
y_train,
weight_test=None,
weight_train=None,
config=None,
groups_test=None,
groups_train=None,
):
from datasets import Dataset
from flaml.model import TransformersEstimator
if estimator._trainer is None:
estimator._init_model_for_predict(X_test)
trainer = estimator._trainer
estimator._trainer = None
else:
trainer = estimator._trainer
if y_test is not None:
X_test, _ = estimator._preprocess(X_test)
eval_dataset = Dataset.from_pandas(TransformersEstimator._join(X_test, y_test))
else:
X_test, _ = estimator._preprocess(X_test)
eval_dataset = Dataset.from_pandas(X_test)
trainer_compute_metrics_cache = trainer.compute_metrics
trainer.compute_metrics = None
metrics = trainer.evaluate(eval_dataset)
trainer.compute_metrics = trainer_compute_metrics_cache
return metrics["eval_loss"], metrics
@pytest.mark.skipif(sys.platform == "darwin", reason="do not run on mac os")
def test_custom_metric():
from flaml import AutoML
import requests
from datasets import load_dataset
try:
train_dataset = (
load_dataset("glue", "mrpc", split="train").to_pandas().iloc[0:4]
)
dev_dataset = load_dataset("glue", "mrpc", split="train").to_pandas().iloc[0:4]
except requests.exceptions.ConnectionError:
return
custom_sent_keys = ["sentence1", "sentence2"]
label_key = "label"
X_train = train_dataset[custom_sent_keys]
y_train = train_dataset[label_key]
X_val = dev_dataset[custom_sent_keys]
y_val = dev_dataset[label_key]
automl = AutoML()
# test the case when max_iter=1: only retrain, without HPO
automl_settings = {
"gpu_per_trial": 0,
"max_iter": 1,
"time_budget": 5,
"task": "seq-classification",
"metric": custom_metric,
"log_file_name": "seqclass.log",
}
automl_settings["custom_hpo_args"] = {
"model_path": "google/electra-small-discriminator",
"output_dir": "data/output/",
"ckpt_per_epoch": 5,
"fp16": False,
}
automl.fit(
X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings
)
# test calling the custom metric in TransformersEstimator._compute_metrics_by_dataset_name
automl_settings["max_iter"] = 3
automl.fit(
X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings
)
del automl
if __name__ == "__main__":
test_custom_metric()

View File

@@ -40,3 +40,7 @@ def test_cv():
}
automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
if __name__ == "__main__":
test_cv()

View File

@@ -1,68 +0,0 @@
import sys
import pytest
@pytest.mark.skipif(sys.platform == "darwin", reason="do not run on mac os")
def test_max_iter_1():
from flaml import AutoML
import requests
from datasets import load_dataset
try:
train_dataset = (
load_dataset("glue", "mrpc", split="train").to_pandas().iloc[0:4]
)
dev_dataset = load_dataset("glue", "mrpc", split="train").to_pandas().iloc[0:4]
except requests.exceptions.ConnectionError:
return
custom_sent_keys = ["sentence1", "sentence2"]
label_key = "label"
X_train = train_dataset[custom_sent_keys]
y_train = train_dataset[label_key]
X_val = dev_dataset[custom_sent_keys]
y_val = dev_dataset[label_key]
automl = AutoML()
def toy_metric(
X_test,
y_test,
estimator,
labels,
X_train,
y_train,
weight_test=None,
weight_train=None,
config=None,
groups_test=None,
groups_train=None,
):
return 0, {
"test_loss": 0,
"train_loss": 0,
"pred_time": 0,
}
automl_settings = {
"gpu_per_trial": 0,
"max_iter": 1,
"time_budget": 5,
"task": "seq-classification",
"metric": toy_metric,
"log_file_name": "seqclass.log",
}
automl_settings["custom_hpo_args"] = {
"model_path": "google/electra-small-discriminator",
"output_dir": "data/output/",
"ckpt_per_epoch": 5,
"fp16": False,
}
automl.fit(
X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings
)
del automl

View File

@@ -0,0 +1,245 @@
import sys
import pytest
@pytest.mark.skipif(sys.platform == "darwin", reason="do not run on mac os")
def test_mcc():
from flaml import AutoML
import pandas as pd
train_data = {
"video-id": [
"anetv_fruimvo90vA",
"anetv_fruimvo90vA",
"anetv_fruimvo90vA",
"anetv_MldEr60j33M",
"lsmdc0049_Hannah_and_her_sisters-69438",
],
"fold-ind": ["10030", "10030", "10030", "5488", "17405"],
"startphrase": [
"A woman is seen running down a long track and jumping into a pit. The camera",
"A woman is seen running down a long track and jumping into a pit. The camera",
"A woman is seen running down a long track and jumping into a pit. The camera",
"A man in a white shirt bends over and picks up a large weight. He",
"Someone furiously shakes someone away. He",
],
"sent1": [
"A woman is seen running down a long track and jumping into a pit.",
"A woman is seen running down a long track and jumping into a pit.",
"A woman is seen running down a long track and jumping into a pit.",
"A man in a white shirt bends over and picks up a large weight.",
"Someone furiously shakes someone away.",
],
"sent2": ["The camera", "The camera", "The camera", "He", "He"],
"gold-source": ["gen", "gen", "gold", "gen", "gold"],
"ending0": [
"captures her as well as lifting weights down in place.",
"follows her spinning her body around and ends by walking down a lane.",
"watches her as she walks away and sticks her tongue out to another person.",
"lifts the weights over his head.",
"runs to a woman standing waiting.",
],
"ending1": [
"pans up to show another woman running down the track.",
"pans around the two.",
"captures her as well as lifting weights down in place.",
"also lifts it onto his chest before hanging it back out again.",
"tackles him into the passenger seat.",
],
"ending2": [
"follows her movements as the group members follow her instructions.",
"captures her as well as lifting weights down in place.",
"follows her spinning her body around and ends by walking down a lane.",
"spins around and lifts a barbell onto the floor.",
"pounds his fist against a cupboard.",
],
"ending3": [
"follows her spinning her body around and ends by walking down a lane.",
"follows her movements as the group members follow her instructions.",
"pans around the two.",
"bends down and lifts the weight over his head.",
"offers someone the cup on his elbow and strides out.",
],
"label": [1, 3, 0, 0, 2],
}
dev_data = {
"video-id": [
"lsmdc3001_21_JUMP_STREET-422",
"lsmdc0001_American_Beauty-45991",
"lsmdc0001_American_Beauty-45991",
"lsmdc0001_American_Beauty-45991",
],
"fold-ind": ["11783", "10977", "10970", "10968"],
"startphrase": [
"Firing wildly he shoots holes through the tanker. He",
"He puts his spatula down. The Mercedes",
"He stands and looks around, his eyes finally landing on: "
"The digicam and a stack of cassettes on a shelf. Someone",
"He starts going through someone's bureau. He opens the drawer "
"in which we know someone keeps his marijuana, but he",
],
"sent1": [
"Firing wildly he shoots holes through the tanker.",
"He puts his spatula down.",
"He stands and looks around, his eyes finally landing on: "
"The digicam and a stack of cassettes on a shelf.",
"He starts going through someone's bureau.",
],
"sent2": [
"He",
"The Mercedes",
"Someone",
"He opens the drawer in which we know someone keeps his marijuana, but he",
],
"gold-source": ["gold", "gold", "gold", "gold"],
"ending0": [
"overtakes the rig and falls off his bike.",
"fly open and drinks.",
"looks at someone's papers.",
"stops one down and rubs a piece of the gift out.",
],
"ending1": [
"squeezes relentlessly on the peanut jelly as well.",
"walks off followed driveway again.",
"feels around it and falls in the seat once more.",
"cuts the mangled parts.",
],
"ending2": [
"scrambles behind himself and comes in other directions.",
"slots them into a separate green.",
"sprints back from the wreck and drops onto his back.",
"hides it under his hat to watch.",
],
"ending3": [
"sweeps a explodes and knocks someone off.",
"pulls around to the drive - thru window.",
"sits at the kitchen table, staring off into space.",
"does n't discover its false bottom.",
],
"label": [0, 3, 3, 3],
}
test_data = {
"video-id": [
"lsmdc0001_American_Beauty-45991",
"lsmdc0001_American_Beauty-45991",
"lsmdc0001_American_Beauty-45991",
"lsmdc0001_American_Beauty-45991",
],
"fold-ind": ["10980", "10976", "10978", "10969"],
"startphrase": [
"Someone leans out of the drive - thru window, "
"grinning at her, holding bags filled with fast food. The Counter Girl",
"Someone looks up suddenly when he hears. He",
"Someone drives; someone sits beside her. They",
"He opens the drawer in which we know someone "
"keeps his marijuana, but he does n't discover"
" its false bottom. He stands and looks around, his eyes",
],
"sent1": [
"Someone leans out of the drive - thru "
"window, grinning at her, holding bags filled with fast food.",
"Someone looks up suddenly when he hears.",
"Someone drives; someone sits beside her.",
"He opens the drawer in which we know"
" someone keeps his marijuana, but he does n't discover its false bottom.",
],
"sent2": [
"The Counter Girl",
"He",
"They",
"He stands and looks around, his eyes",
],
"gold-source": ["gold", "gold", "gold", "gold"],
"ending0": [
"stands next to him, staring blankly.",
"puts his spatula down.",
"rise someone's feet up.",
"moving to the side, the houses rapidly stained.",
],
"ending1": [
"with auditorium, filmed, singers the club.",
"bumps into a revolver and drops surreptitiously into his weapon.",
"lift her and they are alarmed.",
"focused as the sight of someone making his way down a trail.",
],
"ending2": [
"attempts to block her ransacked.",
"talks using the phone and walks away for a few seconds.",
"are too involved with each other to "
"notice someone watching them from the drive - thru window.",
"finally landing on: the digicam and a stack of cassettes on a shelf.",
],
"ending3": [
"is eating solid and stinky.",
"bundles the flaxen powder beneath the car.",
"sit at a table with a beer from a table.",
"deep and continuing, its bleed - length sideburns pressing on him.",
],
"label": [0, 0, 2, 2],
}
train_dataset = pd.DataFrame(train_data)
dev_dataset = pd.DataFrame(dev_data)
test_dataset = pd.DataFrame(test_data)
custom_sent_keys = [
"sent1",
"sent2",
"ending0",
"ending1",
"ending2",
"ending3",
"gold-source",
"video-id",
"startphrase",
"fold-ind",
]
label_key = "label"
X_train = train_dataset[custom_sent_keys]
y_train = train_dataset[label_key]
X_val = dev_dataset[custom_sent_keys]
y_val = dev_dataset[label_key]
X_test = test_dataset[custom_sent_keys]
X_true = test_dataset[label_key]
automl = AutoML()
automl_settings = {
"gpu_per_trial": 0,
"max_iter": 2,
"time_budget": 5,
"task": "multichoice-classification",
"metric": "accuracy",
"log_file_name": "seqclass.log",
}
automl_settings["custom_hpo_args"] = {
"model_path": "google/electra-small-discriminator",
"output_dir": "test/data/output/",
"ckpt_per_epoch": 5,
"fp16": False,
}
automl.fit(
X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings
)
y_pred = automl.predict(X_test)
proba = automl.predict_proba(X_test)
print(str(len(automl.classes_)) + " classes")
print(y_pred)
print(X_true)
print(proba)
true_count = 0
for i, v in X_true.items():
if y_pred[i] == v:
true_count += 1
accuracy = round(true_count / len(y_pred), 5)
print("Accuracy: " + str(accuracy))
if __name__ == "__main__":
test_mcc()

View File

@@ -0,0 +1,82 @@
import sys
import pytest
@pytest.mark.skipif(sys.platform == "darwin", reason="do not run on mac os")
def test_summarization():
from flaml import AutoML
from pandas import DataFrame
train_dataset = DataFrame(
[
("The cat is alive", "The cat is dead"),
("The cat is alive", "The cat is dead"),
("The cat is alive", "The cat is dead"),
("The cat is alive", "The cat is dead"),
]
)
dev_dataset = DataFrame(
[
("The old woman is beautiful", "The old woman is ugly"),
("The old woman is beautiful", "The old woman is ugly"),
("The old woman is beautiful", "The old woman is ugly"),
("The old woman is beautiful", "The old woman is ugly"),
]
)
test_dataset = DataFrame(
[
("The purse is cheap", "The purse is expensive"),
("The purse is cheap", "The purse is expensive"),
("The purse is cheap", "The purse is expensive"),
("The purse is cheap", "The purse is expensive"),
]
)
for each_dataset in [train_dataset, dev_dataset, test_dataset]:
each_dataset.columns = ["document", "summary"]
custom_sent_keys = ["document"]
label_key = "summary"
X_train = train_dataset[custom_sent_keys]
y_train = train_dataset[label_key]
X_val = dev_dataset[custom_sent_keys]
y_val = dev_dataset[label_key]
X_test = test_dataset[custom_sent_keys]
automl = AutoML()
automl_settings = {
"gpu_per_trial": 0,
"max_iter": 3,
"time_budget": 20,
"task": "summarization",
"metric": "rouge1",
"log_file_name": "seqclass.log",
}
automl_settings["custom_hpo_args"] = {
"model_path": "patrickvonplaten/t5-tiny-random",
"output_dir": "test/data/output/",
"ckpt_per_epoch": 5,
"fp16": False,
}
automl.fit(
X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings
)
automl = AutoML()
automl.retrain_from_log(
X_train=X_train,
y_train=y_train,
train_full=True,
record_id=0,
**automl_settings
)
automl.predict(X_test)
if __name__ == "__main__":
test_summarization()

View File

@@ -0,0 +1,741 @@
import sys
import pytest
@pytest.mark.skipif(sys.platform == "darwin", reason="do not run on mac os")
def test_tokenclassification():
from flaml import AutoML
import pandas as pd
train_data = {
"chunk_tags": [
[11, 21, 11, 12, 21, 22, 11, 12, 0],
[11, 12],
[11, 12],
[
11,
12,
12,
21,
13,
11,
11,
21,
13,
11,
12,
13,
11,
21,
22,
11,
12,
17,
11,
21,
17,
11,
12,
12,
21,
22,
22,
13,
11,
0,
],
],
"id": ["0", "1", "2", "3"],
"ner_tags": [
[3, 0, 7, 0, 0, 0, 7, 0, 0],
[1, 2],
[5, 0],
[
0,
3,
4,
0,
0,
0,
0,
0,
0,
7,
0,
0,
0,
0,
0,
7,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
],
],
"pos_tags": [
[22, 42, 16, 21, 35, 37, 16, 21, 7],
[22, 22],
[22, 11],
[
12,
22,
22,
38,
15,
22,
28,
38,
15,
16,
21,
35,
24,
35,
37,
16,
21,
15,
24,
41,
15,
16,
21,
21,
20,
37,
40,
35,
21,
7,
],
],
"tokens": [
[
"EU",
"rejects",
"German",
"call",
"to",
"boycott",
"British",
"lamb",
".",
],
["Peter", "Blackburn"],
["BRUSSELS", "1996-08-22"],
[
"The",
"European",
"Commission",
"said",
"on",
"Thursday",
"it",
"disagreed",
"with",
"German",
"advice",
"to",
"consumers",
"to",
"shun",
"British",
"lamb",
"until",
"scientists",
"determine",
"whether",
"mad",
"cow",
"disease",
"can",
"be",
"transmitted",
"to",
"sheep",
".",
],
],
}
dev_data = {
"chunk_tags": [
[
11,
11,
12,
13,
11,
12,
12,
11,
12,
12,
12,
12,
21,
13,
11,
12,
21,
22,
11,
13,
11,
1,
13,
11,
17,
11,
12,
12,
21,
1,
0,
],
[
0,
11,
21,
22,
22,
11,
12,
12,
17,
11,
21,
22,
22,
11,
12,
13,
11,
0,
0,
11,
12,
11,
12,
12,
12,
12,
12,
12,
21,
11,
12,
12,
0,
],
[
11,
21,
11,
12,
12,
21,
22,
0,
17,
11,
21,
22,
17,
11,
21,
22,
11,
21,
22,
22,
13,
11,
12,
12,
0,
],
[
11,
21,
11,
12,
11,
12,
13,
11,
12,
12,
12,
12,
21,
22,
11,
12,
0,
11,
0,
11,
12,
13,
11,
12,
12,
12,
12,
12,
21,
11,
12,
1,
2,
2,
11,
21,
22,
11,
12,
0,
],
],
"id": ["4", "5", "6", "7"],
"ner_tags": [
[
5,
0,
0,
0,
0,
3,
4,
0,
0,
0,
1,
2,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
5,
0,
0,
0,
0,
0,
0,
0,
],
[
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
3,
0,
0,
0,
1,
2,
2,
2,
0,
0,
0,
0,
0,
],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0],
[
0,
0,
0,
0,
0,
0,
0,
3,
0,
0,
1,
2,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
],
],
"pos_tags": [
[
22,
27,
21,
35,
12,
22,
22,
27,
16,
21,
22,
22,
38,
15,
22,
24,
20,
37,
21,
15,
24,
16,
15,
22,
15,
12,
16,
21,
38,
17,
7,
],
[
0,
28,
41,
30,
37,
12,
16,
21,
15,
28,
41,
30,
37,
12,
24,
15,
28,
6,
0,
12,
22,
27,
16,
21,
22,
22,
14,
22,
38,
12,
21,
21,
7,
],
[
28,
38,
16,
16,
21,
38,
40,
10,
15,
28,
38,
40,
15,
21,
38,
40,
28,
20,
37,
40,
15,
12,
22,
22,
7,
],
[
28,
38,
12,
21,
16,
21,
15,
22,
22,
22,
22,
22,
35,
37,
21,
24,
6,
24,
10,
16,
24,
15,
12,
21,
10,
21,
21,
24,
38,
12,
30,
16,
10,
16,
21,
35,
37,
16,
21,
7,
],
],
"tokens": [
[
"Germany",
"'s",
"representative",
"to",
"the",
"European",
"Union",
"'s",
"veterinary",
"committee",
"Werner",
"Zwingmann",
"said",
"on",
"Wednesday",
"consumers",
"should",
"buy",
"sheepmeat",
"from",
"countries",
"other",
"than",
"Britain",
"until",
"the",
"scientific",
"advice",
"was",
"clearer",
".",
],
[
'"',
"We",
"do",
"n't",
"support",
"any",
"such",
"recommendation",
"because",
"we",
"do",
"n't",
"see",
"any",
"grounds",
"for",
"it",
",",
'"',
"the",
"Commission",
"'s",
"chief",
"spokesman",
"Nikolaus",
"van",
"der",
"Pas",
"told",
"a",
"news",
"briefing",
".",
],
[
"He",
"said",
"further",
"scientific",
"study",
"was",
"required",
"and",
"if",
"it",
"was",
"found",
"that",
"action",
"was",
"needed",
"it",
"should",
"be",
"taken",
"by",
"the",
"European",
"Union",
".",
],
[
"He",
"said",
"a",
"proposal",
"last",
"month",
"by",
"EU",
"Farm",
"Commissioner",
"Franz",
"Fischler",
"to",
"ban",
"sheep",
"brains",
",",
"spleens",
"and",
"spinal",
"cords",
"from",
"the",
"human",
"and",
"animal",
"food",
"chains",
"was",
"a",
"highly",
"specific",
"and",
"precautionary",
"move",
"to",
"protect",
"human",
"health",
".",
],
],
}
train_dataset = pd.DataFrame(train_data)
dev_dataset = pd.DataFrame(dev_data)
custom_sent_keys = ["tokens"]
label_key = "ner_tags"
X_train = train_dataset[custom_sent_keys]
y_train = train_dataset[label_key]
X_val = dev_dataset[custom_sent_keys]
y_val = dev_dataset[label_key]
automl = AutoML()
automl_settings = {
"gpu_per_trial": 0,
"max_iter": 2,
"time_budget": 5,
"task": "token-classification",
"metric": "seqeval",
}
automl_settings["custom_hpo_args"] = {
"model_path": "bert-base-uncased",
"output_dir": "test/data/output/",
"ckpt_per_epoch": 5,
"fp16": False,
}
automl.fit(
X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings
)
if __name__ == "__main__":
test_tokenclassification()

View File

@@ -0,0 +1,46 @@
import ray
import lightgbm as lgb
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from flaml import tune
from flaml.model import LGBMEstimator
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
def train_breast_cancer(config):
params = LGBMEstimator(**config).params
train_set = lgb.Dataset(X_train, label=y_train)
gbm = lgb.train(params, train_set)
preds = gbm.predict(X_test)
pred_labels = np.rint(preds)
tune.report(mean_accuracy=accuracy_score(y_test, pred_labels), done=True)
if __name__ == "__main__":
ray.init(address="auto")
flaml_lgbm_search_space = LGBMEstimator.search_space(X_train.shape)
config_search_space = {
hp: space["domain"] for hp, space in flaml_lgbm_search_space.items()
}
low_cost_partial_config = {
hp: space["low_cost_init_value"]
for hp, space in flaml_lgbm_search_space.items()
if "low_cost_init_value" in space
}
analysis = tune.run(
train_breast_cancer,
metric="mean_accuracy",
mode="max",
config=config_search_space,
num_samples=-1,
time_budget_s=60,
use_ray=True,
)
# print("Best hyperparameters found were: ", analysis.best_config)
print("The best trial's result: ", analysis.best_trial.last_result)

36
test/rep.py Normal file
View File

@@ -0,0 +1,36 @@
from flaml.data import load_openml_dataset
from flaml.ml import ExtraTreesEstimator
from flaml import AutoML
X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=1169, data_dir="./")
X_train = X_train.iloc[:1000]
y_train = y_train.iloc[:1000]
class ExtraTreesEstimatorSeeded(ExtraTreesEstimator):
"""ExtraTreesEstimator for reproducible FLAML run."""
def config2params(self, config: dict) -> dict:
params = super().config2params(config)
params["random_state"] = 0
return params
settings = {
"time_budget": 1e10, # total running time in seconds
"max_iter": 3,
"metric": "ap", # average_precision
"task": "classification", # task type
"seed": 7654321, # random seed
"estimator_list": ["extra_trees_seeded"],
"verbose": False,
}
for trial_num in range(8):
automl = AutoML()
automl.add_learner(
learner_name="extra_trees_seeded", learner_class=ExtraTreesEstimatorSeeded
)
automl.fit(X_train=X_train, y_train=y_train, **settings)
print(automl.best_loss)
print(automl.best_config)

View File

@@ -1,19 +1,21 @@
from azureml.core import Workspace, Experiment, ScriptRunConfig
ws = Workspace.from_config()
compute_target = ws.compute_targets['V100-4']
compute_target = ws.compute_targets["V100-4"]
# compute_target = ws.compute_targets['K80']
command = [
"pip install torch transformers datasets flaml[blendsearch,ray] && ",
"python test_electra.py"]
"python test_electra.py",
]
config = ScriptRunConfig(
source_directory='hf/',
source_directory="hf/",
command=command,
compute_target=compute_target,
)
exp = Experiment(ws, 'test-electra')
exp = Experiment(ws, "test-electra")
run = exp.submit(config)
print(run.get_portal_url()) # link to ml.azure.com
run.wait_for_completion(show_output=True)

View File

@@ -11,8 +11,8 @@ def test_package_minimum():
# Specify automl goal and constraint
automl_settings = {
"time_budget": 10, # in seconds
"metric": 'accuracy',
"task": 'classification',
"metric": "accuracy",
"task": "classification",
"log_file_name": "iris.log",
}
X_train, y_train = load_iris(return_X_y=True)
@@ -27,4 +27,3 @@ def test_package_minimum():
preds = automl.predict_proba(X_train)
assert preds.shape == (150, 3)
print(preds)

View File

@@ -14,10 +14,9 @@ X_train, X_test, y_train, y_test = train_test_split(
def train_lgbm(config: dict) -> dict:
# convert config dict to lgbm params
params = LGBMEstimator(**config).params
num_boost_round = params.pop("n_estimators")
# train the model
train_set = lightgbm.Dataset(X_train, y_train)
model = lightgbm.train(params, train_set, num_boost_round)
model = lightgbm.train(params, train_set)
# evaluate the model
pred = model.predict(X_test)
mse = mean_squared_error(y_test, pred)
@@ -37,6 +36,14 @@ low_cost_partial_config = {
for hp, space in flaml_lgbm_search_space.items()
if "low_cost_init_value" in space
}
# initial points to evaluate
points_to_evaluate = [
{
hp: space["init_value"]
for hp, space in flaml_lgbm_search_space.items()
if "init_value" in space
}
]
# run the tuning, minimizing mse, with total time budget 3 seconds
analysis = tune.run(
train_lgbm,
@@ -44,6 +51,7 @@ analysis = tune.run(
mode="min",
config=config_search_space,
low_cost_partial_config=low_cost_partial_config,
points_to_evaluate=points_to_evaluate,
time_budget_s=3,
num_samples=-1,
)

View File

@@ -71,7 +71,11 @@ If all the tests are passed, please also test run [notebook/automl_classificatio
### Documentation
To build and test documentation locally, install [Node.js](https://nodejs.org/en/download/).
To build and test documentation locally, install [Node.js](https://nodejs.org/en/download/). For example,
```bash
nvm install --lts
```
Then:
@@ -79,7 +83,7 @@ Then:
npm install --global yarn
pip install pydoc-markdown
cd website
yarn install
yarn install --frozen-lockfile
pydoc-markdown
yarn start
```

View File

@@ -58,10 +58,10 @@ from flaml import AutoML
from datasets import load_dataset
train_dataset = (
load_dataset("glue", "stsb", split="train[:1%]").to_pandas().iloc[0:4]
load_dataset("glue", "stsb", split="train").to_pandas()
)
dev_dataset = (
load_dataset("glue", "stsb", split="train[1%:2%]").to_pandas().iloc[0:4]
load_dataset("glue", "stsb", split="train").to_pandas()
)
custom_sent_keys = ["sentence1", "sentence2"]
label_key = "label"
@@ -86,4 +86,123 @@ automl_settings["custom_hpo_args"] = {
automl.fit(
X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings
)
```
```
#### Sample output
```
[flaml.automl: 12-20 11:47:28] {1965} INFO - task = seq-regression
[flaml.automl: 12-20 11:47:28] {1967} INFO - Data split method: uniform
[flaml.automl: 12-20 11:47:28] {1971} INFO - Evaluation method: holdout
[flaml.automl: 12-20 11:47:28] {2063} INFO - Minimizing error metric: rmse
[flaml.automl: 12-20 11:47:28] {2115} INFO - List of ML learners in AutoML Run: ['transformer']
[flaml.automl: 12-20 11:47:28] {2355} INFO - iteration 0, current learner transformer
```
### A simple summarization example
```python
from flaml import AutoML
from datasets import load_dataset
train_dataset = (
load_dataset("xsum", split="train").to_pandas()
)
dev_dataset = (
load_dataset("xsum", split="validation").to_pandas()
)
custom_sent_keys = ["document"]
label_key = "summary"
X_train = train_dataset[custom_sent_keys]
y_train = train_dataset[label_key]
X_val = dev_dataset[custom_sent_keys]
y_val = dev_dataset[label_key]
automl = AutoML()
automl_settings = {
"gpu_per_trial": 1,
"time_budget": 20,
"task": "summarization",
"metric": "rouge1",
}
automl_settings["custom_hpo_args"] = {
"model_path": "t5-small",
"output_dir": "data/output/",
"ckpt_per_epoch": 5,
"fp16": False,
}
automl.fit(
X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings
)
```
#### Sample Output
```
[flaml.automl: 12-20 11:44:03] {1965} INFO - task = summarization
[flaml.automl: 12-20 11:44:03] {1967} INFO - Data split method: uniform
[flaml.automl: 12-20 11:44:03] {1971} INFO - Evaluation method: holdout
[flaml.automl: 12-20 11:44:03] {2063} INFO - Minimizing error metric: -rouge
[flaml.automl: 12-20 11:44:03] {2115} INFO - List of ML learners in AutoML Run: ['transformer']
[flaml.automl: 12-20 11:44:03] {2355} INFO - iteration 0, current learner transformer
loading configuration file https://huggingface.co/t5-small/resolve/main/config.json from cache at /home/xliu127/.cache/huggingface/transformers/fe501e8fd6425b8ec93df37767fcce78ce626e34cc5edc859c662350cf712e41.406701565c0afd9899544c1cb8b93185a76f00b31e5ce7f6e18bbaef02241985
Model config T5Config {
"_name_or_path": "t5-small",
"architectures": [
"T5WithLMHeadModel"
],
"d_ff": 2048,
"d_kv": 64,
"d_model": 512,
"decoder_start_token_id": 0,
"dropout_rate": 0.1,
"eos_token_id": 1,
"feed_forward_proj": "relu",
"initializer_factor": 1.0,
"is_encoder_decoder": true,
"layer_norm_epsilon": 1e-06,
"model_type": "t5",
"n_positions": 512,
"num_decoder_layers": 6,
"num_heads": 8,
"num_layers": 6,
"output_past": true,
"pad_token_id": 0,
"relative_attention_num_buckets": 32,
"task_specific_params": {
"summarization": {
"early_stopping": true,
"length_penalty": 2.0,
"max_length": 200,
"min_length": 30,
"no_repeat_ngram_size": 3,
"num_beams": 4,
"prefix": "summarize: "
},
"translation_en_to_de": {
"early_stopping": true,
"max_length": 300,
"num_beams": 4,
"prefix": "translate English to German: "
},
"translation_en_to_fr": {
"early_stopping": true,
"max_length": 300,
"num_beams": 4,
"prefix": "translate English to French: "
},
"translation_en_to_ro": {
"early_stopping": true,
"max_length": 300,
"num_beams": 4,
"prefix": "translate English to Romanian: "
}
},
"transformers_version": "4.14.1",
"use_cache": true,
"vocab_size": 32128
}
```
For tasks that are not currently supported, use `flaml.tune` for [customized tuning](Tune-HuggingFace).

View File

@@ -1,4 +1,4 @@
FLAML can be used together with AzureML and mlflow.
FLAML can be used together with AzureML. On top of that, mlflow and ray are easy to use as well.
### Prerequisites
@@ -28,12 +28,11 @@ mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())
```python
from flaml.data import load_openml_dataset
from flaml import AutoML
# Download [Airlines dataset](https://www.openml.org/d/1169) from OpenML. The task is to predict whether a given flight will be delayed, given the information of the scheduled departure.
X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=1169, data_dir="./")
from flaml import AutoML
automl = AutoML()
settings = {
"time_budget": 60, # total running time in seconds
@@ -41,11 +40,134 @@ settings = {
"task": "classification", # task type
"log_file_name": "airlines_experiment.log", # flaml log file
}
mlflow.set_experiment("flaml") # the experiment name in AzureML workspace
experiment = mlflow.set_experiment("flaml") # the experiment name in AzureML workspace
with mlflow.start_run() as run: # create a mlflow run
automl.fit(X_train=X_train, y_train=y_train, **settings)
mlflow.sklearn.log_model(automl, "automl")
```
The metrics in the run will be automatically logged in an experiment named "flaml" in your AzureML workspace.
The metrics in the run will be automatically logged in an experiment named "flaml" in your AzureML workspace. They can be retrieved by `mlflow.search_runs`:
[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/integrate_azureml.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/integrate_azureml.ipynb)
```python
mlflow.search_runs(experiment_ids=[experiment.experiment_id], filter_string="params.learner = 'xgboost'")
```
The logged model can be loaded and used to make predictions:
```python
automl = mlflow.sklearn.load_model(f"{run.info.artifact_uri}/automl")
print(automl.predict(X_test))
```
[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/integrate_azureml.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/integrate_azureml.ipynb)
### Use ray to distribute across a cluster
When you have a compute cluster in AzureML, you can distribute `flaml.AutoML` or `flaml.tune` with ray.
#### Build a ray environment in AzureML
Create a docker file such as [.Docker/Dockerfile-cpu](https://github.com/microsoft/FLAML/blob/main/test/.Docker/Dockerfile-cpu). Make sure `RUN pip install flaml[blendsearch,ray]` is included in the docker file.
Then build an AzureML environment in the workspace `ws`.
```python
ray_environment_name = "aml-ray-cpu"
ray_environment_dockerfile_path = "./Docker/Dockerfile-cpu"
# Build CPU image for Ray
ray_cpu_env = Environment.from_dockerfile(name=ray_environment_name, dockerfile=ray_environment_dockerfile_path)
ray_cpu_env.register(workspace=ws)
ray_cpu_build_details = ray_cpu_env.build(workspace=ws)
import time
while ray_cpu_build_details.status not in ["Succeeded", "Failed"]:
print(f"Awaiting completion of ray CPU environment build. Current status is: {ray_cpu_build_details.status}")
time.sleep(10)
```
You only need to do this step once per workspace.
#### Create a compute cluster with multiple nodes
```python
from azureml.core.compute import AmlCompute, ComputeTarget
compute_target_name = "cpucluster"
node_count = 2
# This example uses a CPU VM. To use a GPU VM, set the SKU to STANDARD_NC6
compute_target_size = "STANDARD_D2_V2"
if compute_target_name in ws.compute_targets:
compute_target = ws.compute_targets[compute_target_name]
if compute_target and type(compute_target) is AmlCompute:
if compute_target.provisioning_state == "Succeeded":
print("Found compute target; using it:", compute_target_name)
else:
raise Exception(
"Found compute target but it is in state", compute_target.provisioning_state)
else:
print("creating a new compute target...")
provisioning_config = AmlCompute.provisioning_configuration(
vm_size=compute_target_size,
min_nodes=0,
max_nodes=node_count)
# Create the cluster
compute_target = ComputeTarget.create(ws, compute_target_name, provisioning_config)
# Can poll for a minimum number of nodes and for a specific timeout.
# If no min node count is provided it will use the scale settings for the cluster
compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)
# For a more detailed view of current AmlCompute status, use get_status()
print(compute_target.get_status().serialize())
```
If the compute target "cpucluster" already exists, it will not be recreated.
#### Run distributed AutoML job
Assume you have an AutoML script like [ray/distribute_automl.py](https://github.com/microsoft/FLAML/blob/main/test/ray/distribute_automl.py). It uses `ray.init(address="auto")` to initialize the cluster and `n_concurrent_trials=k` to tell `AutoML.fit()` to perform k concurrent trials in parallel.
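A minimal sketch of what such a script might contain is below; the dataset and settings are illustrative, not the actual content of `distribute_automl.py`:
```python
import ray
from flaml import AutoML
from sklearn.datasets import load_breast_cancer

# Connect to the ray cluster that AzureML starts on the nodes.
ray.init(address="auto")

X_train, y_train = load_breast_cancer(return_X_y=True)
automl = AutoML()
automl.fit(
    X_train=X_train,
    y_train=y_train,
    task="classification",
    time_budget=60,
    n_concurrent_trials=2,  # run 2 trials in parallel across the cluster
)
print(automl.best_config)
```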
Submit an AzureML job as follows:
```python
from azureml.core import Workspace, Experiment, ScriptRunConfig, Environment
command = ["python distribute_automl.py"]
ray_environment_name = 'aml-ray-cpu'
env = Environment.get(workspace=ws, name=ray_environment_name)
config = ScriptRunConfig(
source_directory='ray/',
command=command,
compute_target=compute_target,
environment=env,
)
config.run_config.node_count = 2
config.run_config.environment_variables["_AZUREML_CR_START_RAY"] = "true"
config.run_config.environment_variables["AZUREML_COMPUTE_USE_COMMON_RUNTIME"] = "true"
exp = Experiment(ws, 'distribute-automl')
run = exp.submit(config)
print(run.get_portal_url()) # link to ml.azure.com
run.wait_for_completion(show_output=True)
```
The line `config.run_config.environment_variables["_AZUREML_CR_START_RAY"] = "true"` tells AzureML to start ray on each node of the cluster.
#### Run distributed tune job
Prepare a script like [ray/distribute_tune.py](https://github.com/microsoft/FLAML/blob/main/test/ray/distribute_tune.py). Replace the command in the above example with:
```python
command = ["python distribute_tune.py"]
```
Everything else is the same.

View File

@@ -2,6 +2,10 @@
This example uses flaml to finetune a transformer model from Huggingface transformers library.
*Note*: `flaml.AutoML` has built-in support for certain finetuning tasks with a
[higher-level API](AutoML-NLP).
It may be easier to use that API unless you have special requirements it does not handle.
### Requirements
This example requires a GPU. Install dependencies:
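The install command itself is cut off by the diff context; a plausible one, matching the AzureML submission script shown earlier in this commit, is:
```bash
pip install torch transformers datasets flaml[blendsearch,ray]
```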

View File

@@ -49,10 +49,9 @@ from flaml.model import LGBMEstimator
def train_lgbm(config: dict) -> dict:
# convert config dict to lgbm params
params = LGBMEstimator(**config).params
num_boost_round = params.pop("n_estimators")
# train the model
train_set = lightgbm.Dataset(X_train, y_train)
model = lightgbm.train(params, train_set, num_boost_round)
model = lightgbm.train(params, train_set)
# evaluate the model
pred = model.predict(X_test)
mse = mean_squared_error(y_test, pred)
@@ -75,6 +74,7 @@ analysis = tune.run(
low_cost_partial_config=low_cost_partial_config, time_budget_s=3, num_samples=-1,
)
```
Please see this [script](https://github.com/microsoft/FLAML/blob/main/test/tune_example.py) for the complete version of the above example.
### Where to Go Next?

View File

@@ -14,6 +14,7 @@
- 'rank': learning to rank.
- 'seq-classification': sequence classification.
- 'seq-regression': sequence regression.
- 'summarization': text summarization.
An optional input is `time_budget` for searching models and hyperparameters. When not specified, a default budget of 60 seconds will be used.
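As a minimal illustration of the `task` and `time_budget` inputs (the iris data is a stand-in, not tied to any particular task above):
```python
from flaml import AutoML
from sklearn.datasets import load_iris

X_train, y_train = load_iris(return_X_y=True)
automl = AutoML()
# task selects the problem type; when time_budget is omitted, 60 seconds is used.
automl.fit(X_train=X_train, y_train=y_train, task="classification", time_budget=10)
```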
@@ -302,6 +303,10 @@ By default, flaml uses the following method to split the data:
The data split method for classification can be changed into uniform split by setting `split_type="uniform"`. For both classification and regression, time-based split can be enforced if the data are sorted by timestamps, by setting `split_type="time"`.
When `eval_method="cv"`, `split_type` can also be set as a custom splitter. It needs to be an instance of a derived class of scikit-learn
[KFold](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold)
and have ``split`` and ``get_n_splits`` methods with the same signatures.
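A minimal sketch of such a custom splitter, reusing the `automl`, `X_train`, and `y_train` from the snippet above (`ShuffledKFold` is an illustrative name):
```python
from sklearn.model_selection import KFold

class ShuffledKFold(KFold):
    # A trivial KFold subclass that always shuffles with a fixed seed (illustrative).
    def __init__(self, n_splits=5):
        super().__init__(n_splits=n_splits, shuffle=True, random_state=0)

automl.fit(
    X_train=X_train,
    y_train=y_train,
    task="classification",
    eval_method="cv",
    split_type=ShuffledKFold(n_splits=5),
    time_budget=10,
)
```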
### Parallel tuning
When you have parallel resources, you can either spend them in training and keep the model search sequential, or perform parallel search. Following scikit-learn, the parameter `n_jobs` specifies how many CPU cores to use for each training job. The number of parallel trials is specified via the parameter `n_concurrent_trials`. By default, `n_jobs=-1, n_concurrent_trials=1`. That is, all the CPU cores (in a single compute node) are used for training a single model and the search is sequential. When you have more resources than what each single training job needs, you can consider increasing `n_concurrent_trials`.
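For example, assuming the same `automl` and data as above, and that ray is installed (e.g. via `pip install flaml[blendsearch,ray]`), a parallel search could be configured as:
```python
automl.fit(
    X_train=X_train,
    y_train=y_train,
    task="classification",
    time_budget=60,
    n_jobs=4,               # CPU cores used by each training job
    n_concurrent_trials=2,  # number of trials searched in parallel
)
```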
@@ -462,9 +467,9 @@ The curve suggests that increasing the time budget may further improve the accur
1. set t1 as the time budget, and check the message at the end of the console log. If the budget is too small, you will see a warning like
> WARNING - Time taken to find the best model is 91% of the provided time budget and not all estimators' hyperparameter search converged. Consider increasing the time budget.
2. set t2 as the time budget, and also set `early_stop=True`. If early stopping is triggered, you will see a warning like
> WARNING - All estimator hyperparameters local search has converged at least once, and the total search time exceeds 10 times the time taken to find the best model.
> WARNING - All estimator hyperparameters local search has converged at least once, and the total search time exceeds 10 times the time taken to find the best model.
> WARNING - Stopping search as early_stop is set to True.
> WARNING - Stopping search as early_stop is set to True.
### How much time is needed to find the best model

View File

@@ -436,13 +436,12 @@ analysis = tune.run(
### Reproducibility
By default, there is randomness in our tuning process. If reproducibility is desired, you could
manually set a random seed before calling `tune.run()`. For example, in the following code, we call `np.random.seed(100)` to set the random seed.
With this random seed, running the following code multiple times will generate exactly the same search trajectory.
By default, there is randomness in our tuning process (for versions <= 0.9.1). If reproducibility is desired, you could manually set a random seed before calling `tune.run()`. For example, in the following code, we call `np.random.seed(100)` to set the random seed.
With this random seed, running the following code multiple times will generate exactly the same search trajectory. Reproducibility can only be guaranteed in sequential tuning.
```python
import numpy as np
np.random.seed(100)
np.random.seed(100) # This line is not needed starting from version v0.9.2.
analysis = tune.run(
simple_obj,
config=config_search_space,

View File

@@ -95,7 +95,7 @@ module.exports = {
sidebarPath: require.resolve('./sidebars.js'),
// Please change this to your repo.
editUrl:
'https://github.com/microsoft/FLAML/edit/master/website/',
'https://github.com/microsoft/FLAML/edit/main/website/',
remarkPlugins: [math],
rehypePlugins: [katex],
},

View File

@@ -11,7 +11,7 @@
module.exports = {
docsSidebar: [
'Getting-Started',
'Getting-Started',
'Installation',
{'Use Cases': [{type: 'autogenerated', dirName: 'Use-Cases'}]},
{'Examples': [{type: 'autogenerated', dirName: 'Examples'}]},