automate huggingface transformer
This commit is contained in:
Xueqing Liu
2021-06-09 11:37:03 -04:00
committed by GitHub
parent e031c2eb7d
commit a4049ad9b6
29 changed files with 4316 additions and 0 deletions

2
.gitignore vendored

@@ -153,3 +153,5 @@ notebook/.azureml
mlruns
logs
automl.pkl
.idea/*


@@ -38,6 +38,13 @@ Tune
    :members:

NLP
------

.. autoclass:: flaml.nlp.AutoTransformers
    :members:

.. Indices and tables
.. ==================

32
flaml/nlp/README.md Normal file

@@ -0,0 +1,32 @@
How to use AutoTransformers:
```python
from flaml.nlp.autotransformers import AutoTransformers

autohf = AutoTransformers()
preparedata_setting = {
    "dataset_subdataset_name": "glue:rte",
    "pretrained_model_size": "electra-base-discriminator:base",
    "data_root_path": "data/",
    "max_seq_length": 128,
}
autohf.prepare_data(**preparedata_setting)

autohf_settings = {
    "resources_per_trial": {"gpu": 1, "cpu": 1},
    "num_samples": -1,  # unlimited sample size
    "time_budget": 3600,
    "ckpt_per_epoch": 1,
    "fp16": False,
}
validation_metric, analysis = autohf.fit(**autohf_settings)
```
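After `fit()` completes, predictions for the test split can be obtained from the same `autohf` instance. A minimal sketch, assuming the resplit ("rspt") mode, in which test labels are available:

```python
predictions, test_metric = autohf.predict()
```

When the original GLUE split is used instead, the test labels are hidden, so `predict()` returns `(predictions, None)`, and `output_prediction()` can be used to prepare a .zip file for GLUE submission.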
The current use cases that are supported:

1. A simplified version of fine-tuning the GLUE dataset using HuggingFace;
2. Selecting a better search space for fine-tuning the GLUE dataset;
3. Using the search algorithms in FLAML for more efficient fine-tuning of HuggingFace models.

The use cases that can be supported in the future:

1. HPO fine-tuning for text generation;
2. HPO fine-tuning for question answering.

2
flaml/nlp/__init__.py Normal file

@@ -0,0 +1,2 @@
from flaml.nlp.autotransformers import AutoTransformers
from flaml.nlp.result_analysis.azure_utils import AzureUtils, JobID


@@ -0,0 +1,852 @@
import json
import os
import torch
import transformers
import wandb
from .dataset.dataprocess_auto import AutoEncodeText
import numpy as np
from ray.tune import CLIReporter
import time
import ray
import datasets
from datasets import load_dataset
from transformers.trainer_utils import IntervalStrategy, HPSearchBackend
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig, TrainingArguments
from .dataset.metric_auto import get_default_and_alternative_metric
from .dataset.submission_auto import auto_output_prediction
from .dataset.task_auto import get_default_task
from .hpo.grid_searchspace_auto import AutoGridSearchSpace
from .hpo.hpo_searchspace import AutoHPOSearchSpace
from .huggingface.switch_head_auto import AutoSeqClassificationHead, MODEL_CLASSIFICATION_HEAD_MAPPING
from .utils import PathUtils, _variable_override_default_alternative
from .hpo.searchalgo_auto import AutoSearchAlgorithm
from .hpo.scheduler_auto import AutoScheduler
from .result_analysis.wandb_utils import WandbUtils
from .result_analysis.azure_utils import JobID
from .utils import load_console_args
from .huggingface.trainer import TrainerForAutoTransformers
import logging
transformers.logging.set_verbosity_error()
logger = logging.getLogger(__name__)
logger_formatter = logging.Formatter(
'[%(name)s: %(asctime)s] {%(lineno)d} %(levelname)s - %(message)s',
'%m-%d %H:%M:%S')
task_list = [
"seq-classification",
"regression",
"question-answering"
]
class AutoTransformers:
'''The AutoTransformers class
Example:
.. code-block:: python
autohf = AutoTransformers()
autohf_settings = {"resources_per_trial": {"cpu": 1},
"num_samples": -1,
"time_budget": 100000,
"ckpt_per_epoch": 1,
"fp16": False,
}
validation_metric, analysis = autohf.fit(**autohf_settings)
'''
@staticmethod
def _convert_dict_to_ray_tune_space(config_json, mode="grid"):
search_space = {}
if mode == "grid":
for each_hp in config_json.keys():
this_config = config_json[each_hp]
assert isinstance(this_config, dict) or isinstance(this_config, list), \
"config of " + each_hp + " must be dict or list"
search_space[each_hp] = ray.tune.grid_search(this_config)
else:
for each_hp in config_json.keys():
this_config = config_json[each_hp]
assert isinstance(this_config, dict) or isinstance(this_config, list), \
"config of " + each_hp + " must be dict or list"
if isinstance(this_config, dict):
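# a dict describes a continuous range with keys "l" (lower), "u" (upper) and "space" ("log", "linear" or "quniform")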
lower = this_config["l"]
upper = this_config["u"]
space = this_config["space"]
if space == "log":
search_space[each_hp] = ray.tune.loguniform(lower, upper)
elif space == "linear":
search_space[each_hp] = ray.tune.uniform(lower, upper)
elif space == "quniform":
search_space[each_hp] = ray.tune.quniform(lower, upper, this_config["interval"])
else:
search_space[each_hp] = ray.tune.choice(this_config)
return search_space
def _set_search_space(self,
**custom_hpo_args):
search_space_dict_hpo = search_space_dict_grid = None
if self.jobid_config.mod == "grid":
search_space_grid_json = AutoGridSearchSpace.from_model_and_dataset_name(self.jobid_config.pre,
self.jobid_config.presz,
self.get_full_data_name(),
self.jobid_config.subdat, "grid")
search_space_dict_grid \
= AutoTransformers._convert_dict_to_ray_tune_space(search_space_grid_json, mode="grid")
search_space_dict_hpo = search_space_dict_grid
if self.jobid_config.mod != "grid" and self.jobid_config.mod != "gridbert":
search_space_hpo_json \
= AutoHPOSearchSpace.from_model_and_dataset_name(logger,
self.jobid_config.spa,
self.jobid_config.pre,
self.jobid_config.presz,
self.get_full_data_name(),
self.jobid_config.subdat,
**custom_hpo_args)
search_space_dict_hpo = AutoTransformers._convert_dict_to_ray_tune_space(search_space_hpo_json, mode="hpo")
elif self.jobid_config.mod == "gridbert":
search_space_hpo_json = AutoGridSearchSpace.from_model_and_dataset_name(
"bert",
"base",
self.get_full_data_name(),
self.jobid_config.subdat, "grid")
search_space_dict_hpo = AutoTransformers._convert_dict_to_ray_tune_space(search_space_hpo_json, mode="grid")
"""
resolve the conflict in search_space_dict_hpo: only one of "max_steps" and "num_train_epochs" can exist
in the search space. If both exist, num_train_epochs is removed. Similarly, if "warmup_steps" and
"warmup_ratio" both exist, warmup_ratio is removed
"""
search_space_dict_hpo = TrainerForAutoTransformers.resolve_hp_conflict(search_space_dict_hpo)
self._search_space_hpo = search_space_dict_hpo
if self.jobid_config.mod == "grid":
search_space_dict_grid = TrainerForAutoTransformers.resolve_hp_conflict(search_space_dict_grid)
self._search_space_grid = search_space_dict_grid
else:
self._search_space_grid = None
try:
self.ds_config = custom_hpo_args["ds_config"]
except KeyError:
self.ds_config = None
def _wrapper(self, func, *args):  # call func with the positional args unpacked
return func(*args)
def _get_split_name(self, data_raw, fold_name=None):
if fold_name:
return fold_name
fold_keys = data_raw.keys()
if fold_keys == {"train", "validation", "test"}:
return "train", "validation", "test"
for each_key in fold_keys:
for each_split_name in {"train", "validation", "test"}:
assert not (each_key.startswith(each_split_name) and each_key != each_split_name), \
"Dataset split must be within {}, must be explicitly specified in dataset_config, e.g.," \
"'fold_name': ['train', 'validation_matched', 'test_matched']. Please refer to the example in the " \
"documentation of AutoTransformers.prepare_data()".format(",".join(fold_keys))
return "train", "validation", "test"
def prepare_data(self,
data_root_path,
jobid_config=None,
is_wandb_on=False,
server_name=None,
max_seq_length=128,
fold_name=None,
resplit_portion=None,
**custom_data_args):
'''Prepare data
An example:
preparedata_setting = {
"server_name": "tmdev",
"data_root_path": "data/",
"max_seq_length": 128,
"jobid_config": jobid_config,
"wandb_utils": wandb_utils,
"resplit_portion": {"source": ["train", "validation"],
"train": [0, 0.8], "validation": [0.8, 0.9], "test": [0.9, 1.0]}
}
autohf.prepare_data(**preparedata_setting)
Args:
server_name:
a string variable, which can be tmdev or azureml
data_root_path:
the root path for storing the checkpoints and output results, e.g., "data/"
jobid_config:
a JobID object describing the profile of job
is_wandb_on:
a boolean of whether wandb is turned on
max_seq_length (optional):
the maximum sequence length for the HuggingFace tokenizer; this hyperparameter must be specified
at the data processing step
resplit_portion:
the proportion for resplitting the train and dev data when the split mode is "rspt" (resplit).
If the split mode is "rspt", resplit_portion is required
'''
console_args = load_console_args(**custom_data_args)
self._max_seq_length = max_seq_length
self._server_name = server_name if server_name is not None else "tmdev"
self.jobid_config = jobid_config if jobid_config is not None else JobID(console_args)
self.wandb_utils = WandbUtils(is_wandb_on=is_wandb_on,
console_args=console_args,
jobid_config=self.jobid_config)
self.wandb_utils.set_wandb_per_run()
self.path_utils = PathUtils(self.jobid_config, hpo_data_root_path=data_root_path)
if self.jobid_config.spt == "rspt":
assert resplit_portion, "If split mode is 'rspt', the resplit_portion must be provided. Please " \
"refer to the example in the documentation of AutoTransformers.prepare_data()"
if self.jobid_config.subdat:
data_raw = load_dataset(self.get_full_data_name(), self.jobid_config.subdat)
else:
data_raw = self._wrapper(load_dataset, *self.jobid_config.dat)
self._train_name, self._dev_name, self._test_name = self._get_split_name(data_raw, fold_name=fold_name)
auto_tokentoids_config = {"max_seq_length": self._max_seq_length}
self._tokenizer = AutoTokenizer.from_pretrained(self.jobid_config.pre_full, use_fast=True)
def autoencodetext_from_model_and_dataset_name():
return AutoEncodeText.from_model_and_dataset_name(
data_raw,
self.jobid_config.pre_full,
self.get_full_data_name(),
self.jobid_config.subdat,
**auto_tokentoids_config)
data_encoded = autoencodetext_from_model_and_dataset_name()
self._max_seq_length = 0
"""
Update the max_seq_length to the minimum of the actual max seq length and the user defined max_seq_length
"""
for each_fold in data_encoded.keys():
self._max_seq_length = max(self._max_seq_length,
max([sum(data_encoded[each_fold][x]['attention_mask']) for x in
range(len(data_encoded[each_fold]))]))
self._max_seq_length = int((self._max_seq_length + 15) / 16) * 16
data_encoded = autoencodetext_from_model_and_dataset_name()
if self.jobid_config.spt == "rspt":
all_folds_from_source = []
assert "source" in resplit_portion.keys(), "Must specify the source for resplitting the dataset in" \
"resplit_portion, which is a list of folder names, e.g., resplit_portion = {'source': ['train']}"
source_fold_names = resplit_portion['source']
for each_fold_name in source_fold_names:
this_fold_dataset = data_encoded[each_fold_name]
all_folds_from_source.append(this_fold_dataset)
merged_folds_from_source = datasets.concatenate_datasets(all_folds_from_source)
merged_folds_from_source = merged_folds_from_source.shuffle(seed=self.jobid_config.sddt)
assert "train" in resplit_portion.keys() and "validation" in resplit_portion.keys() \
and "test" in resplit_portion.keys(), "train, validation, test must exist in resplit_portion"
for key in ["train", "validation", "test"]:
target_fold_start, target_fold_end = \
int(resplit_portion[key][0] * len(merged_folds_from_source)), \
int(resplit_portion[key][1] * len(merged_folds_from_source))
subfold_dataset = merged_folds_from_source.select(
[x for x in range(target_fold_start, target_fold_end)]).flatten_indices()
if key == "train":
self.train_dataset = subfold_dataset
elif key == "validation":
self.eval_dataset = subfold_dataset
else:
self.test_dataset = subfold_dataset
else:
self.train_dataset, self.eval_dataset, self.test_dataset \
= data_encoded[self._train_name], data_encoded[self._dev_name], data_encoded[self._test_name]
def _load_model(self,
checkpoint_path=None,
per_model_config=None):
this_task = get_default_task(self.get_full_data_name(), self.jobid_config.subdat)
if this_task == "seq-classification":
self._num_labels = len(self.train_dataset.features["label"].names)
elif this_task == "regression":
self._num_labels = 1
if not checkpoint_path:
checkpoint_path = self.jobid_config.pre_full
def get_this_model():
return AutoModelForSequenceClassification.from_pretrained(checkpoint_path, config=model_config)
def is_pretrained_model_in_classification_head_list():
return self.jobid_config.pre in MODEL_CLASSIFICATION_HEAD_MAPPING.keys()
def _set_model_config():
if per_model_config and len(per_model_config) > 0:
model_config = AutoConfig.from_pretrained(
checkpoint_path,
num_labels=model_config_num_labels,
**per_model_config)
else:
model_config = AutoConfig.from_pretrained(
checkpoint_path,
num_labels=model_config_num_labels)
return model_config
if this_task == "seq-classification":
num_labels_old = AutoConfig.from_pretrained(checkpoint_path).num_labels
if is_pretrained_model_in_classification_head_list():
model_config_num_labels = num_labels_old
else:
model_config_num_labels = self._num_labels
model_config = _set_model_config()
if is_pretrained_model_in_classification_head_list():
if self._num_labels != num_labels_old:
this_model = get_this_model()
model_config.num_labels = self._num_labels
this_model.num_labels = self._num_labels
this_model.classifier = AutoSeqClassificationHead \
.from_model_type_and_config(self.jobid_config.pre,
model_config)
else:
this_model = get_this_model()
else:
this_model = get_this_model()
this_model.resize_token_embeddings(len(self._tokenizer))
return this_model
elif this_task == "regression":
model_config = self._set_model_config(checkpoint_path, per_model_config, 1)
this_model = get_this_model()
return this_model
def _get_metric_func(self):
if self.get_full_data_name() in ("glue", "super_glue"):
metric = datasets.load.load_metric(self.get_full_data_name(), self.jobid_config.subdat)
elif self.get_full_data_name() in ("squad", "squad_v2"):
metric = datasets.load.load_metric(self.get_full_data_name())
else:
metric = datasets.load.load_metric(self.metric_name)
return metric
def _compute_metrics_by_dataset_name(self,
eval_pred):
predictions, labels = eval_pred
predictions = np.squeeze(predictions) \
if self.task_name == "regression" else np.argmax(predictions, axis=1)
metric_func = self._get_metric_func()
return metric_func.compute(predictions=predictions, references=labels)
def _compute_checkpoint_freq(self,
num_train_epochs,
batch_size):
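# number of steps between two checkpoints: the steps for min(num_train_epochs, 1) epochs,
# divided by the number of devices and by ckpt_per_epoch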
if "gpu" in self._resources_per_trial:
ckpt_step_freq = int(min(num_train_epochs, 1) * len(self.train_dataset) / batch_size
/ self._resources_per_trial["gpu"] / self.ckpt_per_epoch) + 1
else:
ckpt_step_freq = int(min(num_train_epochs, 1) * len(self.train_dataset) / batch_size
/ self._resources_per_trial["cpu"] / self.ckpt_per_epoch) + 1
return ckpt_step_freq
@staticmethod
def _separate_config(config):
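# split a sampled config into the hyperparameters accepted by TrainingArguments and those belonging to the model config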
training_args_config = {}
per_model_config = {}
for key in config.keys():
if key in TrainingArguments.__dict__.keys():
training_args_config[key] = config[key]
else:
per_model_config[key] = config[key]
return training_args_config, per_model_config
def _objective(self, config, reporter, checkpoint_dir=None):
def model_init():
return self._load_model()
from transformers.trainer_utils import set_seed
set_seed(config["seed"])
training_args_config, per_model_config = AutoTransformers._separate_config(config)
this_model = self._load_model(per_model_config=per_model_config)
trial_id = reporter.trial_id
self.path_utils.make_dir_per_trial(trial_id)
ckpt_freq = self._compute_checkpoint_freq(
num_train_epochs=config["num_train_epochs"],
batch_size=config["per_device_train_batch_size"])
assert self.path_utils.ckpt_dir_per_trial
training_args = TrainingArguments(
output_dir=self.path_utils.ckpt_dir_per_trial,
do_eval=False,
per_device_eval_batch_size=32,
eval_steps=ckpt_freq,
evaluation_strategy=IntervalStrategy.STEPS,
save_steps=ckpt_freq,
save_total_limit=0,
fp16=self._fp16,
deepspeed=self.ds_config,
**training_args_config,
)
trainer = TrainerForAutoTransformers(
this_model,
training_args,
model_init=model_init,
train_dataset=self.train_dataset,
eval_dataset=self.eval_dataset,
tokenizer=self._tokenizer,
compute_metrics=self._compute_metrics_by_dataset_name,
)
trainer.logger = logger
trainer.trial_id = reporter.trial_id
"""
create a wandb run. If os.environ["WANDB_MODE"] == "offline", run = None
"""
run = self.wandb_utils.set_wandb_per_trial()
if os.environ["WANDB_MODE"] == "online":
for each_hp in config:
wandb.log({each_hp: config[each_hp]})
trainer.train()
trainer.evaluate(self.eval_dataset)
"""
If a wandb run was created, close the run after train and evaluate finish
"""
if run:
run.finish()
def _verify_init_config(self,
**custom_hpo_args):
for key in custom_hpo_args.keys():
if key == "points_to_evaluate":
for each_init_config in custom_hpo_args[key]:
for each_hp in each_init_config.keys():
assert each_hp in self._search_space_hpo.keys(), \
"points_to_evaluate hp must be within the search space"
assert isinstance(each_init_config[each_hp], int) or \
isinstance(each_init_config[each_hp], float) or \
isinstance(each_init_config[each_hp], str) or \
isinstance(each_init_config[each_hp], bool), " points_to_evaluate must be a scalar"
assert isinstance(self._search_space_hpo[each_hp], ray.tune.sample.Categorical) or \
isinstance(self._search_space_hpo[each_hp], ray.tune.sample.Float) or \
isinstance(self._search_space_hpo[each_hp], ray.tune.sample.Integer), \
"Every hp space must either be categorical, integer or float"
if isinstance(self._search_space_hpo[each_hp], ray.tune.sample.Categorical):
assert each_init_config[each_hp] in self._search_space_hpo[each_hp].categories, \
"points_to_evaluate {} value must be within the search space".format(each_hp)
else:
assert self._search_space_hpo[each_hp].lower <= each_init_config[each_hp] <= \
self._search_space_hpo[each_hp].upper, \
"points_to_evaluate {} value must be within the search space".format(each_hp)
def _get_search_algo(self,
search_algo_name,
search_algo_args_mode,
**custom_hpo_args):
if search_algo_name in ("bs", "cfo"):
self._verify_init_config(**custom_hpo_args)
search_algo = AutoSearchAlgorithm.from_method_name(
search_algo_name,
search_algo_args_mode,
self._search_space_hpo,
**custom_hpo_args)
return search_algo
@staticmethod
def _recover_checkpoint(tune_checkpoint_dir):
assert tune_checkpoint_dir
# Get subdirectory used for Huggingface.
subdirs = [
os.path.join(tune_checkpoint_dir, name)
for name in os.listdir(tune_checkpoint_dir)
if os.path.isdir(os.path.join(tune_checkpoint_dir, name))
]
# There should only be 1 subdir.
assert len(subdirs) == 1, subdirs
return subdirs[0]
def get_full_data_name(self):
return JobID.dataset_list_to_str(self.jobid_config.dat, "dat")
def _save_ckpt_json(self,
best_ckpt):
json.dump({"best_ckpt": best_ckpt},
open(os.path.join(self.path_utils.result_dir_per_run,
"save_ckpt_" + self.jobid_config.to_jobid_string() + ".json"), "w"))
def _save_output_metric(self,
output_metrics):
json.dump(output_metrics, open(
os.path.join(self.path_utils.result_dir_per_run,
"output_metric_" + self.jobid_config.to_jobid_string() + ".json"), "w"))
def _load_ckpt_json(self,
ckpt_dir=None,
**kwargs):
if not ckpt_dir:
ckpt_dir = os.path.join(self.path_utils.result_dir_per_run,
"save_ckpt_" + self.jobid_config.to_jobid_string() + ".json")
try:
ckpt_json = json.load(open(ckpt_dir))
return ckpt_json["best_ckpt"]
except FileNotFoundError as err:
logger.error("Saved checkpoint not found. Please make sure checkpoint is stored under {}".format(ckpt_dir))
raise err
def _set_metric(self, custom_metric_name=None, custom_metric_mode_name=None):
default_metric, default_mode, all_metrics, all_modes = get_default_and_alternative_metric(
self.get_full_data_name(),
subdataset_name=self.jobid_config.subdat,
custom_metric_name=custom_metric_name,
custom_metric_mode_name=custom_metric_mode_name)
_variable_override_default_alternative(logger,
self,
"metric_name",
default_metric,
all_metrics,
custom_metric_name)
_variable_override_default_alternative(logger,
self,
"metric_mode_name",
default_mode,
all_modes,
custom_metric_mode_name)
self._all_metrics = all_metrics
self._all_modes = all_modes
def _set_task(self):
self.task_name = get_default_task(self.get_full_data_name(), self.jobid_config.subdat)
def fit_hf(self,
resources_per_trial,
num_samples,
time_budget,
custom_metric_name=None,
custom_metric_mode_name=None,
_fp16=True,
**custom_hpo_args
):
'''Fine-tuning HuggingFace models using HF's API Transformers.hyperparameter_search (for comparative purposes).
Transformers.hyperparameter_search has the following disadvantages:
(1) it does not return a tune.analysis.Analysis result;
(2) it is inconvenient to develop on top of Transformers.hyperparameter_search, whose trainable function,
search space, etc. are defined inside of Transformers.hyperparameter_search.
An example:
autohf_settings = {"resources_per_trial": {"cpu": 1},
"num_samples": 1,
"time_budget": 100000,
"ckpt_per_epoch": 1,
"fp16": False,
}
validation_metric = autohf.fit_hf(**autohf_settings)
Args:
resources_per_trial:
A dict showing the resources used by each trial,
e.g., {"gpu": 4, "cpu": 4}
num_samples:
An int variable of the maximum number of trials
time_budget:
An int variable of the maximum time budget
custom_metric_name:
A string of the metric name or a function,
e.g., 'accuracy', 'f1', 'loss',
custom_metric_mode_name:
A string of the mode name,
e.g., "max", "min", "last", "all"
fp16:
boolean, default = True | whether to use fp16
custom_hpo_args:
The additional keyword arguments, e.g.,
custom_hpo_args = {"points_to_evaluate": [{
"num_train_epochs": 1,
"per_device_train_batch_size": 128, }]}
Returns:
validation_metric:
a dict storing the validation score
'''
def model_init():
return self._load_model()
def ray_hp_space(trial):
return {
"learning_rate": ray.tune.loguniform(1e-6, 1e-4),
"num_train_epochs": ray.tune.choice(list(range(1, 6))),
"seed": ray.tune.quniform(1, 41, 1),
"per_device_train_batch_size": ray.tune.choice([4, 8, 16, 32, 64]),
}
self._set_metric(custom_metric_name, custom_metric_mode_name)
self._set_task()
training_args = TrainingArguments(
output_dir=self.path_utils.hpo_ckpt_path,
fp16=_fp16,
)
this_model = self._load_model()
trainer = TrainerForAutoTransformers(
this_model,
training_args,
model_init=model_init,
train_dataset=self.train_dataset,
eval_dataset=self.eval_dataset,
tokenizer=self._tokenizer,
compute_metrics=self._compute_metrics_by_dataset_name,
)
self.path_utils.make_dir_per_run()
start_time = time.time()
best_run = trainer.hyperparameter_search(
n_trials=num_samples,
time_budget_s=time_budget,
hp_space=ray_hp_space,
backend=HPSearchBackend.RAY,
resources_per_trial=resources_per_trial)
duration = time.time() - start_time
self.last_run_duration = duration
hp_dict = best_run.hyperparameters
hp_dict["seed"] = int(hp_dict["seed"])
best_training_args = TrainingArguments(
output_dir=self.path_utils.hpo_ckpt_path,
fp16=_fp16,
**hp_dict,
)
best_trainer = TrainerForAutoTransformers(
this_model,
best_training_args,
model_init=model_init,
train_dataset=self.train_dataset,
eval_dataset=self.eval_dataset,
tokenizer=self._tokenizer,
compute_metrics=self._compute_metrics_by_dataset_name,
)
best_model_checkpoint_path = os.path.join(self.path_utils.hpo_ckpt_path, "hpo_hf")
if not os.path.exists(best_model_checkpoint_path):
os.mkdir(best_model_checkpoint_path)
best_trainer.train()
best_trainer.save_model(best_model_checkpoint_path)
self._save_ckpt_json(best_model_checkpoint_path)
validation_metric = best_trainer.evaluate()
return validation_metric
def fit(self,
num_samples,
time_budget,
custom_metric_name=None,
custom_metric_mode_name=None,
ckpt_per_epoch=1,
fp16=True,
verbose=1,
resources_per_trial={"gpu": 1, "cpu": 1},
**custom_hpo_args):
'''Fine-tuning HuggingFace models using the HPO setting
An example:
autohf_settings = {"resources_per_trial": {"cpu": 1},
"num_samples": 1,
"time_budget": 100000,
"ckpt_per_epoch": 1,
"fp16": False,
}
validation_metric, analysis = autohf.fit(**autohf_settings)
Args:
resources_per_trial:
A dict showing the resources used by each trial,
e.g., {"gpu": 4, "cpu": 4}
num_samples:
An int variable of the maximum number of trials
time_budget:
An int variable of the maximum time budget
custom_metric_name:
A string of the metric name or a function,
e.g., 'accuracy', 'f1', 'loss'
custom_metric_mode_name:
A string of the mode name,
e.g., "max", "min", "last", "all"
ckpt_per_epoch:
An integer value of the number of checkpoints per epoch, default = 1
verbose:
int, default=1 | Controls the verbosity, higher means more
messages
fp16:
boolean, default = True | whether to use fp16
custom_hpo_args:
The additional keyword arguments, e.g.,
custom_hpo_args = {"points_to_evaluate": [{
"num_train_epochs": 1,
"per_device_train_batch_size": 128, }]}
Returns:
validation_metric:
a dict storing the validation score
analysis:
a ray.tune.analysis.Analysis object storing the analysis results from tune.run
'''
self._resources_per_trial = resources_per_trial
self._set_metric(custom_metric_name, custom_metric_mode_name)
self._set_task()
self._fp16 = fp16
ray.init(local_mode=True)
self._set_search_space(**custom_hpo_args)
search_algo = self._get_search_algo(self.jobid_config.alg, self.jobid_config.arg, **custom_hpo_args)
scheduler = AutoScheduler.from_scheduler_name(self.jobid_config.pru)
self.ckpt_per_epoch = ckpt_per_epoch
self.path_utils.make_dir_per_run()
logger.addHandler(logging.FileHandler(os.path.join(self.path_utils.log_dir_per_run, 'tune.log')))
old_level = logger.getEffectiveLevel()
self._verbose = verbose
if verbose == 0:
logger.setLevel(logging.WARNING)
assert self.path_utils.ckpt_dir_per_run
start_time = time.time()
tune_config = self._search_space_hpo
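# add the fixed training seed from the job config so that it is passed to the objective of every trial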
tune_config["seed"] = self.jobid_config.sdhf
analysis = ray.tune.run(
self._objective,
metric=self.metric_name,
mode=self.metric_mode_name,
name="ray_result",
resources_per_trial=resources_per_trial,
config=tune_config,
verbose=verbose,
local_dir=self.path_utils.ckpt_dir_per_run,
num_samples=num_samples,
time_budget_s=time_budget,
keep_checkpoints_num=1,
scheduler=scheduler,
search_alg=search_algo,
)
duration = time.time() - start_time
self.last_run_duration = duration
logger.info("Total running time: {} seconds".format(duration))
ray.shutdown()
best_trial = analysis.get_best_trial(scope="all", metric=self.metric_name, mode=self.metric_mode_name)
validation_metric = {"eval_" + self.metric_name
: best_trial.metric_analysis[self.metric_name][self.metric_mode_name]}
for x in range(len(self._all_metrics)):
validation_metric["eval_" + self._all_metrics[x]] \
= best_trial.metric_analysis[self._all_metrics[x]][self._all_modes[x]]
get_best_ckpt = analysis.get_best_checkpoint(best_trial, metric=self.metric_name, mode=self.metric_mode_name)
best_ckpt = AutoTransformers._recover_checkpoint(get_best_ckpt)
self._save_ckpt_json(best_ckpt)
if verbose == 0:
logger.setLevel(old_level)
return validation_metric, analysis
def predict(self,
ckpt_json_dir=None,
**kwargs):
'''Predict label for test data.
An example:
predictions, test_metric = autohf.predict()
Args:
ckpt_json_dir:
the checkpoint for the fine-tuned HuggingFace model if you wish to override
the saved checkpoint in the training stage under self.path_utils._result_dir_per_run
Returns:
A numpy array of shape n * 1; each element is a predicted class
label for an instance.
'''
best_checkpoint = self._load_ckpt_json(ckpt_json_dir, **kwargs)
best_model = self._load_model(checkpoint_path=best_checkpoint)
training_args = TrainingArguments(per_device_eval_batch_size=1,
output_dir=self.path_utils.result_dir_per_run)
test_trainer = TrainerForAutoTransformers(best_model, training_args)
if self.jobid_config.spt == "ori":
try:
self.test_dataset.remove_columns_("label")
except ValueError:
pass
test_dataloader = test_trainer.get_test_dataloader(self.test_dataset)
predictions, labels, _ = test_trainer.prediction_loop(test_dataloader, description="Prediction")
predictions = np.squeeze(predictions) \
if get_default_task(self.get_full_data_name(), self.jobid_config.subdat) == "regression" \
else np.argmax(predictions, axis=1)
torch.cuda.empty_cache()
if self.jobid_config.spt == "rspt":
assert labels is not None
metric = self._get_metric_func()
output_metric = metric.compute(predictions=predictions, references=labels)
self._save_output_metric(output_metric)
return predictions, output_metric
else:
return predictions, None
def output_prediction(self,
predictions=None,
output_prediction_path=None,
output_zip_file_name=None):
"""
When using the original GLUE split, output the prediction on test data,
and prepare the .zip file for submission
Example:
local_archive_path = self.autohf.output_prediction(predictions,
output_prediction_path= self.console_args.data_root_dir + "result/",
output_zip_file_name=azure_save_file_name)
Args:
predictions:
a list of predictions, which is the output of AutoTransformers.predict()
output_prediction_path:
output path for the prediction
output_zip_file_name:
a string, which is the name of the output zip file
Returns:
the path of the output .zip file
"""
return auto_output_prediction(self.get_full_data_name(), output_prediction_path,
output_zip_file_name, predictions, self.train_dataset,
self._dev_name, self.jobid_config.subdat)


@@ -0,0 +1,225 @@
from collections import OrderedDict
from functools import partial
from transformers import AutoTokenizer
from .sentence_keys_auto import get_sentence_keys
def inserting_sepp(sent, start, end, this_tokenizer):
return \
sent[:start].rstrip() + " " + this_tokenizer.sep_token + " " + sent[start:end] \
+ " " + this_tokenizer.sep_token + " " + sent[end:].lstrip()
def tokenize_superglue_copa(this_example,
this_tokenizer,
dataset_name,
subdataset_name=None,
**kwargs):
return None
def tokenize_superglue_wic_gpt2(this_example,
this_tokenizer,
dataset_name,
subdataset_name=None,
**kwargs):
return None
def tokenize_superglue_wic(this_example,
this_tokenizer,
dataset_name,
subdataset_name=None,
**kwargs
):
"""
tokenize the data from the wic task (word-in-context dataset),
e.g., sentence 1: "There's a lot of trash on the bed of the river"
sentence 2: "I keep a glass of water next to my bed when I sleep",
label = False (different word senses)
In the superglue data, the positions of the word in sentences 1 and 2 are provided.
What this function does is to update the span positions after tokenization, based on each LM's own tokenizer.
The key is to insert a [SEP] token before and after the target word, then feed the sentence into the LM's tokenizer.
There are two challenges:
(1) Each LM's tokenization is different, e.g., in XLNet's tokenizer, the paddings are on the left;
(2) Some LMs' tokenizers add an underline symbol before the word, e.g., "There's a lot"
-> [_There, _', _s, _a, _lot]
When the underline meets a special char such as '"' or "'", the tokenized sequence after adding [SEP] needs to be
aligned with the sequence tokenized without [SEP]. We use a two-pointer algorithm for the alignment.
"""
sent1, sent2 = this_example["sentence1"], this_example["sentence2"]
start1, end1 = this_example["start1"], this_example["end1"]
start2, end2 = this_example["start2"], this_example["end2"]
"""
Add [SEP] to the sentence
"""
altered_sent1 = inserting_sepp(sent1, start1, end1, this_tokenizer)
altered_sent2 = inserting_sepp(sent2, start2, end2, this_tokenizer)
input_ids_sepp = this_tokenizer(*(altered_sent1, altered_sent2),
padding="max_length",
max_length=1024,
truncation=True)["input_ids"]
data_pair = (sent1, sent2)
assert "max_seq_length" in kwargs, "max_seq_length must be provided for glue"
this_data = this_tokenizer(*data_pair, padding="max_length", max_length=kwargs["max_seq_length"], truncation=True)
input_ids = this_data["input_ids"]
which_sepp = 0
"""
span_start_end: a 2x2 array:
* (span_start_end[0][0], span_start_end[0][1]) are the spans of the position of the word in the first sentence
* (span_start_end[1][0], span_start_end[1][1]) are the spans of the position of the word in the second sentence
"""
span_start_end = [[-1, -1], [-1, -1]]
ptr_sepp = ptr_nosepp = 0
try:
padding_direction = this_tokenizer.padding_side
if padding_direction == "left":
padding_id = input_ids_sepp[0]
while input_ids_sepp[ptr_sepp] == padding_id:
ptr_sepp += 1
while input_ids[ptr_nosepp] == padding_id:
ptr_nosepp += 1
except KeyError:
pass
sep_id = this_tokenizer.convert_tokens_to_ids([this_tokenizer.sep_token])[0]
"""
use two pointers to align the tokenized sequence before and after adding [SEP];
ptr_sepp: the pointer after adding; ptr_nosepp: the pointer without adding
"""
while ptr_sepp < len(input_ids_sepp) and ptr_nosepp < len(input_ids) and \
input_ids_sepp[ptr_sepp] != 0 and input_ids[ptr_nosepp] != 0:
if input_ids_sepp[ptr_sepp] == input_ids[ptr_nosepp]:
ptr_sepp += 1
ptr_nosepp += 1
else:
if not (input_ids_sepp[ptr_sepp] == sep_id
or this_tokenizer.convert_ids_to_tokens([input_ids_sepp[ptr_sepp]])[0] in ('', '_')):
break
if input_ids_sepp[ptr_sepp] == sep_id:
span_start_end[int(which_sepp / 2)][which_sepp % 2] = ptr_nosepp
which_sepp += 1
ptr_sepp += 1
else:
ptr_sepp += 1
"""
max_word_span is the maximum tokens of the word
It is set to 16 following deberta:
https://github.com/microsoft/DeBERTa/blob/master/DeBERTa/apps/tasks/superglue_tasks.py#L1054
"""
max_word_span = 16
word_indices = []
for idx1 in range(2):
if span_start_end[idx1][1] < kwargs["max_seq_length"]:
first_span = [x for x in range(span_start_end[idx1][0], span_start_end[idx1][1])
if x < kwargs["max_seq_length"]] + [0] * (max_word_span - span_start_end[idx1][1]
+ span_start_end[idx1][0])
word_indices.append(first_span)
this_data["word_spans"] = word_indices
return this_data
def tokenize_glue(this_example,
this_tokenizer,
dataset_name,
subdataset_name=None,
**kwargs):
sentence_keys = get_sentence_keys(dataset_name, subdataset_name)
if len(sentence_keys) > 1:
sentence1_key, sentence2_key = sentence_keys[0], sentence_keys[1]
else:
sentence1_key = sentence_keys[0]
sentence2_key = None
data_pair = (
(this_example[sentence1_key],) if sentence2_key is None else (
this_example[sentence1_key], this_example[sentence2_key])
)
assert "max_seq_length" in kwargs, "max_seq_length must be provided for glue"
return this_tokenizer(*data_pair, padding="max_length", max_length=kwargs["max_seq_length"], truncation=True)
TOKENIZER_MAPPING = OrderedDict(
[
(("glue", "rte"), tokenize_glue),
(("glue", "mrpc"), tokenize_glue),
(("glue", "cola"), tokenize_glue),
(("glue", "wnli"), tokenize_glue),
(("glue", "stsb"), tokenize_glue),
(("glue", "sst2"), tokenize_glue),
(("glue", "mnli"), tokenize_glue),
(("glue", "qqp"), tokenize_glue),
(("glue", "qnli"), tokenize_glue),
(("super_glue", "wic"), tokenize_superglue_wic),
]
)
class AutoEncodeText:
"""
This is a generic input text tokenization class that will be instantiated as one of the
tokenization classes of the library when created with the
`~flaml.nlp.dataset.AutoEncodeText.from_model_and_dataset_name` class method.
This class cannot be instantiated directly using ``__init__()`` (throws an error).
"""
def __init__(self):
raise EnvironmentError(
"AutoEncodeText is designed to be instantiated "
"using the `AutoEncodeText.from_model_and_dataset_name(cls,"
"data_raw,model_checkpoint_path,dataset_name,subdataset_name = None,**kwargs)` methods."
)
@classmethod
def from_model_and_dataset_name(cls,
data_raw,
model_checkpoint_path,
dataset_name,
subdataset_name=None,
**kwargs):
"""
Instantiate one of the input text tokenization classes from the raw data, model checkpoint path, dataset name
and sub dataset name. The raw data is used for creating a mapping function from the raw tokens to the
tokenized token ids.
Args:
data_raw:
The raw data (a datasets.Dataset object)
model_checkpoint_path:
A string variable which specifies the model path, e.g., "google/electra-base-discriminator"
dataset_name:
A string variable which is the dataset name, e.g., "glue"
subdataset_name:
A string variable which is the sub dataset name,e.g., "rte"
kwargs:
The values in kwargs of any keys will be used for the mapping function
Examples:
>>> from datasets import load_dataset
>>> data_raw = load_dataset("glue", "rte")
>>> AutoEncodeText.from_model_and_dataset_name(data_raw, "google/electra-base-discriminator", ["glue"], "rte")
"""
if (dataset_name, subdataset_name) in TOKENIZER_MAPPING.keys():
this_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint_path, use_fast=True)
token_func = TOKENIZER_MAPPING[(dataset_name, subdataset_name)]
return data_raw.map(
partial(token_func,
this_tokenizer=this_tokenizer,
dataset_name=dataset_name,
subdataset_name=subdataset_name,
**kwargs), batched=False)
raise ValueError(
"Unrecognized dataset {},{} for this kind of AutoEncodeText: {}.\n"
"Dataset name should be one of {}.".format(
dataset_name, subdataset_name, cls.__name__, ", ".join(str(c) for c in TOKENIZER_MAPPING.keys())
)
)


@@ -0,0 +1,70 @@
# https://github.com/huggingface/datasets/blob/master/metrics/glue/glue.py
from collections import OrderedDict
metric_mode_mapping_glue = {
"cola": [("matthews_correlation", "max")],
"mnli": [("accuracy", "max")],
"mrpc": [("accuracy", "max"), ("f1", "max")],
"qnli": [("accuracy", "max")],
"qqp": [("accuracy", "max"), ("f1", "max")],
"rte": [("accuracy", "max")],
"sst2": [("accuracy", "max")],
"stsb": [("pearson", "max"), ("spearmanr", "max")],
"wnli": [("accuracy", "max")]
}
metric_mode_mapping_squad = [("exact_match", "max"), ("f1", "max")]
metric_mode_mapping_super_glue = {
"axb": [("matthews_correlation", "max")],
"cb": [("accuracy", "max"), ("f1", "max")],
"copa": [("accuracy", "max")],
"rte": [("accuracy", "max")],
"wic": [("accuracy", "max")],
"wsc": [("accuracy", "max")],
"wsc.fixed": [("accuracy", "max")],
"boolq": [("accuracy", "max")],
"axg": [("accuracy", "max")]
}
metric_mode_mapping_imdb = [("accuracy", "max")]
metric_mode_mapping_yelp = [("accuracy", "max")]
METRIC_MAPPING = OrderedDict(
[
("squad", metric_mode_mapping_squad),
("glue", metric_mode_mapping_glue),
("super_glue", metric_mode_mapping_super_glue),
("imdb", metric_mode_mapping_imdb),
("yelp_review_full", metric_mode_mapping_yelp)
]
)
def get_default_and_alternative_metric(dataset_name,
subdataset_name=None,
custom_metric_name=None,
custom_metric_mode_name=None):
if dataset_name not in METRIC_MAPPING.keys():
assert custom_metric_name and custom_metric_mode_name, \
"The dataset is not in {}, you must explicitly specify " \
"the custom_metric_name and custom_metric_mode_name".format(",".join(METRIC_MAPPING.keys()))
eval_name_mapping = METRIC_MAPPING[dataset_name]
if isinstance(eval_name_mapping, dict):
assert subdataset_name and subdataset_name in eval_name_mapping, \
"dataset_name and subdataset_name not correctly specified"
default_metric, default_mode = eval_name_mapping[subdataset_name][0]
all_metrics, all_mode \
= [x[0] for x in eval_name_mapping[subdataset_name]] \
+ ["loss"], [x[1] for x in eval_name_mapping[subdataset_name]] + ["min"]
return default_metric, default_mode, all_metrics, all_mode
else:
assert isinstance(eval_name_mapping, list), "dataset_name and subdataset_name not correctly specified"
default_metric, default_mode = eval_name_mapping[0]
all_metrics, all_mode = [x[0] for x in eval_name_mapping] + ["loss"], \
[x[1] for x in eval_name_mapping] + ["min"]
return default_metric, default_mode, all_metrics, all_mode


@@ -0,0 +1,28 @@
sentence_keys_glue = {
"cola": ["sentence"],
"mnli": ["premise", "hypothesis"],
"mrpc": ["sentence1", "sentence2"],
"qnli": ["sentence", "question"],
"qqp": ["question1", "question2"],
"rte": ["sentence1", "sentence2"],
"sst2": ["sentence"],
"stsb": ["sentence1", "sentence2"],
"wnli": ["sentence1", "sentence2"]
}
sentence_keys_super_glue = {
"rte": ["hypothesis", "premise"],
"wic": ["sentence1", "sentence2"],
"wsc": ["text"]
}
def get_sentence_keys(dataset_name, subdataset_name=None):
eval_name_mapping = globals()["sentence_keys_" + dataset_name]
if isinstance(eval_name_mapping, dict):
assert subdataset_name and subdataset_name in eval_name_mapping, \
"dataset_name and subdataset_name not correctly specified"
sentence_keys = eval_name_mapping[subdataset_name]
else:
sentence_keys = eval_name_mapping
return sentence_keys


@@ -0,0 +1,126 @@
import os
import shutil
from collections import OrderedDict
file_name_mapping_glue = {
"ax": ["AX.tsv"],
"cola": ["CoLA.tsv"],
"mnli": ["MNLI-m.tsv", "MNLI-mm.tsv"],
"mrpc": ["MRPC.tsv"],
"qnli": ["QNLI.tsv"],
"qqp": ["QQP.tsv"],
"rte": ["RTE.tsv"],
"sst2": ["SST-2.tsv"],
"stsb": ["STS-B.tsv"],
"wnli": ["WNLI.tsv"]
}
default_prediction_glue = {
"ax": ["entailment"],
"cola": ["0"],
"mnli": ["neutral", "neutral"],
"mrpc": ["0"],
"qnli": ["not_entailment"],
"qqp": ["0"],
"rte": ["not_entailment"],
"sst2": ["0"],
"stsb": ["0.0"],
"wnli": ["0"]
}
test_size_glue = {
"ax": [1104],
"cola": [1064],
"mnli": [9796, 9847],
"mrpc": [1725],
"qnli": [5463],
"qqp": [390965],
"rte": [3000],
"sst2": [1821],
"stsb": [1379],
"wnli": [146]
}
def output_prediction_glue(output_path, output_dir_name, predictions, train_data, dev_name, subdataset_name):
output_dir = os.path.join(output_path, output_dir_name)
if os.path.exists(output_dir):
assert os.path.isdir(output_dir)
else:
os.mkdir(output_dir)
if subdataset_name != "stsb":
label_list = train_data.features["label"].names
output_blank_tsv(output_dir)
for each_subdataset_name in file_name_mapping_glue.keys():
for idx in range(len(file_name_mapping_glue[each_subdataset_name])):
each_file = file_name_mapping_glue[each_subdataset_name][idx]
if subdataset_name != "mnli":
is_match = subdataset_name == each_subdataset_name
else:
if dev_name == "validation_matched":
is_match = each_file == "MNLI-m.tsv"
else:
is_match = each_file == "MNLI-mm.tsv"
if is_match:
with open(os.path.join(output_dir, each_file), "w") as writer:
writer.write("index\tprediction\n")
for index, item in enumerate(predictions):
if subdataset_name == "stsb":
if item > 5.0:
item = 5.0
writer.write(f"{index}\t{item:3.3f}\n")
else:
if subdataset_name in ("rte", "qnli", "mnli"):
item = label_list[item]
writer.write(f"{index}\t{item}\n")
else:
if int(item) == item:
item = int(item)
writer.write(f"{index}\t{item}\n")
else:
writer.write(f"{index}\t{item:3.3f}\n")
shutil.make_archive(os.path.join(output_path, output_dir_name), 'zip', output_dir)
return os.path.join(output_path, output_dir_name + ".zip")
OUTPUT_PREDICTION_MAPPING = OrderedDict(
[
("glue", output_prediction_glue),
]
)
def auto_output_prediction(dataset_name,
output_path,
output_dir_name,
predictions,
train_data,
dev_name,
subset_name):
if dataset_name in OUTPUT_PREDICTION_MAPPING.keys():
return OUTPUT_PREDICTION_MAPPING[dataset_name](output_path,
output_dir_name,
predictions,
train_data,
dev_name,
subset_name)
else:
raise ValueError(
"Unrecognized dataset {}. \n"
"Should be one of {}.".format(dataset_name, ", ".join(OUTPUT_PREDICTION_MAPPING.keys())
)
)
def output_blank_tsv(output_dir):
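# write a default prediction for every GLUE task so that the submission zip always contains all the required .tsv files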
for each_subdataset_name in file_name_mapping_glue.keys():
for idx in range(len(file_name_mapping_glue[each_subdataset_name])):
each_file = file_name_mapping_glue[each_subdataset_name][idx]
default_prediction = default_prediction_glue[each_subdataset_name][idx]
test_size = test_size_glue[each_subdataset_name][idx]
with open(os.path.join(output_dir, each_file), "w") as writer:
writer.write("index\tprediction\n")
for index in range(test_size):
writer.write(f"{index}\t{default_prediction}\n")


@@ -0,0 +1,45 @@
# https://github.com/huggingface/datasets/blob/master/metrics/glue/glue.py
from collections import OrderedDict
task_mapping_glue = {
"cola": "seq-classification",
"mnli": "seq-classification",
"mrpc": "seq-classification",
"qnli": "seq-classification",
"qqp": "seq-classification",
"rte": "seq-classification",
"sst2": "seq-classification",
"stsb": "regression",
"wnli": "seq-classification"
}
task_mapping_squad = "question-answering"
task_mapping_super_glue = {
"wic": "seq-classification",
"rte": "seq-classification"
}
TASK_MAPPING = OrderedDict(
[
("squad", task_mapping_squad),
("glue", task_mapping_glue),
("super_glue", task_mapping_super_glue),
]
)
def get_default_task(dataset_name, subdataset_name=None):
assert dataset_name in TASK_MAPPING.keys(), "The dataset is not in {}, so its default task " \
"cannot be inferred".format(
",".join(TASK_MAPPING.keys()))
eval_name_mapping = TASK_MAPPING[dataset_name]
if isinstance(eval_name_mapping, dict):
assert subdataset_name and subdataset_name in eval_name_mapping, \
"dataset_name and subdataset_name not correctly specified"
default_task = eval_name_mapping[subdataset_name]
else:
assert isinstance(eval_name_mapping, str), "dataset_name and subdataset_name not correctly specified"
default_task = eval_name_mapping
return default_task


@@ -0,0 +1,456 @@
# lookup table for the grid configs of each pre-trained language model for different tasks
import copy
def get_space_union_and_unique(search_space_common, search_space_unique, this_case_tags: list):
"""
get the recommended search configs for each pre-trained language model
Args:
search_space_common:
the union of configs recommended by the LM for all cases;
search_space_unique:
the recommended config by the LM for a specific condition, e.g., small model
this_case_tags:
a list, which contains the tags describing the specific condition, e.g., ["small"]
"""
search_space_union = search_space_common.copy()
this_search_space = search_space_common.copy()
# enumerate over each case where the search space is different
# this difference can be the dataset or model size, etc.
is_included = False
from ..utils import merge_dicts
for each_case in search_space_unique.keys():
from ..utils import _check_dict_keys_overlaps
if each_case in this_case_tags:
is_included = True
assert not _check_dict_keys_overlaps(this_search_space, search_space_unique[each_case]), \
"the hyperparameters of common and unique search spaces should not have overlaps"
this_search_space.update(search_space_unique[each_case])
search_space_union = merge_dicts(search_space_union, search_space_unique[each_case])
if is_included:
return this_search_space
else:
if "other" in search_space_unique.keys():
search_space_union = merge_dicts(search_space_union, search_space_unique["other"])
return search_space_union
def get_deberta_space(model_size_type=None,
dataset_name=None,
subdataset_name=None,
algo_mode=None):
"""
DEBERTA: DECODING-ENHANCED BERT WITH DISENTANGLED ATTENTION: Table 9
https://arxiv.org/abs/2006.03654
"""
search_space_common = {
"cls_dropout": [0, 0.1, 0.15],
"warmup_steps": [50, 100, 500, 1000],
"per_device_train_batch_size": [16, 32, 48, 64],
"num_train_epochs": [10],
"adam_epsilon": [1e-6],
}
search_space_unique = {
"large": {
"learning_rate": [5e-6, 8e-6, 9e-6, 1e-5],
"weight_decay": [0.01],
},
"base": {
"learning_rate": [1.5e-5, 2e-5, 3e-5, 4e-5],
}
}
return get_space_union_and_unique(search_space_common, search_space_unique, [model_size_type])
def get_longformer_space(model_size_type=None,
dataset_name=None,
subdataset_name=None,
algo_mode=None):
"""
TODO: Longformer: The Long-Document Transformer
"""
if dataset_name == "glue":
return
def get_funnel_space(model_size_type=None,
dataset_name=None,
subdataset_name=None,
algo_mode=None):
"""
Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing
https://arxiv.org/abs/2006.03236
"""
search_space_common = {"learning_rate": [1e-5, 2e-5, 3e-5],
"hidden_dropout": [0.1],
"activation_dropout": [0.0],
"attention_dropout": [0.1],
"weight_decay": [0.01],
"warmup_ratio": [0.1],
"adam_epsilon": [1e-6],
}
search_space_unique = {
"imdb": {
"per_device_train_batch_size": [32],
"num_train_epochs": [5]
},
"ag_news": {
"per_device_train_batch_size": [32],
"num_train_epochs": [3]
},
"dbpedia_14": {
"per_device_train_batch_size": [64],
"num_train_epochs": [3]
},
"yelp_polarity": {
"per_device_train_batch_size": [128],
"num_train_epochs": [3]
},
"yelp_review_full": {
"per_device_train_batch_size": [128],
"num_train_epochs": [3]
},
"amazon_polarity": {
"per_device_train_batch_size": [128],
"num_train_epochs": [3]
},
"amazon_review_multi": {
"per_device_train_batch_size": [128],
"num_train_epochs": [3]
},
"glue_rte": {
"per_device_train_batch_size": [16],
"num_train_epochs": [10]
},
"glue_mrpc": {
"per_device_train_batch_size": [16],
"num_train_epochs": [10]
},
"glue_stsb": {
"per_device_train_batch_size": [16],
"num_train_epochs": [10]
},
"glue_cola": {
"per_device_train_batch_size": [16],
"num_train_epochs": [10]
},
"glue_sst2": {
"per_device_train_batch_size": [32],
"num_train_epochs": [5]
},
"glue_qnli": {
"per_device_train_batch_size": [32],
"num_train_epochs": [3]
},
"glue_mnli": {
"per_device_train_batch_size": [64],
"num_train_epochs": [3]
},
"glue_qqp": {
"per_device_train_batch_size": [64],
"num_train_epochs": [5]
}
}
from ..result_analysis.azure_utils import JobID
return get_space_union_and_unique(search_space_common, search_space_unique,
[JobID.get_full_data_name(dataset_name, subdataset_name)])
def get_bert_space(model_size_type=None,
dataset_name=None,
subdataset_name=None,
algo_mode=None):
"""
BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding
https://arxiv.org/pdf/1810.04805.pdf
"""
search_space_common = {}
search_space_unique = {
# Section 4.1: We use a batch size of 32 and fine-tune for 3 epochs over the data for all GLUE tasks. For each
# task, we selected the best fine-tuning learning rate (among 5e-5, 4e-5, 3e-5, and 2e-5) on the Dev set
"glue": {
"learning_rate": [5e-5, 4e-5, 3e-5, 2e-5],
"per_device_train_batch_size": [32],
"num_train_epochs": [3],
},
# Section 4.2: We fine-tune for 3 epochs with a learning rate of 5e-5 and a batch size of 32
"squad": {
"learning_rate": [5e-5],
"per_device_train_batch_size": [32],
"num_train_epochs": [2],
},
# Section 4.3: We fine-tuned for 2 epochs with a learning rate of 5e-5 and a batch size of 48.
"squad_v2": {
"learning_rate": [5e-5],
"per_device_train_batch_size": [48],
"num_train_epochs": [2],
},
# Section 4.4: We fine-tune the model for 3 epochs with a learning rate of 2e-5 and a batch size of 16.
"swag": {
"learning_rate": [2e-5],
"per_device_train_batch_size": [16],
"num_train_epochs": [3],
},
# Appendix A. The optimal hyperparameter values are task-specific, but we found the following
# range of possible values to work well across all tasks:
# - Batch size: 16, 32
# - Learning rate (Adam): 5e-5, 3e-5, 2e-5
# - Number of epochs: 2, 3, 4
"other": {
"learning_rate": [5e-5, 3e-5, 2e-5],
"per_device_train_batch_size": [16, 32],
"num_train_epochs": [2, 3, 4],
}
}
return get_space_union_and_unique(search_space_common, search_space_unique, [dataset_name])
def get_roberta_space(model_size_type=None,
dataset_name=None,
subdataset_name=None,
algo_mode=None):
# RoBERTa: A Robustly Optimized BERT Pretraining Approach
# https://arxiv.org/pdf/1907.11692.pdf
search_space_common = {
"warmup_ratio": [0.06],
}
search_space_unique = {
# Table 10: Hyperparameters for finetuning RoBERTa-LARGE on RACE, SQuAD and GLUE.
# We consider a limited hyperparameter
# sweep for each task, with batch sizes ∈ {16, 32}
# and learning rates ∈ {1e5, 2e5, 3e5}, with a
# linear warmup for the first 6% of steps followed by
# a linear decay to 0.
"glue": {
"learning_rate": [1e-5, 2e-5, 3e-5],
"per_device_train_batch_size": [16, 32],
"weight_decay": [0.1],
"num_train_epochs": [10],
},
"race": {
"learning_rate": [1e-5],
"per_device_train_batch_size": [16],
"weight_decay": [0.1],
"num_train_epochs": [4],
},
"squad": {
"learning_rate": [1.5e-5],
"per_device_train_batch_size": [48],
"weight_decay": [0.01],
"num_train_epochs": [2],
}
}
return get_space_union_and_unique(search_space_common, search_space_unique, [dataset_name])
def get_electra_space(model_size_type=None,
dataset_name=None,
subdataset_name=None,
algo_mode=None):
"""
ELECTRA: PRE-TRAINING TEXT ENCODERS AS DISCRIMINATORS RATHER THAN GENERATORS
https://arxiv.org/pdf/2003.10555.pdf
"""
assert model_size_type in ("small", "base", "large", "intermediate", "xlarge"), \
"Electra paper has only provided hyperparameter for the small and base huggingface"
search_space_common = {
"learning_rate": [3e-5, 5e-5, 1e-4, 1.5e-4] if algo_mode == "grid"
else [3e-5, 5e-5, 1e-4, 1.5e-4, 2e-4, 3e-4, 5e-3],
"weight_decay": [0.0],
"adam_epsilon": [1e-6],
"warmup_ratio": [0.1],
"per_device_train_batch_size": [32],
"hidden_dropout_prob": [0.1],
"attention_probs_dropout_prob": [0.1],
}
search_space_unique = {
# Appendix B: For Base-sized models we searched for a learning rate
"squad": {
"num_train_epochs": [2]
},
"squad_v2": {
"num_train_epochs": [2]
},
"glue_stsb": {
"num_train_epochs": [10],
},
"glue_rte": {
"num_train_epochs": [10],
},
"glue_wnli": {
"num_train_epochs": [3],
},
"glue_mrpc": {
"num_train_epochs": [3],
},
"glue_cola": {
"num_train_epochs": [3],
},
"glue_sst2": {
"num_train_epochs": [3],
},
"glue_qnli": {
"num_train_epochs": [3],
},
"glue_mnli": {
"num_train_epochs": [3],
},
"glue_qqp": {
"num_train_epochs": [3],
}
}
from ..result_analysis.azure_utils import JobID
return get_space_union_and_unique(search_space_common, search_space_unique,
[JobID.get_full_data_name(dataset_name, subdataset_name), model_size_type])
def get_mobilebert_space(model_size_type=None,
dataset_name=None,
subdataset_name=None,
algo_mode=None):
"""
MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices
https://arxiv.org/pdf/2004.02984.pdf
"""
# To finetune the pre-trained models, we search the optimization hyperparameters
# in a search space including different batch sizes (16/32/48), learning
# rates ((1-10) * e-5), and the number of epochs (2-10)
search_space_common = {
"learning_rate": [x * 1e-5 for x in range(1, 11)],
"per_device_train_batch_size": [4, 8, 16, 32, 48],
"num_train_epochs": [x for x in range(2, 11)],
}
search_space_unique = {}
return get_space_union_and_unique(search_space_common, search_space_unique, [])
def get_albert_space(model_size_type=None,
dataset_name=None,
subdataset_name=None,
algo_mode=None):
"""
Hyperparameters for downstream tasks are shown in Table 14. We adapt these hyperparameters
from Liu et al. (2019), Devlin et al. (2019), and Yang et al. (2019).
LR BSZ ALBERT DR Classifier DR TS WS MSL
CoLA 1.00E-05 16 0 0.1 5336 320 512
STS 2.00E-05 16 0 0.1 3598 214 512
SST-2 1.00E-05 32 0 0.1 20935 1256 512
MNLI 3.00E-05 128 0 0.1 10000 1000 512
QNLI 1.00E-05 32 0 0.1 33112 1986 512
QQP 5.00E-05 128 0.1 0.1 14000 1000 512
RTE 3.00E-05 32 0.1 0.1 800 200 512
MRPC 2.00E-05 32 0 0.1 800 200 512
WNLI 2.00E-05 16 0.1 0.1 2000 250 512
SQuAD v1.1 5.00E-05 48 0 0.1 3649 365 384
SQuAD v2.0 3.00E-05 48 0 0.1 8144 814 512
RACE 2.00E-05 32 0.1 0.1 12000 1000 512
"""
search_space_common = {
}
search_space_unique = {
"glue_cola": {
"learning_rate": [1e-5],
"per_device_train_batch_size": [16],
"attention_probs_dropout_prob": [0],
"classifier_dropout_prob": [0.1],
"max_steps": [5336],
"warmup_steps": [320],
},
"glue_stsb": {
"learning_rate": [2e-5],
"per_device_train_batch_size": [16],
"attention_probs_dropout_prob": [0],
"classifier_dropout_prob": [0.1],
"max_steps": [3598],
"warmup_steps": [214],
},
"glue_sst2": {
"learning_rate": [1e-5],
"per_device_train_batch_size": [32],
"attention_probs_dropout_prob": [0],
"classifier_dropout_prob": [0.1],
"max_steps": [20935],
"warmup_steps": [1256],
},
"glue_mnli": {
"learning_rate": [3e-5],
"per_device_train_batch_size": [128],
"attention_probs_dropout_prob": [0],
"classifier_dropout_prob": [0.1],
"max_steps": [10000],
"warmup_steps": [1000],
},
"glue_qnli": {
"learning_rate": [1e-5],
"per_device_train_batch_size": [32],
"attention_probs_dropout_prob": [0],
"classifier_dropout_prob": [0.1],
"max_steps": [33112],
"warmup_steps": [1986],
},
"glue_qqp": {
"learning_rate": [5e-5],
"per_device_train_batch_size": [128],
"attention_probs_dropout_prob": [0.1],
"classifier_dropout_prob": [0.1],
"max_steps": [14000],
"warmup_steps": [1000],
},
"glue_rte": {
"learning_rate": [3e-5],
"per_device_train_batch_size": [32],
"attention_probs_dropout_prob": [0.1],
"classifier_dropout_prob": [0.1],
"max_steps": [800],
"warmup_steps": [200],
},
"glue_mrpc": {
"learning_rate": [2e-5],
"per_device_train_batch_size": [32],
"attention_probs_dropout_prob": [0],
"classifier_dropout_prob": [0.1],
"max_steps": [800],
"warmup_steps": [200],
},
"glue_wnli": {
"learning_rate": [2e-5],
"per_device_train_batch_size": [16],
"attention_probs_dropout_prob": [0.1],
"classifier_dropout_prob": [0.1],
"max_steps": [2000],
"warmup_steps": [250],
},
"squad": {
"learning_rate": [5e-5],
"per_device_train_batch_size": [48],
"attention_probs_dropout_prob": [0],
"classifier_dropout_prob": [0.1],
"max_steps": [3649],
"warmup_steps": [365],
},
"squad_v2": {
"learning_rate": [3e-5],
"per_device_train_batch_size": [48],
"attention_probs_dropout_prob": [0],
"classifier_dropout_prob": [0.1],
"max_steps": [8144],
"warmup_steps": [814],
},
"race": {
"learning_rate": [2e-5],
"per_device_train_batch_size": [32],
"attention_probs_dropout_prob": [0.1],
"classifier_dropout_prob": [0.1],
"max_steps": [12000],
"warmup_steps": [1000],
},
}
# To finetune the pre-trained models, we search the optimization hyperparameters
# in a search space including different batch sizes (16/32/48), learning
# rates ((1-10) * 1e-5), and the number of epochs (2-10)
from ..result_analysis.azure_utils import JobID
return get_space_union_and_unique(search_space_common, search_space_unique,
[JobID.get_full_data_name(dataset_name, subdataset_name)])

View File

@@ -0,0 +1,93 @@
from collections import OrderedDict
from .get_grid_search_space import \
(get_electra_space,
get_bert_space,
get_roberta_space,
get_funnel_space,
get_deberta_space,
get_albert_space
)
GRID_SEARCH_SPACE_MAPPING = OrderedDict(
[
("electra", get_electra_space),
("bert", get_bert_space),
("roberta", get_roberta_space),
("funnel", get_funnel_space),
("deberta", get_deberta_space),
("albert", get_albert_space),
]
)
HF_MODEL_LIST = [
"bert",
"roberta",
"electra",
"xlnet",
"albert",
"distilbert",
"deberta",
"mobilebert",
"funnel"
]
class AutoGridSearchSpace:
"""
This is a class for getting the recommended grid search space of a pre-trained LM that will be
instantiated as one of the search spaces of the library when created with the
`~flaml.nlp.hpo.AutoGridSearchSpace.from_model_and_dataset_name` method.
This class cannot be instantiated directly using ``__init__()`` (throws an error).
"""
def __init__(self):
raise EnvironmentError(
"AutoGridSearchSpace is designed to be instantiated "
"using the `AutoGridSearchSpace.from_config_and_method_name(cls, model_type, model_size_type,"
"dataset_name,subdataset_name = None,algo_mode = None)` methods."
)
@classmethod
def from_model_and_dataset_name(cls,
model_type,
model_size_type,
dataset_name,
subdataset_name=None,
algo_mode=None):
"""
Instantiate one of the classes for getting the recommended grid search space of a pre-trained LM from
the model type, model size type, dataset name, sub dataset name and algorithm mode
Args:
model_type:
A string variable which is the model type, e.g. "electra"
model_size_type:
A string variable which is the size of the model, e.g., "small"
dataset_name:
A string variable which is the dataset name, e.g., "glue"
subdataset_name:
A string variable which is the sub dataset name, e.g., "rte"
algo_mode:
A string variable which is the algorithm mode for grid search, e.g., "gridbert"
Example:
>>> AutoGridSearchSpace.from_model_and_dataset_name("electra", "small", "glue", "rte", "grid")
"""
if model_type in GRID_SEARCH_SPACE_MAPPING.keys():
this_model_recommended_space = GRID_SEARCH_SPACE_MAPPING[model_type](
model_size_type, dataset_name, subdataset_name, algo_mode)
return this_model_recommended_space
raise ValueError(
"Unrecognized model type {} and dataset {} for this kind of AutoGridSearchSpace: {}.\n"
"Model type should be one of {}.".format(
model_type, dataset_name, cls.__name__, ", ".join(GRID_SEARCH_SPACE_MAPPING.keys())
)
)
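# A minimal usage sketch of the class above. The model/size/dataset names are
# example values (the same ones used in the docstring); the returned object is
# a dict mapping each hyperparameter name to its list of grid values, as the
# per-model functions in get_grid_search_space.py produce.
if __name__ == "__main__":
    electra_rte_grid = AutoGridSearchSpace.from_model_and_dataset_name(
        "electra", "small", "glue", "rte", "grid")
    for hp_name, hp_values in electra_rte_grid.items():
        print(hp_name, hp_values)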

View File

@@ -0,0 +1,242 @@
from collections import OrderedDict
from ..huggingface.trainer import TrainerForAutoTransformers
from ray import tune
from transformers import TrainingArguments
from .grid_searchspace_auto import AutoGridSearchSpace
def hpo_space_custom(**custom_hpo_args):
assert "hpo_space" in custom_hpo_args
custom_search_space = custom_hpo_args["hpo_space"]
return custom_search_space
def bounded_gridunion(logger=None,
model_type=None,
model_size_type=None,
dataset_name=None,
subdataset_name=None,
**custom_hpo_args):
assert "bound" in custom_hpo_args
gridunion_space = HPO_SEARCH_SPACE_MAPPING["uni"](logger,
model_type,
model_size_type,
dataset_name,
subdataset_name,
**custom_hpo_args)
for each_key in custom_hpo_args["bound"].keys():
if "u" in custom_hpo_args["bound"][each_key]:
upper = custom_hpo_args["bound"][each_key]["u"]
else:
upper = 100000
if "l" in custom_hpo_args["bound"][each_key]:
lower = custom_hpo_args["bound"][each_key]["l"]
else:
lower = -100000
original_space = sorted(gridunion_space[each_key])
upper_id = len(original_space)
for x in range(len(original_space)):
if original_space[x] > upper:
upper_id = x
break
lower_id = 0
for x in range(len(original_space) - 1, -1, -1):
if original_space[x] < lower:
lower_id = x
break
gridunion_space[each_key] = original_space[lower_id:upper_id]
return gridunion_space
def hpo_space_gridunion(logger=None,
model_type=None,
model_size_type=None,
dataset_name=None,
subdataset_name=None,
**custom_hpo_args):
output_config = {}
for each_model_type in {"electra", "roberta", "bert"}:
# if each_model_type == model_type: continue
this_config = AutoGridSearchSpace.from_model_and_dataset_name(
each_model_type, model_size_type, dataset_name, subdataset_name, "hpo")
from ..utils import merge_dicts
output_config = merge_dicts(output_config, this_config)
default_values = {}
"""
adding the default configuration from transformers/training_args.py into hpo space
"""
training_args = TrainingArguments(output_dir=".")
for each_hp in output_config.keys():
try:
default_values[each_hp] = [getattr(training_args, each_hp)]
except AttributeError:
pass
output_config = merge_dicts(output_config, default_values)
return output_config
def hpo_space_gridunion_smoke_test(
logger=None,
model_type=None,
model_size_type=None,
dataset_name=None,
subdataset_name=None,
**custom_hpo_args):
return {'learning_rate': [1e-5],
'weight_decay': [0.0],
'adam_epsilon': [1e-08],
'warmup_ratio': [0.1],
'per_device_train_batch_size': [2],
'hidden_dropout_prob': [0.1],
'attention_probs_dropout_prob': [0.1],
'num_train_epochs': [0.1]}
def hpo_space_generic(logger=None,
model_type=None,
model_size_type=None,
dataset_name=None,
subdataset_name=None,
**custom_hpo_args):
output_config = {
"learning_rate": {"l": 1e-6, "u": 1e-3, "space": "log"},
"num_train_epochs": {"l": 1.0, "u": 10.0, "space": "log"},
"per_device_train_batch_size": [4, 8, 16, 32, 48],
"warmup_ratio": {"l": 0.0, "u": 0.3, "space": "linear"},
"weight_decay": {"l": 0.0, "u": 0.3, "space": "linear"}
}
return output_config
def hpo_space_generic_grid(logger=None,
model_type=None,
model_size_type=None,
dataset_name=None,
subdataset_name=None,
**custom_hpo_args):
output_config = {
"learning_rate": [1e-5, 2e-5, 3e-5, 4e-5, 5e-5, 1e-4, 1.5e-4],
"num_train_epochs": [3, 10],
"per_device_train_batch_size": [16, 32],
"warmup_ratio": [0, 0.06, 0.1],
"weight_decay": [0, 0.1]
}
return output_config
def hpo_space_small(logger=None,
model_type=None,
model_size_type=None,
dataset_name=None,
subdataset_name=None,
**custom_hpo_args):
config_json = AutoGridSearchSpace.from_model_and_dataset_name(
model_type, model_size_type, dataset_name, subdataset_name, "hpo")
output_config = {}
for each_hp in config_json.keys():
if each_hp == "learning_rate":
if len(config_json[each_hp]) > 1:
output_config[each_hp] = {"l": 3e-5, "u": 1.5e-4, "space": "log"}
else:
output_config[each_hp] = config_json[each_hp]
elif each_hp == "num_train_epochs":
output_config[each_hp] = {"l": 2.0, "u": 4.0, "space": "linear"}
elif each_hp == "per_device_train_batch_size":
output_config[each_hp] = [16, 32, 64]
elif each_hp == "warmup_ratio":
output_config[each_hp] = {"l": 0.0, "u": 0.2, "space": "linear"}
elif each_hp == "weight_decay":
output_config[each_hp] = {"l": 0.0, "u": 0.3, "space": "linear"}
else:
output_config[each_hp] = config_json[each_hp]
return output_config
HPO_SEARCH_SPACE_MAPPING = OrderedDict(
[
("uni", hpo_space_gridunion),
("gnr", hpo_space_generic),
("uni_test", hpo_space_gridunion_smoke_test),
("cus", hpo_space_custom),
("buni", bounded_gridunion)
]
)
class AutoHPOSearchSpace:
"""
This is a class for getting the hpo search space based on the search space mode
(a string variable) instantiated as one of the HPO search spaces of the library when
created with the `~flaml.nlp.hpo.AutoHPOSearchSpace.from_model_and_dataset_name` method.
This class cannot be instantiated directly using ``__init__()`` (throws an error).
"""
def __init__(self):
raise EnvironmentError(
"AutoHPOSearchSpace is designed to be instantiated "
"using the `AutoHPOSearchSpace.from_config_and_method_name(cls, logger,hpo_searchspace_name,"
"model_type,model_size_type,dataset_name,subdataset_name = None,**custom_hpo_args)` methods."
)
@classmethod
def from_model_and_dataset_name(cls,
logger,
hpo_searchspace_mode,
model_type,
model_size_type,
dataset_name,
subdataset_name=None,
**custom_hpo_args):
"""
Instantiate one of the classes for getting the hpo search space from the search space name, model type,
model size type, dataset name and sub dataset name
Args:
logger:
Reference to the logger
hpo_searchspace_mode:
A string variable which is name of the hpo search space, e.g., "uni"
model_type:
A string variable which is the type of the model, e.g., "electra"
model_size_type:
A string variable which is the type of the model size, e.g., "small"
dataset_name:
A string variable which is the dataset name, e.g., "glue"
subdataset_name:
A string variable which is the sub dataset name, e.g., "rte"
custom_hpo_args:
Any additional keyword argument to be used for the function for the HPO search space
Example:
>>> AutoHPOSearchSpace.from_model_and_dataset_name(logger, "uni", "electra", "small", "glue", "rte")
"""
if hpo_searchspace_mode in HPO_SEARCH_SPACE_MAPPING.keys():
hpo_space = HPO_SEARCH_SPACE_MAPPING[hpo_searchspace_mode](
logger,
model_type,
model_size_type,
dataset_name,
subdataset_name,
**custom_hpo_args)
return hpo_space
raise ValueError(
"Unrecognized search space mode {} and dataset {} for this kind of AutoHPOSearchSpace: {}.\n"
"Search space mode should be one of {}.".format(
hpo_searchspace_mode, dataset_name, cls.__name__,
", ".join(HPO_SEARCH_SPACE_MAPPING.keys())
)
)
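# A minimal usage sketch of the bounded grid-union mode ("buni"). The "bound"
# keyword follows the format consumed by bounded_gridunion above: an optional
# lower ("l") and/or upper ("u") cap per hyperparameter, applied to the sorted
# grid-union values. The logger and the model/dataset names are placeholders.
if __name__ == "__main__":
    import logging
    example_logger = logging.getLogger(__name__)
    bounded_space = AutoHPOSearchSpace.from_model_and_dataset_name(
        example_logger, "buni", "electra", "base", "glue", "rte",
        bound={"learning_rate": {"l": 1e-5, "u": 1e-4}})
    # prints the grid-union learning rates after the bound has been applied
    print(bounded_space["learning_rate"])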

View File

@@ -0,0 +1,51 @@
from collections import OrderedDict
from ray.tune.schedulers import ASHAScheduler, HyperBandScheduler, MedianStoppingRule
SCHEDULER_MAPPING = OrderedDict(
[
("None", None),
("asha", ASHAScheduler),
("hb", HyperBandScheduler),
]
)
class AutoScheduler:
"""
This is a class for getting the scheduler based on the scheduler name
(a string variable) instantiated as one of the schedulers of the library when
created with the `~flaml.nlp.hpo.AutoScheduler.from_scheduler_name` method.
This class cannot be instantiated directly using ``__init__()`` (throws an error).
"""
def __init__(self):
raise EnvironmentError(
"AutoScheduler is designed to be instantiated "
"using the `AutoScheduler.from_scheduler_name(cls, scheduler_name, **kwargs)` methods."
)
@classmethod
def from_scheduler_name(cls, scheduler_name, **kwargs):
"""
Instantiate one of the schedulers using the scheduler names
Args:
scheduler_name:
A string variable for the scheduler name
Example:
>>> AutoScheduler.from_scheduler_name("asha")
"""
if scheduler_name in SCHEDULER_MAPPING.keys():
try:
return SCHEDULER_MAPPING[scheduler_name](**kwargs)
except TypeError:
return None
raise ValueError(
"Unrecognized scheduler {} for this kind of AutoScheduler: {}.\n"
"Scheduler name should be one of {}.".format(
scheduler_name, cls.__name__, ", ".join(SCHEDULER_MAPPING.keys())
)
)
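# A minimal sketch of the lookup and fallback behavior above: "asha" yields an
# ASHAScheduler instance, whereas "None" maps to None, which triggers the
# TypeError branch and therefore returns None.
if __name__ == "__main__":
    print(AutoScheduler.from_scheduler_name("asha", max_t=10))
    print(AutoScheduler.from_scheduler_name("None"))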

View File

@@ -0,0 +1,182 @@
import itertools
from collections import OrderedDict
import ray
from ray.tune.suggest.optuna import OptunaSearch
from flaml import CFO, BlendSearch
SEARCH_ALGO_MAPPING = OrderedDict(
[
("optuna", OptunaSearch),
("cfo", CFO),
("bs", BlendSearch),
("grid", None),
("gridbert", None),
("rs", None)
]
)
class AutoSearchAlgorithm:
"""
This is a class for getting the search algorithm based on the search algorithm name
(a string variable) instantiated as one of the algorithms of the library when
created with the `~flaml.nlp.hpo.AutoSearchAlgorithm.from_method_name` method.
This class cannot be instantiated directly using ``__init__()`` (throws an error).
"""
def __init__(self):
raise EnvironmentError(
"AutoSearchAlgorithm is designed to be instantiated "
"using the `AutoSearchAlgorithm.from_method_name(cls, search_algo_name, search_algo_args_mode,"
" hpo_search_space, **custom_hpo_args)` methods."
)
@classmethod
def from_method_name(cls, search_algo_name, search_algo_args_mode, hpo_search_space, **custom_hpo_args):
"""
Instantiating one of the search algorithm classes based on the search algorithm name, search algorithm
argument mode, hpo search space and other keyword args
Args:
search_algo_name:
A string variable that specifies the search algorithm name, e.g., "bs"
search_algo_args_mode:
A string variable that specifies the mode for the search algorithm args, e.g., "dft" means
initializing using the default mode
hpo_search_space:
The hpo search space
custom_hpo_args:
The customized arguments for the search algorithm (specified by user)
Example:
>>> from flaml.nlp.hpo.hpo_searchspace import AutoHPOSearchSpace
>>> search_space_hpo = AutoHPOSearchSpace.from_model_and_dataset_name(logger, "uni", "electra", "small", "glue", "rte")
>>> search_algo = AutoSearchAlgorithm.from_method_name(
...     "bs", "cus", search_space_hpo,
...     points_to_evaluate=[{"learning_rate": 1e-5, "num_train_epochs": 10}])
"""
assert hpo_search_space, "hpo_search_space needs to be specified for calling AutoSearchAlgorithm.from_method_name"
if not search_algo_name:
search_algo_name = "grid"
if search_algo_name in SEARCH_ALGO_MAPPING.keys():
if SEARCH_ALGO_MAPPING[search_algo_name] is None:
# grid search, random search and gridbert do not need a search algorithm object
return None
try:
"""
filter the customized hpo args in custom_hpo_args: keep only those that appear
in the argument list of the constructor of the chosen algorithm and drop the rest
"""
this_search_algo_kwargs = None
allowed_arguments = SEARCH_ALGO_MAPPING[search_algo_name].__init__.__code__.co_varnames
allowed_custom_args = {key: custom_hpo_args[key] for key in custom_hpo_args.keys() if
key in allowed_arguments}
"""
If the search_algo_args_mode is "dft", set the args to the default args, e.g.,the default args for
BlendSearch is "low_cost_partial_config": {"num_train_epochs": min_epoch,"per_device_train_batch_size"
: max(hpo_search_space["per_device_train_batch_size"].categories)},
"""
if search_algo_args_mode == "dft":
this_search_algo_kwargs = DEFAULT_SEARCH_ALGO_ARGS_MAPPING[search_algo_name](
"dft", hpo_search_space=hpo_search_space, **allowed_custom_args)
elif search_algo_args_mode == "cus":
this_search_algo_kwargs = DEFAULT_SEARCH_ALGO_ARGS_MAPPING[search_algo_name](
"cus", hpo_search_space=hpo_search_space, **allowed_custom_args)
"""
returning the hpo algorithm with the arguments
"""
return SEARCH_ALGO_MAPPING[search_algo_name](**this_search_algo_kwargs)
except KeyError:
return None
raise ValueError(
"Unrecognized method {} for this kind of AutoSearchAlgorithm: {}.\n"
"Method name should be one of {}.".format(
search_algo_name, cls.__name__, ", ".join(SEARCH_ALGO_MAPPING.keys())
)
)
@staticmethod
def grid2list(grid_config):
key_val_list = [[(key, each_val) for each_val in val_list['grid_search']]
for (key, val_list) in grid_config.items()]
config_list = [dict(x) for x in itertools.product(*key_val_list)]
return config_list
def get_search_algo_args_optuna(search_args_mode, hpo_search_space=None, **custom_hpo_args):
return {}
def default_search_algo_args_bs(search_args_mode, hpo_search_space=None, **custom_hpo_args):
assert hpo_search_space, "hpo_search_space needs to be specified for calling AutoSearchAlgorithm.from_method_name"
if "num_train_epochs" in hpo_search_space and \
isinstance(hpo_search_space["num_train_epochs"], ray.tune.sample.Categorical):
min_epoch = min(hpo_search_space["num_train_epochs"].categories)
else:
assert isinstance(hpo_search_space["num_train_epochs"], ray.tune.sample.Float)
min_epoch = hpo_search_space["num_train_epochs"].lower
default_search_algo_args = {
"low_cost_partial_config": {
"num_train_epochs": min_epoch,
"per_device_train_batch_size": max(hpo_search_space["per_device_train_batch_size"].categories),
},
}
if search_args_mode == "cus":
default_search_algo_args.update(custom_hpo_args)
return default_search_algo_args
def experiment_search_algo_args_bs(hpo_search_space=None):
if "num_train_epochs" in hpo_search_space and \
isinstance(hpo_search_space["num_train_epochs"], ray.tune.sample.Categorical):
min_epoch = min(hpo_search_space["num_train_epochs"].categories)
else:
assert isinstance(hpo_search_space["num_train_epochs"], ray.tune.sample.Float)
min_epoch = hpo_search_space["num_train_epochs"].lower
default_search_algo_args = {
"low_cost_partial_config": {
"num_train_epochs": min_epoch,
},
}
return default_search_algo_args
def default_search_algo_args_skopt(hpo_search_space=None):
return {}
def default_search_algo_args_dragonfly(hpo_search_space=None):
return {}
def default_search_algo_args_nevergrad(hpo_search_space=None):
return {}
def default_search_algo_args_hyperopt(hpo_search_space=None):
return {}
def default_search_algo_args_grid_search(search_args_mode, hpo_search_space=None, **custom_hpo_args):
return {}
def default_search_algo_args_random_search(search_args_mode, hpo_search_space=None, **custom_hpo_args):
return {}
DEFAULT_SEARCH_ALGO_ARGS_MAPPING = OrderedDict(
[
("optuna", get_search_algo_args_optuna),
("cfo", default_search_algo_args_bs),
("bs", default_search_algo_args_bs),
("grid", default_search_algo_args_grid_search),
("gridbert", default_search_algo_args_random_search)
]
)
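# A minimal sketch of AutoSearchAlgorithm.grid2list: it expects the ray.tune
# "grid_search" encoding and enumerates the cartesian product as plain config
# dicts (2 x 2 = 4 configurations in this toy grid).
if __name__ == "__main__":
    toy_grid = {
        "learning_rate": {"grid_search": [1e-5, 3e-5]},
        "num_train_epochs": {"grid_search": [3, 10]},
    }
    for each_config in AutoSearchAlgorithm.grid2list(toy_grid):
        print(each_config)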

View File

@@ -0,0 +1,52 @@
from collections import OrderedDict
from transformers.models.electra.modeling_electra import ElectraClassificationHead
from transformers.models.roberta.modeling_roberta import RobertaClassificationHead
MODEL_CLASSIFICATION_HEAD_MAPPING = OrderedDict(
[
("electra", ElectraClassificationHead),
("roberta", RobertaClassificationHead),
]
)
class AutoSeqClassificationHead:
"""
This is a class for getting classification head class based on the name of the LM
instantiated as one of the ClassificationHead classes of the library when
created with the `~flaml.nlp.huggingface.AutoSeqClassificationHead.from_model_type_and_config` method.
This class cannot be instantiated directly using ``__init__()`` (throws an error).
"""
def __init__(self):
raise EnvironmentError(
"AutoSeqClassificationHead is designed to be instantiated "
"using the `AutoSeqClassificationHead.from_model_type_and_config(cls, model_type, config)` methods."
)
@classmethod
def from_model_type_and_config(cls, model_type, config):
"""
Instantiate one of the classification head classes from the mode_type and model configuration.
Args:
model_type:
A string which describes the model type, e.g., "electra"
config (:class:`~transformers.PretrainedConfig`):
The huggingface class of the model's configuration:
Examples::
>>> from transformers import AutoConfig
>>> model_config = AutoConfig.from_pretrained("google/electra-base-discriminator")
>>> AutoSeqClassificationHead.from_model_type_and_config("electra", model_config)
"""
if model_type in MODEL_CLASSIFICATION_HEAD_MAPPING.keys():
return MODEL_CLASSIFICATION_HEAD_MAPPING[model_type](config)
raise ValueError(
"Unrecognized configuration class {} for this kind of AutoModel: {}.\n"
"Model type should be one of {}.".format(
config.__class__, cls.__name__, ", ".join(MODEL_CLASSIFICATION_HEAD_MAPPING.keys())
)
)

View File

@@ -0,0 +1,121 @@
import copy
import os
import transformers
from ray import tune
import torch
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
transformers.logging.set_verbosity_error()
class TrainerForAutoTransformers(transformers.Trainer):
"""
Overriding transformers.Trainer.
Args:
model (:class:`~transformers.PreTrainedModel` or :obj:`torch.nn.Module`, `optional`):
the model to be trained, as in transformers.Trainer
"""
def get_optimizers(
self, num_training_steps
):
self.current_optimizer, self.current_scheduler = super().get_optimizers(num_training_steps)
return (self.current_optimizer, self.current_scheduler)
def evaluate(self,
eval_dataset=None):
"""
Overriding transformers.Trainer.evaluate by saving state with save_state
Args:
eval_dataset:
the dataset to be evaluated
"""
import wandb
eval_dataloader = self.get_eval_dataloader(eval_dataset)
output = self.prediction_loop(
eval_dataloader, description="Evaluation")
self.log(output.metrics)
self.save_state()
for key in list(output.metrics.keys()):
if key.startswith("eval_"):
output.metrics[key[5:]] = output.metrics[key]
tune.report(**output.metrics)
return output.metrics
def save_state(self):
"""
Overriding transformers.Trainer.save_state. Only by saving the state
can best_trial.get_best_checkpoint return a non-empty value.
"""
with tune.checkpoint_dir(step=self.state.global_step) as checkpoint_dir:
self.args.output_dir = checkpoint_dir
# This is the directory name that Huggingface requires.
output_dir = os.path.join(
self.args.output_dir,
f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}")
self.save_model(output_dir)
torch.save(self.optimizer.state_dict(),
os.path.join(output_dir, "optimizer.pt"))
torch.save(self.lr_scheduler.state_dict(),
os.path.join(output_dir, "scheduler.pt"))
@staticmethod
def convert_num_train_epochs_to_max_steps(
num_train_epochs: int,
num_train_examples: int,
per_device_train_batch_size: int,
device_count: int):
return int(num_train_epochs * num_train_examples / per_device_train_batch_size / device_count)
@staticmethod
def convert_max_steps_to_num_train_epochs(
max_steps: int,
num_train_examples: int,
per_device_train_batch_size: int,
device_count: int):
return float(max_steps * per_device_train_batch_size * device_count) / num_train_examples
@staticmethod
def convert_warmup_ratio_to_warmup_steps(
warmup_ratio,
max_steps=None,
num_train_epochs=None,
num_train_examples=None,
per_device_train_batch_size=None,
device_count=None):
if max_steps:
return int(warmup_ratio * max_steps)
max_steps = TrainerForAutoTransformers.convert_num_train_epochs_to_max_steps(
num_train_epochs,
num_train_examples,
per_device_train_batch_size,
device_count)
return int(warmup_ratio * max_steps)
@staticmethod
def convert_warmup_steps_to_warmup_ratio(
warmup_steps: int,
num_train_epochs: int,
num_train_examples: int,
per_device_train_batch_size: int,
device_count: int):
max_steps = TrainerForAutoTransformers.convert_num_train_epochs_to_max_steps(
num_train_epochs,
num_train_examples,
per_device_train_batch_size,
device_count)
return float(warmup_steps / max_steps)
@staticmethod
def resolve_hp_conflict(search_space_dict):
if "max_steps" in search_space_dict and "num_train_epochs" in search_space_dict:
del search_space_dict["num_train_epochs"]
if "warmup_ratio" in search_space_dict and "warmup_steps" in search_space_dict:
del search_space_dict["warmup_ratio"]
return search_space_dict
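# A worked sketch of the step/epoch conversion helpers above; the dataset size
# (2490 examples, roughly the size of the GLUE-RTE training split) is only an
# illustrative assumption.
if __name__ == "__main__":
    max_steps = TrainerForAutoTransformers.convert_num_train_epochs_to_max_steps(
        num_train_epochs=3,
        num_train_examples=2490,
        per_device_train_batch_size=32,
        device_count=1)
    print(max_steps)  # int(3 * 2490 / 32 / 1) = 233
    warmup_steps = TrainerForAutoTransformers.convert_warmup_ratio_to_warmup_steps(
        0.1, max_steps=max_steps)
    print(warmup_steps)  # int(0.1 * 233) = 23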

View File

View File

@@ -0,0 +1,677 @@
import re
import pathlib
import os
from azure.storage.blob import BlobServiceClient, ContainerClient
from transformers import AutoConfig
from ..utils import get_wandb_azure_key
from datetime import datetime
from dataclasses import dataclass, field
from ..hpo.grid_searchspace_auto import HF_MODEL_LIST
import json
@dataclass
class JobID:
dat: list = field(default=None)
subdat: str = field(default=None)
mod: str = field(default=None)
spa: str = field(default=None)
arg: str = field(default=None)
alg: str = field(default=None)
pru: str = field(default=None)
pre_full: str = field(default=None)
pre: str = field(default=None)
presz: str = field(default=None)
spt: str = field(default=None)
rep: int = field(default=0)
sddt: int = field(default=None)
sdhf: int = field(default=None)
def __init__(self,
console_args=None):
if console_args:
self.set_jobid_from_console_args(console_args)
def set_unittest_config(self):
"""
set the JobID config for unit test
"""
self.dat = ["glue"]
self.subdat = "mrpc"
self.mod = "hpo"
self.spa = "uni_test"
self.arg = "dft"
self.alg = "bs"
self.pru = "None"
self.pre_full = "google/mobilebert-uncased"
self.pre = "mobilebert"
self.presz = "small"
self.spt = "rspt"
self.rep = 0
self.sddt = 43
self.sdhf = 42
def is_match(self, partial_jobid):
"""
return a boolean indicating whether the current JobID matches the partial JobID
defined in partial_jobid. For example,
self = JobID(dat = ['glue'],
subdat = 'cola',
mod = 'bestnn',
spa = 'buni',
arg = 'cus',
alg = 'bs',
pru = 'None',
pre = 'funnel',
presz = 'xlarge',
spt = 'rspt',
rep = 0,
sddt = 43,
sdhf = 42)
partial_jobid1 = JobID(dat = ['glue'],
subdat = 'cola',
mod = 'hpo')
partial_jobid2 = JobID(dat = ['glue'],
subdat = 'cola',
mod = 'bestnn')
return False for partial_jobid1 and True for partial_jobid2
"""
is_not_match = False
for key, val in partial_jobid.__dict__.items():
if val is None:
continue
if getattr(self, key) != val:
is_not_match = True
return not is_not_match
def to_wandb_string(self):
"""
convert the current JobID into the string used to name the wandb run group
"""
field_dict = self.__dict__
keytoval_str = "_".join([JobID.dataset_list_to_str(field_dict[key], key)
if type(field_dict[key]) == list
else str(field_dict[key])
for key in field_dict.keys() if not key.endswith("_full")])
return keytoval_str
def to_jobid_string(self):
"""
convert the current JobID into a blob name string which contains all the fields
"""
list_keys = list(JobID.__dataclass_fields__.keys())
field_dict = self.__dict__
keytoval_str = "_".join([key + "=" + JobID.dataset_list_to_str(field_dict[key], key)
if type(field_dict[key]) == list
else key + "=" + str(field_dict[key])
for key in list_keys if not key.endswith("_full")])
return keytoval_str
def to_partial_jobid_string(self):
"""
convert the current JobID into a blob name string which only contains the fields that have been assigned a value
"""
list_keys = list(JobID.__dataclass_fields__.keys())
field_dict = self.__dict__ # field_dict contains fields whose values are not None
keytoval_str = "_".join([key + "=" + JobID.dataset_list_to_str(field_dict[key], key)
if type(field_dict[key]) == list
else key + "=" + str(field_dict[key])
for key in list_keys if key in field_dict.keys()])
return keytoval_str
@staticmethod
def blobname_to_jobid_dict(keytoval_str):
"""
converting an azure blobname to a JobID config,
e.g., blobname = "dat=glue_subdat=cola_mod=bestnn_spa=buni_arg=cus_
alg=bs_pru=None_pre=funnel_presz=xlarge_spt=rspt_rep=0_sddt=43_sdhf=42.json"
the converted jobid dict = {dat: ['glue'], subdat: 'cola', mod: 'bestnn',
spa: 'buni', arg: 'cus', alg: 'bs', pru: 'None',
pre: 'funnel', presz: 'xlarge', spt: 'rspt',
rep: 0, sddt: 43, sdhf: 42}
"""
field_keys = [key for key in
list(JobID.__dataclass_fields__.keys()) if not key.endswith("_full")]
regex_expression = ".*" + "_".join([key + "=(?P<" + key + ">.*)" for key in field_keys]) + r"\.(json|zip)"
result = re.search(regex_expression, keytoval_str)
if result:
result_dict = {}
for key in field_keys:
if key == "dat":
result_dict[key] = [result.group(key)]
elif key == "rep":
try:
result_dict[key] = int(result.group(key))
except (IndexError, ValueError):
result_dict[key] = -1
else:
result_dict[key] = result.group(key)
return result_dict
else:
return None
@staticmethod
def dataset_list_to_str(dataset_name, key):
if key == "dat":
assert isinstance(dataset_name, list)
return "-".join(dataset_name)
else:
return dataset_name
def set_jobid_from_arg_list(self,
**jobid_list
):
"""
set the fields of the current JobID from keyword arguments
"""
for key in jobid_list.keys():
assert key in JobID.__dataclass_fields__.keys()
setattr(self, key, jobid_list[key])
@staticmethod
def convert_blobname_to_jobid(blobname):
"""
converting a blobname string to a JobID object
"""
jobconfig_dict = JobID.blobname_to_jobid_dict(blobname)
if jobconfig_dict:
jobconfig = JobID()
jobconfig.set_jobid_from_arg_list(**jobconfig_dict)
return jobconfig
else:
return None
@staticmethod
def get_full_data_name(dataset_name, subdataset_name=None):
"""
convert a dataset name and sub dataset name to a full dataset name
"""
full_dataset_name = dataset_name
if subdataset_name:
full_dataset_name = full_dataset_name + "_" + subdataset_name
return full_dataset_name
def get_jobid_full_data_name(self):
"""
get the full dataset name of the current JobID object
"""
return JobID.get_full_data_name(JobID.dataset_list_to_str(self.dat, "dat"), self.subdat)
@staticmethod
def _extract_model_type_with_keywords_match(pre_full):
matched_model_type = []
for each_model_type in HF_MODEL_LIST:
if each_model_type in pre_full:
matched_model_type.append(each_model_type)
assert len(matched_model_type) > 0
return max(enumerate(matched_model_type), key=lambda x: len(x[1]))[1]
@staticmethod
def extract_model_type(full_model_name):
model_config = AutoConfig.from_pretrained(full_model_name)
config_json_file = model_config.get_config_dict(full_model_name)[0]
try:
model_type = config_json_file["model_type"]
except KeyError:
model_type = JobID._extract_model_type_with_keywords_match(full_model_name)
return model_type
def set_jobid_from_console_args(self, console_args):
self.dat = console_args.dataset_subdataset_name.split(":")[0].split(",")
self.subdat = console_args.dataset_subdataset_name.split(":")[1]
self.mod = console_args.algo_mode
self.spa = console_args.space_mode
self.arg = console_args.search_alg_args_mode
self.alg = console_args.algo_name
self.pru = console_args.pruner
self.pre_full = console_args.pretrained_model_size.split(":")[0]
self.pre = JobID.extract_model_type(self.pre_full)
self.presz = console_args.pretrained_model_size.split(":")[1]
self.spt = console_args.resplit_mode
self.rep = console_args.rep_id
self.sddt = console_args.seed_data
self.sdhf = console_args.seed_transformers
def legacy_old_blobname_to_new_blobname(self,
old_blobname):
spa_id2val = {
0: "gnr",
1: "uni"
}
alg_id2val = {
0: "bs",
1: "optuna",
2: "cfo"
}
pre_id2val = {
0: "xlnet-base-cased",
1: "albert-large-v1",
2: "distilbert-base-uncased",
3: "microsoft/deberta-base",
4: "funnel-transformer/small-base",
5: "microsoft/deberta-large",
6: "funnel-transformer/large-base",
7: "funnel-transformer/intermediate-base",
8: "funnel-transformer/xlarge-base"
}
presz_id2val = {
0: "base",
1: "small",
2: "base",
3: "base",
4: "base",
5: "large",
6: "large",
7: "intermediate",
8: "xlarge"
}
spt_id2val = {
0: "rspt",
1: "ori"
}
result_grid = re.search(r".*_mod(el)?(?P<model_id>\d+)_None_None(_spt(?P<split_id>\d+))?_rep(?P<rep_id>\d+).log",
old_blobname)
result = re.search(
r".*_mod(el)?(?P<model_id>\d+)_(alg)?(?P<algo_id>\d+)_(spa)?"
r"(?P<space_id>\d+)(_spt(?P<split_id>\d+))?_rep(?P<rep_id>\d+).log",
old_blobname)
if result_grid:
dat = [old_blobname.split("/")[1].split("_")[0]]
subdat = old_blobname.split("/")[1].split("_")[1]
mod = "hpo"
spa = None
arg = None
alg = None
pru = None
pre = pre_id2val[int(result_grid.group("model_id"))]
presz = presz_id2val[int(result_grid.group("model_id"))]
try:
spt = spt_id2val[int(result_grid.group("split_id"))]
except (KeyError, TypeError):
spt = spt_id2val[0]
rep = None
self.set_jobid_from_arg_list(dat=dat, subdat=subdat, mod=mod, spa=spa, arg=arg, alg=alg,
pru=pru, pre=pre, presz=presz, spt=spt, rep=rep)
return self.to_jobid_string()
if result:
dat = [old_blobname.split("/")[1].split("_")[0]]
subdat = old_blobname.split("/")[1].split("_")[1]
mod = "hpo"
spa = spa_id2val[int(result.group("space_id"))]
arg = "dft"
alg = alg_id2val[int(result.group("algo_id"))]
pru = "None"
pre = pre_id2val[int(result.group("model_id"))]
presz = presz_id2val[int(result.group("model_id"))]
try:
spt = spt_id2val[int(result.group("split_id"))]
except (KeyError, TypeError):
spt = spt_id2val[0]
rep = int(result.group("rep_id"))
self.set_jobid_from_arg_list(dat=dat, subdat=subdat, mod=mod, spa=spa, arg=arg, alg=alg,
pru=pru, pre=pre, presz=presz, spt=spt, rep=rep)
return self.to_jobid_string()
return None
class AzureUtils:
def __init__(self,
root_log_path=None,
console_args=None,
jobid=None,
autohf=None):
if root_log_path:
self.root_log_path = root_log_path
else:
self.root_log_path = "logs_azure"
self.jobid = jobid
self.console_args = console_args
self.autohf = autohf
if console_args:
wandb_key, azure_key, container_name = get_wandb_azure_key(console_args.key_path)
self._container_name = container_name
self._azure_key = azure_key
def _get_complete_connection_string(self):
return "DefaultEndpointsProtocol=https;AccountName=docws5141197765;AccountKey=" \
+ self._azure_key + ";EndpointSuffix=core.windows.net"
def _init_azure_clients(self):
connection_string = self._get_complete_connection_string()
container_client = ContainerClient.from_connection_string(conn_str=connection_string,
container_name=self._container_name)
return container_client
def _init_blob_client(self,
local_file_path):
connection_string = self._get_complete_connection_string()
blob_service_client = BlobServiceClient.from_connection_string(connection_string)
blob_client = blob_service_client.get_blob_client(container=self._container_name, blob=local_file_path)
return blob_client
def upload_local_file_to_azure(self, local_file_path):
blob_client = self._init_blob_client(local_file_path)
with open(local_file_path, "rb") as fin:
blob_client.upload_blob(fin, overwrite=True)
def download_azure_blob(self, blobname):
blob_client = self._init_blob_client(blobname)
pathlib.Path(re.search("(?P<parent_path>^.*)/[^/]+$", blobname).group("parent_path")).mkdir(
parents=True, exist_ok=True)
with open(blobname, "wb") as fout:
fout.write(blob_client.download_blob().readall())
def write_exception(self):
result_json = {
"timestamp": datetime.now(),
}
local_file_path = self.generate_local_json_path()
self.create_local_json_and_upload(result_json, local_file_path)
def extract_log_from_analysis(self,
analysis):
"""
Extracting a json object for storing the key information returned from tune.run
"""
json_log = []
for each_trial in analysis.trials:
trial_id = each_trial.trial_id
start_time = each_trial.start_time
last_update_time = each_trial.last_update_time
config = each_trial.config
try:
metric_score = each_trial.metric_analysis["eval_" + analysis.default_metric]
time_stamp = each_trial.metric_analysis['timestamp']
json_log.append({"trial_id": trial_id,
"start_time": start_time,
"last_update_time": last_update_time,
"config": config,
"metric_score": metric_score,
"time_stamp": time_stamp})
except KeyError:
pass
return json_log
def write_autohf_output(self,
json_log=None,
valid_metric=None,
predictions=None,
duration=None):
"""
write the key info from a job and upload to azure blob storage
"""
local_file_path = self.generate_local_json_path()
output_json = {}
if json_log:
output_json["val_log"] = json_log
if valid_metric:
output_json["valid_metric"] = valid_metric
if duration:
output_json["duration"] = duration
if len(output_json) > 0:
self.create_local_json_and_upload(output_json, local_file_path)
if predictions is not None:
self.create_local_prediction_and_upload(local_file_path, predictions)
def generate_local_json_path(self):
"""
return a path string for storing the json file locally
"""
full_dataset_name = self.jobid.get_jobid_full_data_name()
jobid_str = self.jobid.to_jobid_string()
local_file_path = os.path.join(self.root_log_path, full_dataset_name, jobid_str + ".json")
pathlib.Path(os.path.join(self.root_log_path, full_dataset_name)).mkdir(parents=True, exist_ok=True)
return local_file_path
def create_local_json_and_upload(self, result_json, local_file_path):
with open(local_file_path, "w") as fout:
fout.write(json.dumps(result_json))
fout.flush()
self.upload_local_file_to_azure(local_file_path)
def legacy_to_json(self):
container_client = self._init_azure_clients()
for old_blob in container_client.list_blobs():
new_jobid_str = self.jobid.legacy_old_blobname_to_new_blobname(old_blob.name)
if new_jobid_str:
self.download_azure_blob(old_blob.name)
with open(old_blob.name, "r") as fin:
alllines = fin.readlines()
wandb_group_name = alllines[0].rstrip("\n:")
timestamp = re.search(
r"timestamp:(?P<timestamp>.*):",
alllines[1].strip("\n")).group("timestamp")
duration = re.search(
r"duration:(?P<duration>.*)$",
alllines[3].strip("\n")).group("duration")
sample_num = int(re.search(
r"sample_num: (?P<sample_num>\d+)$",
alllines[4].strip("\n")).group("sample_num"))
validation = {"accuracy": float(re.search(
"validation accuracy: (?P<validation>.*)$",
alllines[2].strip("\n")).group("validation"))}
test = None
if len(alllines) > 6:
result_test = re.search("test accuracy:(?P<test>.*)$", alllines[6].strip("\n"))
if result_test:
test = json.loads(result_test.group("test"))
yml_file = None
if len(alllines) > 8:
if alllines[8].startswith("aml"):
yml_file = alllines[8].strip("\n")
new_json = {"wandb_group_name": wandb_group_name,
"validation": validation,
"test": test,
"timestamp": timestamp,
"duration": duration,
"sample_num": sample_num,
"yml_file": yml_file}
full_dataset_name = self.jobid.get_jobid_full_data_name()
new_blobname = os.path.join("logs_azure/", full_dataset_name, new_jobid_str + ".json")
self.create_local_json_and_upload(new_json, new_blobname)
def create_local_prediction_and_upload(self,
local_json_file,
predictions):
"""
store predictions (a .zip file) locally and upload
"""
azure_save_file_name = local_json_file.split("/")[-1][:-5]
local_archive_path = self.autohf.output_prediction(predictions,
output_prediction_path=self.console_args.data_root_dir + "result/",
output_zip_file_name=azure_save_file_name)
self.upload_local_file_to_azure(local_archive_path)
def get_ranked_configs(self, metric_mode):
"""
extract the configs (ranked in descending order by the score) from the azure file of the current object
(defined by self.jobid)
"""
azure_file_path = self.generate_local_json_path()
self.download_azure_blob(azure_file_path)
json_log = json.load(open(azure_file_path, "r"))
assert "val_log" in json_log
trialid_to_score = {}
trialid_to_config = {}
for each_entry in json_log["val_log"]:
trial_id = each_entry["trial_id"]
config = each_entry["config"]
this_score = each_entry["metric_score"][metric_mode]
trialid_to_config[trial_id] = config
trialid_to_score[trial_id] = this_score
sorted_trialid_to_score = sorted(trialid_to_score.items(), key=lambda x: x[1], reverse=True)
return [trialid_to_config[entry[0]] for entry in sorted_trialid_to_score]
@staticmethod
def is_after_earliest_time(this_blob, earliest_time):
import pytz
utc = pytz.UTC
if this_blob.last_modified >= utc.localize(datetime(earliest_time[0], earliest_time[1], earliest_time[2])):
return True
return False
def get_blob_list_matching_partial_jobid(self, root_log_path, partial_jobid, earliest_time=None):
"""
get all blobs whose jobid configs match the partial_jobid
"""
blob_list = []
container_client = self._init_azure_clients()
jobid_config = JobID()
for each_blob in container_client.list_blobs():
if each_blob.name.startswith(root_log_path):
each_jobconfig = jobid_config.convert_blobname_to_jobid(each_blob.name)
is_append = False
if each_jobconfig:
if each_jobconfig.is_match(partial_jobid):
is_append = True
if earliest_time and not AzureUtils.is_after_earliest_time(each_blob, earliest_time):
is_append = False
if is_append:
blob_list.append((each_jobconfig, each_blob))
return blob_list
@staticmethod
def extract_config_and_score(blobname):
data_json = json.load(open(blobname, "r"))
return [(x['config'], x['metric_score']["max"], x['start_time']) for x in data_json['val_log']]
def get_config_and_score_from_partial_jobid(self,
root_log_path,
partial_jobid,
group_attrs,
method,
earliest_time=None):
"""
get the best config and best score for each job matching the partial_jobid
"""
matched_blob_list = self.get_blob_list_matching_partial_jobid(
root_log_path,
partial_jobid,
earliest_time=earliest_time)
group_dict = {}
for (each_jobconfig, each_blob) in matched_blob_list:
self.download_azure_blob(each_blob.name)
config_and_score = AzureUtils.extract_config_and_score(each_blob.name)
if method == "unsorted":
sorted_config_and_score = config_and_score
elif method == "sort_time":
sorted_config_and_score = sorted(config_and_score, key=lambda x: x[2], reverse=False)
else:
sorted_config_and_score = sorted(config_and_score, key=lambda x: x[1], reverse=True)
group_attr_list = []
for each_attr in group_attrs:
group_val = getattr(each_jobconfig, each_attr)
if isinstance(group_val, list):
group_attr_list.append(JobID.dataset_list_to_str(group_val, each_attr))
else:
group_attr_list.append(group_val)
group_attr_tuple = tuple(group_attr_list)
group_dict.setdefault(group_attr_tuple, [])
group_dict[group_attr_tuple].append([(config, score, each_blob.name)
for (config, score, ts) in sorted_config_and_score])
return group_dict
def get_validation_perf(self, console_args=None, partial_jobid_config=None):
"""
get the validation score for all blobs matching the partial_jobid_config
"""
if partial_jobid_config.pre == "electra":
dataset_namelist = ["wnli", "rte", "mrpc", "cola", "stsb", "sst2", "qnli", "mnli"]
else:
dataset_namelist = ["wnli", "rte", "mrpc", "cola", "stsb", "sst2"]
dataset_vallist1 = [0] * len(dataset_namelist)
dataset_vallist2 = [0] * len(dataset_namelist)
matched_blob_list = self.get_blob_list_matching_partial_jobid(console_args.azure_root_log_path,
partial_jobid_config)
for (each_jobconfig, each_blob) in matched_blob_list:
subdat_name = each_jobconfig.subdat
self.download_azure_blob(each_blob.name)
data_json = json.load(open(each_blob.name, "r"))
print(len(data_json["val_log"]))
validation_metric = data_json['valid_metric']
try:
dataset_idx = dataset_namelist.index(subdat_name)
dataset_vallist1[dataset_idx], dataset_vallist2[dataset_idx] \
= self.get_validation_metricstr(validation_metric)
except ValueError:
pass
# print(" & ".join(dataset_vallist1))
# print(", ,".join(dataset_vallist2))
def get_validation_metricstr(self, validation_metric):
"""
get a string representing validations for pasting to Google spreadsheet
"""
validation_str1 = validation_str2 = ""
is_first = True
for key in ["f1", "accuracy", "pearson", "spearmanr", "matthews_correlation"]:
if "eval_" + key in validation_metric.keys():
if is_first:
validation_str1 += str("%.1f" % (validation_metric["eval_" + key] * 100))
validation_str2 += str(validation_metric["eval_" + key] * 100)
is_first = False
else:
validation_str1 += "/" + str("%.1f" % (validation_metric["eval_" + key] * 100))
validation_str2 += "," + str(validation_metric["eval_" + key] * 100)
return validation_str1, validation_str2
def get_test_perf(self, partial_jobid_config=None, result_root_dir=None):
"""
get the test scores for all blobs matching the partial_jobid_config
"""
import shutil
from flaml.nlp.dataset.submission_auto import file_name_mapping_glue, output_blank_tsv
matched_blob_list = self.get_blob_list_matching_partial_jobid("data/", partial_jobid_config)
partial_jobid_str = partial_jobid_config.to_partial_jobid_string()
output_dir = os.path.join(result_root_dir, partial_jobid_str)
if os.path.exists(output_dir):
assert os.path.isdir(output_dir)
else:
os.mkdir(output_dir)
output_blank_tsv(output_dir)
for (each_jobconfig, each_blob) in matched_blob_list:
subdat_name = each_jobconfig.subdat
self.download_azure_blob(each_blob.name)
import zipfile
if os.path.exists(each_blob.name[:-4]):
assert os.path.isdir(each_blob.name[:-4])
else:
os.mkdir(each_blob.name[:-4])
with zipfile.ZipFile(each_blob.name, 'r') as zip_ref:
zip_ref.extractall(each_blob.name[:-4])
src = os.path.join(each_blob.name[:-4], file_name_mapping_glue[subdat_name][0])
dst = os.path.join(output_dir, file_name_mapping_glue[subdat_name][0])
shutil.copy(src, dst)
shutil.make_archive(os.path.join(output_dir), 'zip', output_dir)
def get_best_perf_config(self, console_args, jobid_config):
"""
get the config of the best performed trial
"""
matched_blob_list = self.get_blob_list_matching_partial_jobid(console_args.azure_root_log_path, jobid_config)
assert len(matched_blob_list) == 1, \
"there must be exactly one blob matching the given jobid config, found {}".format(len(matched_blob_list))
each_jobconfig, each_blob = matched_blob_list[0]
self.download_azure_blob(each_blob.name)
data_json = json.load(open(each_blob.name, "r"))
sorted_entries = sorted(data_json['val_log'], key=lambda x: x['metric_score']['max'], reverse=True)
best_config = sorted_entries[0]['config']
if jobid_config.subdat != "mrpc":
best_score = sorted_entries[0]['metric_score']['max']
else:
best_score = (data_json["valid_metric"]["eval_f1"], data_json["valid_metric"]["eval_accuracy"])
return best_config, best_score
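# A minimal round-trip sketch for JobID: serialize a fully specified JobID into
# a blob-name string, parse it back, and match it against a partial JobID. No
# Azure connection is needed for this part; the unit-test config is used as the
# example.
if __name__ == "__main__":
    full_jobid = JobID()
    full_jobid.set_unittest_config()
    blob_name = full_jobid.to_jobid_string() + ".json"
    print(blob_name)
    parsed_jobid = JobID.convert_blobname_to_jobid(blob_name)
    partial_jobid = JobID()
    partial_jobid.subdat = "mrpc"
    print(parsed_jobid.is_match(partial_jobid))  # True: only the set fields are compared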

View File

@@ -0,0 +1,357 @@
def extract_ranked_config_score(console_args, partial_config_dict):
from .azure_utils import AzureUtils
azure_utils = AzureUtils(console_args=console_args)
for method, each_partial_config in partial_config_dict.items():
dataset2configscorelist = azure_utils.get_config_and_score_from_partial_config(each_partial_config,
["dat", "subdat"], method)
for each_dataset, configscorelist in dataset2configscorelist.items():
for config_idx in range(len(configscorelist)):
avg_scores = configscorelist[config_idx][0][1]
top_config = configscorelist[config_idx][0][0]
print(avg_scores)
print(top_config)
# print(method + "," + str(each_dataset) + ",rep=" + str(config_idx))
# print("avg score :" + str(avg_scores))
# print(''.join(['{0}={1}\n'.format(key, top_config[key]) for key in sorted(top_config.keys())]))
def extract_sorted_config_list(dataset2configscorelist, topk):
dataset2topkconfigs = {}
for dataset, configscorelist in dataset2configscorelist.items():
all_configscorelist = []
for scorelist in configscorelist:
for item in scorelist:
if item[0] not in [x[0] for x in all_configscorelist]:
all_configscorelist.append(item)
sorted_all_configscorelist = sorted(all_configscorelist, key=lambda x: x[1], reverse=True)
topk_configs = []
for each_hp in ("learning_rate", "num_train_epochs", "per_device_train_batch_size", "warmup_ratio",
"weight_decay", "adam_epsilon"):
topk_configs.append((each_hp, [sorted_all_configscorelist[x][0][each_hp] for x in range(topk)]))
topk_configs.append(("perf", [sorted_all_configscorelist[x][1] for x in range(topk)]))
dataset2topkconfigs[dataset] = topk_configs
return dataset2topkconfigs
def dict2tuple(this_dict):
tuple_list = []
for key in sorted(this_dict.keys()):
tuple_list.append(this_dict[key])
return tuple(tuple_list)
def merge_configscore_list(small_dataset2configscorelist):
dataset2merged_configscorelist = {}
for (dataset, each_configscore_list) in small_dataset2configscorelist.items():
merged_configscore_list = {}
for rep_id in range(len(each_configscore_list)):
for each_configscore_entry in each_configscore_list[rep_id]:
is_exist = False
for configscore in merged_configscore_list.keys():
if configscore[0] == each_configscore_entry[0]:
is_exist = True
break
if is_exist is False:
merged_configscore_list[dict2tuple(each_configscore_entry[0])] = each_configscore_entry[1]
dataset2merged_configscorelist[dataset] = merged_configscore_list
return dataset2merged_configscorelist
def get_result(console_args, partial_jobid_config):
from .azure_utils import AzureUtils, JobID
azure_utils = AzureUtils(console_args=console_args)
dataset2configscorelist = \
azure_utils.get_config_and_score_from_partial_config(
console_args.azure_root_log_path,
partial_jobid_config,
["dat", "subdat"],
"hpo")
for dataset, configscore_list in dataset2configscorelist.items():
for rep_id in range(len(configscore_list)):
config_dict = configscore_list[rep_id][0][0]
score = configscore_list[rep_id][0][1]
print(dataset, rep_id)
print_config(config_dict)
print(score)
print()
def print_config(config_dict):
for key in sorted(config_dict.keys()):
if key in ("attention_probs_dropout_prob", "hidden_dropout_prob", "seed"):
continue
if key == "per_device_train_batch_size":
short_key = "batch_size"
elif key == "num_train_epochs":
short_key = "epochs"
else:
short_key = key
print(short_key, config_dict[key])
def compare_small_vs_large(console_args):
from .azure_utils import AzureUtils, JobID
azure_utils = AzureUtils(console_args=console_args)
partial_jobid_config = JobID()
partial_jobid_config.pre = "deberta"
partial_jobid_config.mod = "hpo"
partial_jobid_config.spa = "uni"
partial_jobid_config.presz = "base"
small_dataset2configscorelist = azure_utils.get_config_and_score_from_partial_config(partial_jobid_config,
["dat", "subdat"], "list")
small_mergedconfiglist = merge_configscore_list(small_dataset2configscorelist)
partial_jobid_config = JobID()
partial_jobid_config.pre = "deberta"
partial_jobid_config.mod = "hpo"
partial_jobid_config.spa = "uni"
partial_jobid_config.presz = "large"
large_dataset2configscorelist = azure_utils.get_config_and_score_from_partial_config(partial_jobid_config,
["dat", "subdat"], "hpo")
large_mergedconfiglist = merge_configscore_list(large_dataset2configscorelist)
for (each_dataset, merged_small_configlist) in small_mergedconfiglist.items():
merged_large_configlist = large_mergedconfiglist[each_dataset]
print(each_dataset)
print()
for (each_tuple, large_score) in sorted(merged_large_configlist.items(), key=lambda x: x[1], reverse=True):
# small_score = merged_small_configlist[each_tuple]
is_in_onlysmall = each_tuple in small_mergedconfiglist[each_dataset]
for each_val in each_tuple:
print(each_val, end=", ")
print(large_score, is_in_onlysmall, sep=",")
print()
for (each_tuple, small_score) in \
sorted(small_mergedconfiglist[each_dataset].items(), key=lambda x: x[1], reverse=True):
is_in_large = each_tuple in large_mergedconfiglist[each_dataset]
for each_val in each_tuple:
print(each_val, end=", ")
print(small_score, is_in_large, sep=",")
def check_conflict(console_args, partial_jobid_config_list):
from .azure_utils import AzureUtils, JobID
azure_utils = AzureUtils(console_args=console_args)
for each_partial_config in partial_jobid_config_list:
dataset2configscorelist = \
azure_utils.get_config_and_score_from_partial_config(
console_args.azure_root_log_path,
each_partial_config,
["dat", "subdat"],
"unsorted")
for (dataset, configscorelists) in dataset2configscorelist.items():
config2score = {}
for each_configscorelist in configscorelists:
for (config, score, blobname) in each_configscorelist:
config_dict = dict2tuple(config)
try:
config2score[config_dict].append((score, blobname))
except KeyError:
config2score.setdefault(config_dict, [])
config2score[config_dict].append((score, blobname))
dup_keys = [config for config in config2score.keys() if len(config2score[config]) > 1]
dupkey_count = [len(set([y[0] for y in config2score[x]])) for x in dup_keys]
print(dataset)
print(len(config2score))
print(len(dupkey_count))
print(dupkey_count)
def print_cfo(console_args):
from .azure_utils import JobID, AzureUtils
jobid_config = JobID()
jobid_config.mod = "bestnn"
jobid_config.spa = "buni"
jobid_config.alg = "bs"
jobid_config.pre = "funnel"
jobid_config.presz = "xlarge"
for each_rep in range(3):
jobid_config.rep = each_rep
azure_utils = AzureUtils(console_args=console_args, jobid=jobid_config)
dataset2configscorelist = \
azure_utils.get_config_and_score_from_partial_config(
console_args.azure_root_log_path,
jobid_config,
["dat", "subdat"],
"sort_time")
dataset = ('glue', 'mrpc')
configscorelist = dataset2configscorelist[dataset]
count = 0
print(dataset)
for (config, score, blobname) in sorted(configscorelist[0], key=lambda x: x[1], reverse=True)[0:1]:
print(count)
print(score)
print_config(config)
print()
count += 1
def download_validation(console_args, result_root_dir):
from .azure_utils import JobID, AzureUtils
partial_jobid_config = JobID()
partial_jobid_config.mod = "grid"
partial_jobid_config.pre = "roberta"
partial_jobid_config.presz = "base"
# partial_jobid_config.alg = "optuna"
# partial_jobid_config.pru = "asha"
partial_jobid_config.rep = 0
azure_utils = AzureUtils(console_args=console_args, jobid=partial_jobid_config)
azure_utils.get_validation_perf(console_args=console_args, partial_jobid_config=partial_jobid_config)
azure_utils.get_test_perf(partial_jobid_config, result_root_dir)
def get_result_str(jobid_config, val_score, test_score, best_config, subdat2config=None, mode="grid"):
result_str = jobid_config.subdat.upper() + ","
if jobid_config.alg:
result_str += jobid_config.alg.upper().replace("OPTUNA", "Optuna")
if jobid_config.pru is not None and jobid_config.pru != "None":
result_str += "+" + jobid_config.pru.upper()
if jobid_config.subdat != "mrpc":
result_str += ",rep " + str(jobid_config.rep) + " & " + str(
"%.1f" % (val_score * 100)) + " & " + str(test_score)
else:
result_str += ",rep " + str(jobid_config.rep) + " & " + str(
"%.1f" % (val_score[0] * 100)) + "/" + str(
"%.1f" % (val_score[1] * 100)) + " & " + str(test_score)
for hp in ["learning_rate", "warmup_ratio", "per_device_train_batch_size", "hidden_dropout", "attention_dropout",
"weight_decay"]:
if hp not in best_config:
result_str += " & "
else:
if mode == "hpo":
if best_config[hp] > 1.2 * subdat2config[jobid_config.subdat][hp]:
wrap_left = "\\cellcolor{green!85}{"
elif best_config[hp] > subdat2config[jobid_config.subdat][hp]:
wrap_left = "\\cellcolor{green!15}{"
elif best_config[hp] < subdat2config[jobid_config.subdat][hp] / 1.2:
wrap_left = "\\cellcolor{red!85}{"
else:
wrap_left = "\\cellcolor{red!15}{"
wrap_right = "}"
else:
wrap_left = wrap_right = ""
if hp == "per_device_train_batch_size" or hp == "learning_rate":
wrap_left = wrap_right = ""
if hp == "learning_rate":
result_str += " & " + wrap_left + "{:.1e}".format(best_config[hp]) + wrap_right
elif hp == "per_device_train_batch_size":
result_str += " & " + wrap_left + str(best_config[hp]) + wrap_right
else:
result_str += " & " + wrap_left + str("%.3f" % best_config[hp]) + wrap_right
return result_str + "\\\\"
def extract_grid(console_args, jobid_config, overfitting_subdat, test_scores):
from .azure_utils import JobID, AzureUtils
key2printstr = {}
subdat2config = {}
for idx in range(len(overfitting_subdat)):
jobid_config.subdat = overfitting_subdat[idx]
jobid_config.mod = "grid"
jobid_config.rep = 0
azure_utils = AzureUtils(console_args=console_args, jobid=jobid_config)
best_config, val_score = azure_utils.get_best_perf_config(console_args, jobid_config)
best_config["hidden_dropout"] = 0.1
best_config["attention_dropout"] = 0.1
test_score = test_scores[idx]
key2printstr[jobid_config.subdat.upper() + ", grid"] = get_result_str(jobid_config, val_score,
test_score, best_config)
subdat2config[jobid_config.subdat] = best_config
print()
for key, printstr in sorted(key2printstr.items(), key=lambda x: x[0]):
print(printstr)
return subdat2config
def extract_hpo(
console_args,
jobid_config,
overfitting_subdat,
overfitting_alg,
overfitting_pru,
overfitting_rep,
subdat2config,
test_scores):
from .azure_utils import AzureUtils
key2printstr = {}
for idx in range(len(overfitting_subdat)):
jobid_config.subdat = overfitting_subdat[idx]
jobid_config.alg = overfitting_alg[idx]
jobid_config.pru = overfitting_pru[idx]
jobid_config.rep = overfitting_rep[idx]
azure_utils = AzureUtils(console_args=console_args, jobid=jobid_config)
best_config, val_score = azure_utils.get_best_perf_config(console_args, jobid_config)
test_score = test_scores[idx]
key2printstr[jobid_config.subdat.upper() + "," + jobid_config.alg.upper() + ","
+ jobid_config.pru + ",rep " + str(jobid_config.rep)] \
= get_result_str(jobid_config, val_score, test_score, best_config, subdat2config, mode="hpo")
for key, printstr in sorted(key2printstr.items(), key=lambda x: x[0]):
print(printstr)
def extract_roberta_overfitting_configs(console_args):
from .azure_utils import JobID, AzureUtils
jobid_config = JobID()
jobid_config.pre = "roberta"
jobid_config.presz = "base"
overfitting_subdat = ["rte", "mrpc", "cola", "sst2", "stsb"]
test_scores = ["73.1", "91.4/88.5", "61.4", "96", "89.5/88.7"]
subdat2config = extract_grid(console_args, jobid_config, overfitting_subdat, test_scores)
jobid_config = JobID()
jobid_config.pre = "roberta"
jobid_config.presz = "base"
overfitting_subdat = ["rte", "rte", "rte", "mrpc", "mrpc", "mrpc", "sst2",
"rte", "mrpc", "mrpc", "stsb", "sst2", "sst2",
"rte", "rte", "mrpc", "mrpc", "sst2", "sst2"]
overfitting_alg = ["rs", "rs", "rs", "rs", "rs", "rs", "rs",
"rs", "rs", "rs", "rs", "rs", "rs",
"optuna", "optuna", "optuna", "optuna", "optuna", "optuna"]
overfitting_pru = ["None", "None", "None", "None", "None", "None", "None",
"asha", "asha", "asha", "asha", "asha", "asha",
"asha", "asha", "asha", "asha", "asha", "asha"]
overfitting_rep = [0, 1, 2, 0, 1, 2, 0,
1, 0, 2, 2, 1, 2,
1, 2, 0, 1, 1, 2]
test_scores = ["71.5", "72.3", "72.2", "90.5/87.1", "90.5/87.4", "90.5/87.2", "95.6",
"72.4", "90.7/87.4", "91.0/87.9", "89.4/88.8", "95.2", "95.7",
"72.4", "72.4", "90.8/87.4", "90.3/86.5", "95.1", "95.8"]
extract_hpo(console_args, jobid_config, overfitting_subdat, overfitting_alg, overfitting_pru, overfitting_rep,
subdat2config, test_scores)
def extract_electra_overfitting_configs(console_args):
from .azure_utils import JobID, AzureUtils
jobid_config = JobID()
jobid_config.pre = "electra"
jobid_config.presz = "base"
overfitting_subdat = ["rte", "qnli", "cola"]
test_scores = ["74.4", "93.2", "64.8"]
subdat2config = extract_grid(console_args, jobid_config, overfitting_subdat, test_scores)
jobid_config = JobID()
jobid_config.pre = "electra"
jobid_config.presz = "base"
overfitting_subdat = ["rte", "rte", "qnli", "cola", "qnli", "cola"]
overfitting_alg = ["rs", "rs", "rs", "rs", "rs", "optuna"]
overfitting_pru = ["None", "None", "None", "asha", "asha", "asha"]
overfitting_rep = [0, 1, 0, 2, 0, 0]
test_scores = ["73.8", "74.3", "92.8", "64.7", "92.9", "63.6"]
extract_hpo(console_args, jobid_config, overfitting_subdat, overfitting_alg, overfitting_pru, overfitting_rep,
subdat2config, test_scores)

View File

@@ -0,0 +1,71 @@
import os
from ..utils import get_wandb_azure_key
import subprocess
import wandb
import hashlib
from time import time
class WandbUtils:
# Documentation on the wandb setting:
# There are two ways to initialize wandb in tune.run:
# (1) using WandbLoggerCallback, by adding the following argument to tune.run:
# callbacks=[WandbLoggerCallback(
# project="hpo",
# api_key = os.environ["WANDB_API_KEY"],
# group = os.environ["WANDB_RUN_GROUP"],
# log_config=True)]
# (2) using the wandb_mixin decorator (the current implementation)
# The current implementation uses (2) because (1) has the following bug.
# In Ray 1.2, when WandbLoggerCallback is used together with a time limit set via the time_budget_s argument,
# the previous run does not clear its cache after tune.run returns. After the next run has started,
# zombie trials from the previous run remain in memory and never stop.
# This bug can be reproduced by switching to (1), i.e., adding the above callbacks argument
# and removing the wandb_mixin decorator.
# https://docs.ray.io/en/master/tune/tutorials/tune-wandb.html
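# A minimal, illustrative sketch of approach (2) under Ray 1.2; the trainable name train_example,
# the logged metric, and the wandb project/key values are hypothetical, not this module's exact code:
#     from ray import tune
#     from ray.tune.integration.wandb import wandb_mixin
#     import wandb
#
#     @wandb_mixin
#     def train_example(config):
#         loss = config["learning_rate"] * 100  # placeholder computation
#         wandb.log({"loss": loss})
#         tune.report(loss=loss)
#
#     tune.run(train_example,
#              config={"learning_rate": 1e-5,
#                      "wandb": {"project": "hpo", "api_key_file": "wandb_key_file.txt"}})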
def __init__(self,
is_wandb_on=None,
console_args=None,
jobid_config=None):
if is_wandb_on:
wandb_key, azure_key, container_name = get_wandb_azure_key(console_args.key_path)
subprocess.run(["wandb", "login", "--relogin", wandb_key])
os.environ["WANDB_API_KEY"] = wandb_key
os.environ["WANDB_MODE"] = "online"
else:
os.environ["WANDB_MODE"] = "disabled"
self.jobid_config = jobid_config
def set_wandb_per_trial(self):
print("before wandb.init\n\n\n")
if os.environ["WANDB_MODE"] == "online":
os.environ["WANDB_SILENT"] = "false"
return wandb.init(project=self.jobid_config.get_jobid_full_data_name(),
group=self.wandb_group_name,
name=str(WandbUtils._get_next_trial_ids()),
settings=wandb.Settings(
_disable_stats=True),
reinit=False)
else:
return None
@staticmethod
def _get_next_trial_ids():
sha = hashlib.sha1()
sha.update(str(time()).encode('utf-8'))
return "trial_" + sha.hexdigest()[:3]
def set_wandb_per_run(self):
os.environ["WANDB_RUN_GROUP"] = self.jobid_config.to_wandb_string() + wandb.util.generate_id()
self.wandb_group_name = os.environ["WANDB_RUN_GROUP"]
if os.environ["WANDB_MODE"] == "online":
os.environ["WANDB_SILENT"] = "false"
return wandb.init(project=self.jobid_config.get_jobid_full_data_name(),
group=os.environ["WANDB_RUN_GROUP"],
settings=wandb.Settings(
_disable_stats=True),
reinit=False)
else:
return None

155
flaml/nlp/utils.py Normal file
View File

@@ -0,0 +1,155 @@
import argparse
import json
import os
import pathlib
import re
from dataclasses import dataclass, field
def dataset_subdataset_name_format_check(val_str):
regex = re.compile(r"^[^:]*:[^:]*$")
if not regex.match(val_str):
raise argparse.ArgumentTypeError("dataset_subdataset_name must be in the format {data_name}:{subdata_name}")
return val_str
def pretrained_model_size_format_check(val_str):
regex = re.compile(r"^[^:]*:(small|base|large|xlarge)")
if not regex.match(val_str):
raise argparse.ArgumentTypeError("pretrained_model_size must be in the format {model_name}:{model_size},"
"where {model_name} is the name from huggingface.co/models, {model_size}"
"is chosen from small, base, large, xlarge")
return val_str
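# Illustrative checks of the two format validators above (the argument values are examples only):
#     dataset_subdataset_name_format_check("glue:rte")                        # ok, returns "glue:rte"
#     pretrained_model_size_format_check("electra-base-discriminator:base")   # ok
#     pretrained_model_size_format_check("electra-base-discriminator")        # raises ArgumentTypeError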
def load_console_args(**custom_data_args):
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('--server_name', type=str, help='server name', required=False,
choices=["tmdev", "dgx", "azureml"], default="tmdev")
arg_parser.add_argument('--algo_mode', type=str, help='hpo or grid search', required=False,
choices=["grid", "gridbert", "hpo", "hfhpo", "list_s", "list", "bestnn"], default="hpo")
arg_parser.add_argument('--data_root_dir', type=str, help='data dir', required=False, default="data/")
arg_parser.add_argument('--dataset_subdataset_name', type=dataset_subdataset_name_format_check,
help='dataset and subdataset name', required=False, default=None)
arg_parser.add_argument('--space_mode', type=str, help='space mode', required=False,
choices=["gnr", "uni", "uni_test", "cus", "buni"], default="uni")
arg_parser.add_argument('--search_alg_args_mode', type=str, help='search algorithm args mode', required=False,
choices=["dft", "exp", "cus"], default="dft")
arg_parser.add_argument('--algo_name', type=str, help='algorithm', required=False,
choices=["bs", "optuna", "cfo", "rs"], default="bs")
arg_parser.add_argument('--pruner', type=str, help='pruner', required=False,
choices=["asha", "None"], default="None")
arg_parser.add_argument('--pretrained_model_size', type=pretrained_model_size_format_check,
help='pretrained model', required=False, default=None)
arg_parser.add_argument('--sample_num', type=int, help='sample num', required=False, default=None)
arg_parser.add_argument('--time_budget', type=int, help='time budget', required=False, default=None)
arg_parser.add_argument('--time_as_grid', type=int, help='time as grid search', required=False, default=None)
arg_parser.add_argument('--rep_id', type=int, help='rep id', required=False, default=0)
arg_parser.add_argument('--azure_key', type=str, help='azure key', required=False, default=None)
arg_parser.add_argument('--resplit_mode', type=str, help='resplit mode', required=False,
choices=["rspt", "ori"], default="ori")
arg_parser.add_argument('--ds_config', type=str, help='deep speed config file path',
required=False, default=None)
arg_parser.add_argument('--yml_file', type=str, help='yml file path', required=False, default="test.yml")
arg_parser.add_argument('--key_path', type=str, help='path for key.json', required=False, default=None)
arg_parser.add_argument('--root_log_path', type=str, help='root path for log', required=False, default="logs_azure")
arg_parser.add_argument('--round_idx', type=int, help='round idx for acl experiments', required=False, default=0)
arg_parser.add_argument('--seed_data', type=int, help='seed of data shuffling', required=False, default=43)
arg_parser.add_argument('--seed_transformers', type=int, help='seed of transformers', required=False, default=42)
args, unknown = arg_parser.parse_known_args()
for each_key in custom_data_args.keys():
if args.__contains__(each_key):
try:
check_key_format_func = globals()[each_key + "_format_check"]
check_key_format_func(custom_data_args[each_key])
except KeyError:
pass
setattr(args, each_key, custom_data_args[each_key])
return args
def get_wandb_azure_key(key_path):
key_json = json.load(open(os.path.join(key_path, "key.json"), "r"))
wandb_key = key_json["wandb_key"]
azure_key = key_json["azure_key"]
azure_container_name = key_json["container_name"]
return wandb_key, azure_key, azure_container_name
def merge_dicts(dict1, dict2):
for key2 in dict2.keys():
if key2 in dict1:
dict1_vals = set(dict1[key2])
dict2_vals = set(dict2[key2])
dict1[key2] = list(dict1_vals.union(dict2_vals))
else:
dict1[key2] = dict2[key2]
return dict1
def _check_dict_keys_overlaps(dict1: dict, dict2: dict):
dict1_keys = set(dict1.keys())
dict2_keys = set(dict2.keys())
return len(dict1_keys.intersection(dict2_keys)) > 0
def _variable_override_default_alternative(logger, obj_ref, var_name, default_value, all_values, overriding_value=None):
"""
Setting the value of var. If overriding_value is specified, var is set to overriding_value;
If overriding_value is not specified, var is set to default_value meanwhile showing all_values
"""
assert isinstance(all_values, list)
if overriding_value:
setattr(obj_ref, var_name, overriding_value)
logger.warning("The value for {} is specified as {}".format(var_name, overriding_value))
else:
setattr(obj_ref, var_name, default_value)
logger.warning("The value for {} is not specified, setting it to the default value {}. "
"Alternatively, you can set it to {}".format(var_name, default_value, ",".join(all_values)))
@dataclass
class PathUtils:
hpo_ckpt_path: str = field(metadata={"help": "the directory for hpo output"})
hpo_result_path: str = field(metadata={"help": "the directory for hpo result"})
hpo_log_path: str = field(metadata={"help": "the directory for log"})
hpo_config_path: str = field(metadata={"help": "the directory for the hpo config"})
log_dir_per_run: str = field(metadata={"help": "log directory for each run."})
result_dir_per_run: str = field(metadata={"help": "result directory for each run."})
ckpt_dir_per_run: str = field(metadata={"help": "checkpoint directory for each run."})
ckpt_dir_per_trial: str = field(metadata={"help": "checkpoint directory for each trial."})
def __init__(self,
jobid_config,
hpo_data_root_path,
):
self.jobid_config = jobid_config
self.hpo_data_root_path = hpo_data_root_path
self.hpo_ckpt_path = os.path.join(hpo_data_root_path, "checkpoint")
self.hpo_result_path = os.path.join(hpo_data_root_path, "result")
self.hpo_log_path = self.hpo_result_path
@staticmethod
def init_and_make_one_dir(dir_path):
assert dir_path
if not os.path.exists(dir_path):
pathlib.Path(dir_path).mkdir(parents=True, exist_ok=True)
def make_dir_per_run(self):
jobid_str = self.jobid_config.to_jobid_string()
self.ckpt_dir_per_run = os.path.join(self.hpo_ckpt_path, jobid_str)
PathUtils.init_and_make_one_dir(self.ckpt_dir_per_run)
self.result_dir_per_run = os.path.join(self.hpo_result_path, jobid_str)
PathUtils.init_and_make_one_dir(self.result_dir_per_run)
self.log_dir_per_run = os.path.join(self.hpo_log_path, jobid_str)
PathUtils.init_and_make_one_dir(self.log_dir_per_run)
def make_dir_per_trial(self, trial_id):
jobid_str = self.jobid_config.to_jobid_string()
ckpt_dir_per_run = os.path.join(self.hpo_ckpt_path, jobid_str)
self.ckpt_dir_per_trial = os.path.join(ckpt_dir_per_run, jobid_str, trial_id)
PathUtils.init_and_make_one_dir(self.ckpt_dir_per_trial)

View File

@@ -0,0 +1,43 @@
{
"cells": [
{
"cell_type": "markdown",
"source": [
"1. Electra Example"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

View File

@@ -51,6 +51,11 @@ setuptools.setup(
"optuna==2.3.0",
"vowpalwabbit",
"openml",
"transformers==4.4.1",
"wandb==0.10.26",
"torch==1.8.1",
"datasets==1.4.1",
"azure-storage-blob"
],
"blendsearch": [
"optuna==2.3.0"

75
test/hf/run_analysis.py Normal file
View File

@@ -0,0 +1,75 @@
'''Require: pip install torch transformers datasets wandb flaml[blendsearch,ray]
'''
import argparse
from flaml.nlp.result_analysis.azure_utils import JobID
def create_partial_config_bestnn():
jobid_config = JobID()
# funnel xlarge
# jobid_config.mod = "bestnn"
jobid_config.spa = "uni"
# jobid_config.arg = "cus"
# jobid_config.alg = "cfo"
jobid_config.pre = "funnel"
jobid_config.presz = "xlarge"
# funnel small
# jobid_config.mod = "list"
# jobid_config.pre = "funnel"
# jobid_config.presz = "small"
# jobid_config.rep = 0
# # deberta large
# jobid_config.mod = "bestnn"
# jobid_config.spa = "uni"
# jobid_config.arg = "cus"
# jobid_config.alg = "cfo"
# jobid_config.pre = "deberta"
# jobid_config.presz = "large"
# # deberta base
# jobid_config.mod = "hpo"
# jobid_config.pre = "deberta"
# jobid_config.presz = "base"
# jobid_config.rep = 0
# # deberta large
# jobid_config.mod = "hpo"
# jobid_config.pre = "deberta"
# jobid_config.presz = "large"
return jobid_config
def create_partial_config_list():
jobid_config = JobID()
jobid_config.mod = "list"
jobid_config.spa = "uni"
jobid_config.presz = "xlarge"
return jobid_config
def create_partial_config_hpo():
jobid_config = JobID()
jobid_config.mod = "hpo"
jobid_config.spa = "uni"
return jobid_config
if __name__ == "__main__":
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('--key_path', type=str, help='key path', required=False, default="../../")
arg_parser.add_argument('--azure_root_log_path', type=str,
help='root log path of blob storage', required=True, default="logs_azure/")
args = arg_parser.parse_args()
partial_config_large = create_partial_config_bestnn()
from flaml.nlp.result_analysis.generate_result_summary import compare_small_vs_large, get_result, check_conflict, \
print_cfo, download_validation, extract_roberta_overfitting_configs, extract_electra_overfitting_configs
# get_result(args, partial_config_large)
# check_conflict(args, [partial_config_large])
download_validation(args, "/data/xliu127/projects/hyperopt/data/result/")
# extract_roberta_overfitting_configs(args)

285
test/hf/run_autohf.py Normal file
View File

@@ -0,0 +1,285 @@
'''Require: pip install torch transformers datasets wandb flaml[blendsearch,ray]
'''
import os
import shutil
from flaml.nlp import AutoTransformers
from flaml.nlp import AzureUtils, JobID
from flaml.nlp.utils import load_console_args
global azure_log_path
global azure_key
def get_resplit_portion(jobid_config):
if jobid_config.dat == ["glue"] and jobid_config.subdat in {"mnli"}:
return {"source": ["train", "validation_matched"], "train": [0, 0.8], "validation": [0.8, 0.9],
"test": [0.9, 1.0]}
else:
return {"source": ["train", "validation"], "train": [0, 0.8], "validation": [0.8, 0.9], "test": [0.9, 1.0]}
def get_preparedata_setting(args, jobid_config):
preparedata_setting = {
"server_name": args.server_name,
"data_root_path": args.data_root_dir,
"max_seq_length": 128,
"jobid_config": jobid_config,
"is_wandb_on": True
}
if jobid_config.spt == 'rspt':
preparedata_setting["resplit_portion"] = get_resplit_portion(jobid_config)
if ("albert" == jobid_config.pre and jobid_config.dat == ["squad"]) or \
("funnel" in jobid_config.pre and jobid_config.dat[0] in {"imdb", "yelp_review_full", "yelp_polarity",
"amazon_polarity", "amazon_review_multi"}):
preparedata_setting["max_seq_length"] = 512
if jobid_config.dat[0] == "glue" and jobid_config.subdat == "mnli":
preparedata_setting["fold_name"] = ['train', 'validation_matched', 'test_matched']
return preparedata_setting
def get_autohf_settings(args, **custom_args):
autohf_settings = {"resources_per_trial": {"gpu": 1, "cpu": 1},
"num_samples": args.sample_num,
"time_budget": args.time_budget,
"ckpt_per_epoch": 1,
}
for other_attr in ["ds_config", "rep_id"]:
if hasattr(args, other_attr):
autohf_settings[other_attr] = getattr(args, other_attr)
else:
autohf_settings[other_attr] = None
if len(custom_args) > 0:
autohf_settings.update(custom_args)
return autohf_settings
def rm_home_result():
from os.path import expanduser
home = expanduser("~")
if os.path.exists(home + "/ray_results/"):
shutil.rmtree(home + "/ray_results/")
def get_best_base_config(args, jobid_config, autohf):
import copy
import re
args_small = copy.deepcopy(args)
args_small.algo_name = "optuna"
args_small.search_alg_args_mode = "dft"
args_small.algo_mode = "hpo"
args_small.space_mode = "uni"
args_small.pruner = "None"
if "funnel" not in args_small.pretrained_model_size:
args_small.algo_mode = "hpo"
else:
args_small.algo_mode = "list"
args_small.sample_num = 10000
args_small.time_budget = 3600
args_small.rep_id = 0
jobid_config_small = JobID(args_small)
if jobid_config_small.pre == "deberta":
jobid_config_small.presz = "base"
else:
jobid_config_small.presz = "small"
jobid_config_small.pre_full = re.sub("(xlarge|large|intermediate)", jobid_config_small.presz,
jobid_config_small.pre_full)
azure_utils_small = AzureUtils(
console_args=args_small,
jobid=jobid_config_small,
autohf=autohf)
preparedata_setting = get_preparedata_setting(args, jobid_config)
autohf.prepare_data(**preparedata_setting)
autohf.set_metric()
best_config = azure_utils_small.get_ranked_configs(autohf.metric_mode_name)[0]
return best_config
def search_base_and_search_lower_lr(args, jobid_config, autohf):
best_config = get_best_base_config(args, jobid_config, autohf)
import copy
args_large = copy.deepcopy(args)
args_large.time_budget = args.time_budget - 3600
args_large.sample_num = 100000
args_large.algo_name = args.algo_name
args_large.search_alg_args_mode = "cus"
args_large.space_mode = "buni"
args_large.pruner = "None"
jobid_config_large = JobID(args_large)
jobid_config_large.presz = jobid_config.presz
jobid_config_large.pre_full = jobid_config.pre_full
azure_utils_large = AzureUtils(console_args=args_large, jobid=jobid_config_large, autohf=autohf)
_test_hpo(args_large,
jobid_config_large,
autohf,
azure_utils_large,
autohf_settings=get_autohf_settings(args_large, **{"points_to_evaluate": [best_config],
"bound": {"learning_rate": {
"u": best_config["learning_rate"]}}}))
def search_base_and_search_around_best(args, jobid_config, autohf):
args.algo_name = "bs"
args.search_alg_args_mode = "dft"
args.spa = "uni"
args.pru = "None"
best_config = get_best_base_config(args, jobid_config, autohf)
import copy
args_large = copy.deepcopy(args)
args_large.time_budget = args.time_budget - 3600
args_large.sample_num = 100000
args_large.algo_name = "cfo"
args_large.search_alg_args_mode = "cus"
args_large.space_mode = "uni"
jobid_config_large = JobID(args_large)
jobid_config_large.presz = jobid_config.presz
jobid_config_large.pre_full = jobid_config.pre_full
azure_utils_large = AzureUtils(console_args=args_large, jobid=jobid_config_large, autohf=autohf)
_test_hpo(args_large,
jobid_config_large,
autohf,
azure_utils_large,
autohf_settings=get_autohf_settings(args_large, **{"points_to_evaluate": [best_config]}))
def evaluate_configs(autohf, args, ranked_all_configs):
import copy
this_args = copy.deepcopy(args)
this_args.time_budget = 100000
this_args.sample_num = int(len(ranked_all_configs))
this_args.search_alg_args_mode = "cus"
jobid_config = JobID(this_args)
azure_utils_large = AzureUtils(console_args=this_args, jobid=jobid_config, autohf=autohf)
_test_hpo(this_args,
jobid_config,
autohf,
azure_utils_large,
autohf_settings=get_autohf_settings(this_args, **{"points_to_evaluate": ranked_all_configs}))
def convert_config_to_different_size(origin_config, mode):
import re
import copy
if mode == "small":
new_config = copy.deepcopy(origin_config)
if new_config.pre == "funnel":
new_config.mod = "list"
else:
new_config.mod = "hpo"
if new_config.pre == "funnel":
new_config.presz = "small"
else:
new_config.presz = "base"
new_config.pre_full = re.sub("(xlarge|large|intermediate)", new_config.presz, origin_config.pre_full)
elif mode == "large":
new_config = copy.deepcopy(origin_config)
new_config.mod = "hpo"
if new_config.pre == "funnel":
new_config.presz = "xlarge"
new_config.pre_full = re.sub("(small)", "xlarge", origin_config.pre_full)
else:
new_config.presz = "large"
new_config.pre_full = re.sub("(small)", "large", origin_config.pre_full)
return new_config
def evaluate_small_best_configs_on_large(large_args, autohf):
jobid_config_small = convert_config_to_different_size(JobID(large_args), mode="small")
jobid_config_small.rep = 0
azure_utils_small = AzureUtils(console_args=None, jobid=jobid_config_small, autohf=autohf)
ranked_all_small_configs = azure_utils_small.get_ranked_configs(autohf.metric_mode_name)
evaluate_configs(autohf, large_args, ranked_all_small_configs[:int(len(ranked_all_small_configs) / 2)])
def add_dict_item_to_list(this_list, this_dict):
is_exist = len([x for x in this_list if x == this_dict]) > 0
if not is_exist:
this_list.append(this_dict)
return this_list
def evaluate_large_best_configs_on_small(small_args, autohf):
jobid_config_large = convert_config_to_different_size(JobID(small_args), mode="large")
autohf.jobid_config = jobid_config_large
autohf.set_metric()
all_configs_from_large = []
for rep_id in range(3):
jobid_config_large.rep = rep_id
azure_utils_large = AzureUtils(console_args=small_args, jobid=jobid_config_large, autohf=autohf)
ranked_all_large_configs = azure_utils_large.get_ranked_configs(autohf.metric_mode_name)
for each_config in ranked_all_large_configs:
all_configs_from_large = add_dict_item_to_list(all_configs_from_large, each_config)
jobid_config_small = convert_config_to_different_size(JobID(small_args), mode="small")
jobid_config_small.rep = 0
azure_utils_small = AzureUtils(console_args=small_args, jobid=jobid_config_small, autohf=autohf)
ranked_all_small_configs = azure_utils_small.get_ranked_configs(autohf.metric_mode_name)
for each_config in ranked_all_small_configs:
all_configs_from_large = add_dict_item_to_list(all_configs_from_large, each_config)
evaluate_configs(autohf, small_args, list(all_configs_from_large))
def _test_hpo(args,
jobid_config,
autohf,
azure_utils=None,
autohf_settings=None,
):
try:
if not azure_utils:
azure_utils = AzureUtils(console_args=args, jobid=jobid_config, autohf=autohf)
preparedata_setting = get_preparedata_setting(args, jobid_config)
autohf.prepare_data(**preparedata_setting)
analysis = validation_metric = test_metric = None
if not autohf_settings:
autohf_settings = get_autohf_settings(args)
if args.algo_mode != "hfhpo":
validation_metric, analysis = autohf.fit(**autohf_settings, )
else:
autohf.fit_hf(**autohf_settings)
if jobid_config.spt == "ori":
predictions, test_metric = autohf.predict()
if validation_metric:
test_metric.update({"validation": validation_metric})
else:
predictions = None
if test_metric:
validation_metric.update({"test": test_metric})
if analysis is not None:
json_log = azure_utils.extract_log_from_analysis(analysis)
else:
json_log = None
azure_utils.write_autohf_output(json_log=json_log,
valid_metric=validation_metric,
predictions=predictions,
duration=autohf.last_run_duration)
except AssertionError:
azure_utils.write_exception()
rm_home_result()
if __name__ == "__main__":
autohf = AutoTransformers()
args = load_console_args()
jobid_config = JobID(args)
if args.algo_mode in ("hpo", "hfhpo", "grid", "gridbert"):
_test_hpo(args, jobid_config, autohf)
elif args.algo_mode == "bestnn":
search_base_and_search_lower_lr(args, jobid_config, autohf)
elif args.algo_mode == "list":
evaluate_small_best_configs_on_large(args, autohf)
elif args.algo_mode == "list_s":
evaluate_large_best_configs_on_small(args, autohf)

View File

@@ -0,0 +1,62 @@
'''Require: pip install torch transformers datasets wandb flaml[blendsearch,ray]
'''
global azure_log_path
global azure_key
def get_preparedata_setting(jobid_config):
preparedata_setting = {
"server_name": "tmdev",
"data_root_path": "data/",
"max_seq_length": 128,
"jobid_config": jobid_config,
"resplit_portion": {"source": ["train", "validation"],
"train": [0, 0.8],
"validation": [0.8, 0.9],
"test": [0.9, 1.0]}
}
return preparedata_setting
def get_autohf_settings():
autohf_settings = {"resources_per_trial": {"cpu": 1},
"num_samples": 1,
"time_budget": 100000,
"ckpt_per_epoch": 1,
"fp16": False,
}
return autohf_settings
def test_hpo():
try:
import ray
except ImportError:
return
from flaml.nlp import AutoTransformers
from flaml.nlp import JobID
jobid_config = JobID()
jobid_config.set_unittest_config()
autohf = AutoTransformers()
try:
preparedata_setting = get_preparedata_setting(jobid_config)
autohf.prepare_data(**preparedata_setting)
autohf_settings = get_autohf_settings()
validation_metric, analysis = autohf.fit(**autohf_settings, )
predictions, test_metric = autohf.predict()
if test_metric:
validation_metric.update({"test": test_metric})
except AssertionError:
pass
if __name__ == "__main__":
test_hpo()