diff --git a/flaml/automl.py b/flaml/automl.py index 91eaf2215..d6f23ce1a 100644 --- a/flaml/automl.py +++ b/flaml/automl.py @@ -1546,11 +1546,12 @@ class AutoML(BaseEstimator): return points @property - def prune_attr(self) -> Optional[str]: - """Attribute for pruning + def resource_attr(self) -> Optional[str]: + """Attribute of the resource dimension. Returns: - A string for the sample size attribute or None + A string for the sample size attribute + (the resource attribute in AutoML) or None. """ return "FLAML_sample_size" if self._sample else None @@ -2178,7 +2179,7 @@ class AutoML(BaseEstimator): low_cost_partial_config=self.low_cost_partial_config, points_to_evaluate=self.points_to_evaluate, cat_hp_cost=self.cat_hp_cost, - prune_attr=self.prune_attr, + resource_attr=self.resource_attr, min_resource=self.min_resource, max_resource=self.max_resource, config_constraints=[ @@ -2326,11 +2327,11 @@ class AutoML(BaseEstimator): ) search_space = search_state.search_space if self._sample: - prune_attr = "FLAML_sample_size" + resource_attr = "FLAML_sample_size" min_resource = self._min_sample_size max_resource = self._state.data_size[0] else: - prune_attr = min_resource = max_resource = None + resource_attr = min_resource = max_resource = None learner_class = self._state.learner_classes.get(estimator) if "grid" == self._hpo_method: # for synthetic exp only points_to_evaluate = [] @@ -2362,7 +2363,7 @@ class AutoML(BaseEstimator): points_to_evaluate=points_to_evaluate, low_cost_partial_config=low_cost_partial_config, cat_hp_cost=search_state.cat_hp_cost, - prune_attr=prune_attr, + resource_attr=resource_attr, min_resource=min_resource, max_resource=max_resource, config_constraints=[ diff --git a/flaml/searcher/blendsearch.py b/flaml/searcher/blendsearch.py index 147d1c055..9de56c203 100644 --- a/flaml/searcher/blendsearch.py +++ b/flaml/searcher/blendsearch.py @@ -45,7 +45,7 @@ class BlendSearch(Searcher): evaluated_rewards: Optional[List] = None, time_budget_s: 
Union[int, float] = None, num_samples: Optional[int] = None, - prune_attr: Optional[str] = None, + resource_attr: Optional[str] = None, min_resource: Optional[float] = None, max_resource: Optional[float] = None, reduction_factor: Optional[float] = None, @@ -91,17 +91,10 @@ class BlendSearch(Searcher): points_to_evaluate. time_budget_s: int or float | Time budget in seconds. num_samples: int | The number of configs to try. - prune_attr: A string of the attribute used for pruning. - Not necessarily in space. - When prune_attr is in space, it is a hyperparameter, e.g., - 'n_iters', and the best value is unknown. - When prune_attr is not in space, it is a resource dimension, - e.g., 'sample_size', and the peak performance is assumed - to be at the max_resource. - min_resource: A float of the minimal resource to use for the - prune_attr; only valid if prune_attr is not in space. - max_resource: A float of the maximal resource to use for the - prune_attr; only valid if prune_attr is not in space. + resource_attr: A string to specify the resource dimension and the best + performance is assumed to be at the max_resource. + min_resource: A float of the minimal resource to use for the resource_attr. + max_resource: A float of the maximal resource to use for the resource_attr. reduction_factor: A float of the reduction factor used for incremental pruning. 
global_search_alg: A Searcher instance as the global search @@ -160,7 +153,7 @@ class BlendSearch(Searcher): metric, mode, space, - prune_attr, + resource_attr, min_resource, max_resource, reduction_factor, @@ -409,7 +402,7 @@ class BlendSearch(Searcher): if (objective - self._metric_target) * self._ls.metric_op < 0: self._metric_target = objective if self._ls.resource: - self._best_resource = config[self._ls.prune_attr] + self._best_resource = config[self._ls.resource_attr] if thread_id: if not self._metric_constraint_satisfied: # no point has been found to satisfy metric constraint @@ -637,7 +630,7 @@ class BlendSearch(Searcher): # return None config = self._search_thread_pool[choice].suggest(trial_id) if not choice and config is not None and self._ls.resource: - config[self._ls.prune_attr] = self.best_resource + config[self._ls.resource_attr] = self.best_resource elif choice and config is None: # local search thread finishes if self._search_thread_pool[choice].converged: @@ -975,7 +968,7 @@ class BlendSearchTuner(BlendSearch, NNITuner): self._ls.metric, self._mode, config, - self._ls.prune_attr, + self._ls.resource_attr, self._ls.min_resource, self._ls.max_resource, self._ls.resource_multiple_factor, diff --git a/flaml/searcher/flow2.py b/flaml/searcher/flow2.py index b881dadc4..b1e69c9b8 100644 --- a/flaml/searcher/flow2.py +++ b/flaml/searcher/flow2.py @@ -39,7 +39,7 @@ class FLOW2(Searcher): metric: Optional[str] = None, mode: Optional[str] = None, space: Optional[dict] = None, - prune_attr: Optional[str] = None, + resource_attr: Optional[str] = None, min_resource: Optional[float] = None, max_resource: Optional[float] = None, resource_multiple_factor: Optional[float] = 4, @@ -67,17 +67,10 @@ class FLOW2(Searcher): i.e., the relative cost of the three choices of 'tree_method' is 1, 1 and 2 respectively. space: A dictionary to specify the search space. - prune_attr: A string of the attribute used for pruning. - Not necessarily in space. 
- When prune_attr is in space, it is a hyperparameter, e.g., - 'n_iters', and the best value is unknown. - When prune_attr is not in space, it is a resource dimension, - e.g., 'sample_size', and the peak performance is assumed - to be at the max_resource. - min_resource: A float of the minimal resource to use for the - prune_attr; only valid if prune_attr is not in space. - max_resource: A float of the maximal resource to use for the - prune_attr; only valid if prune_attr is not in space. + resource_attr: A string to specify the resource dimension and the best + performance is assumed to be at the max_resource. + min_resource: A float of the minimal resource to use for the resource_attr. + max_resource: A float of the maximal resource to use for the resource_attr. resource_multiple_factor: A float of the multiplicative factor used for increasing resource. cost_attr: A string of the attribute used for cost. @@ -100,7 +93,7 @@ class FLOW2(Searcher): self.seed = seed self.init_config = init_config self.best_config = flatten_dict(init_config) - self.prune_attr = prune_attr + self.resource_attr = resource_attr self.min_resource = min_resource self.resource_multiple_factor = resource_multiple_factor or 4 self.cost_attr = cost_attr @@ -148,11 +141,15 @@ class FLOW2(Searcher): if not hier: self._space_keys = sorted(self._tunable_keys) self.hierarchical = hier - if self.prune_attr and self.prune_attr not in self._space and self.max_resource: + if ( + self.resource_attr + and self.resource_attr not in self._space + and self.max_resource + ): self.min_resource = self.min_resource or self._min_resource() self._resource = self._round(self.min_resource) if not hier: - self._space_keys.append(self.prune_attr) + self._space_keys.append(self.resource_attr) else: self._resource = None self.incumbent = {} @@ -252,7 +249,7 @@ class FLOW2(Searcher): if partial_config == self.init_config: self._reset_times += 1 if self._resource: - config[self.prune_attr] = self.min_resource + 
config[self.resource_attr] = self.min_resource return config, space def create( @@ -264,7 +261,7 @@ class FLOW2(Searcher): self.metric, self.mode, space, - self.prune_attr, + self.resource_attr, self.min_resource, self.max_resource, self.resource_multiple_factor, @@ -328,7 +325,7 @@ class FLOW2(Searcher): self.incumbent = self.normalize(self.best_config) self.cost_incumbent = result.get(self.cost_attr) if self._resource: - self._resource = self.best_config[self.prune_attr] + self._resource = self.best_config[self.resource_attr] self._num_complete4incumbent = 0 self._cost_complete4incumbent = 0 self._num_proposedby_incumbent = 0 @@ -377,7 +374,7 @@ class FLOW2(Searcher): if self.best_config != config: self.best_config = config if self._resource: - self._resource = config[self.prune_attr] + self._resource = config[self.resource_attr] self.incumbent = self.normalize(self.best_config) self.cost_incumbent = result.get(self.cost_attr) self._cost_complete4incumbent = 0 @@ -495,18 +492,18 @@ class FLOW2(Searcher): self._resource = self._round(self._resource * self.resource_multiple_factor) self.cost_incumbent *= self._resource / old_resource config = self.best_config.copy() - config[self.prune_attr] = self._resource + config[self.resource_attr] = self._resource self._direction_tried = None self._configs[trial_id] = (config, self.step) return unflatten_dict(config) def _project(self, config): - """project normalized config in the feasible region and set prune_attr""" + """project normalized config in the feasible region and set resource_attr""" for key in self._bounded_keys: value = config[key] config[key] = max(0, min(1, value)) if self._resource: - config[self.prune_attr] = self._resource + config[self.resource_attr] = self._resource @property def can_suggest(self) -> bool: @@ -525,7 +522,7 @@ class FLOW2(Searcher): keys = sorted(config.keys()) if self.hierarchical else self._space_keys for key in keys: value = config[key] - if key == self.prune_attr: + if key == 
self.resource_attr: value_list.append(value) else: # key must be in space @@ -556,7 +553,7 @@ class FLOW2(Searcher): """whether the incumbent can reach the incumbent of other.""" config1, config2 = self.best_config, other.best_config incumbent1, incumbent2 = self.incumbent, other.incumbent - if self._resource and config1[self.prune_attr] > config2[self.prune_attr]: + if self._resource and config1[self.resource_attr] > config2[self.resource_attr]: # resource will not decrease return False for key in self._unordered_cat_hp: diff --git a/flaml/tune/space.py b/flaml/tune/space.py index 91fe08868..d050eed1a 100644 --- a/flaml/tune/space.py +++ b/flaml/tune/space.py @@ -247,7 +247,7 @@ def normalize( config_norm = {} for key, value in config.items(): domain = space.get(key) - if domain is None: # e.g., prune_attr + if domain is None: # e.g., resource_attr config_norm[key] = value continue if not callable(getattr(domain, "get_sampler", None)): @@ -405,7 +405,7 @@ def denormalize( # Handle int (4.6 -> 5) if isinstance(domain, sample.Integer): config_denorm[key] = int(round(config_denorm[key])) - else: # prune_attr + else: # resource_attr config_denorm[key] = value return config_denorm diff --git a/flaml/tune/tune.py b/flaml/tune/tune.py index c98c3bc6b..b7726453a 100644 --- a/flaml/tune/tune.py +++ b/flaml/tune/tune.py @@ -17,6 +17,7 @@ try: except (ImportError, AssertionError): ray_import = False from .analysis import ExperimentAnalysis as EA + from .result import DEFAULT_METRIC import logging @@ -117,11 +118,11 @@ def run( time_budget_s: Union[int, float] = None, points_to_evaluate: Optional[List[dict]] = None, evaluated_rewards: Optional[List] = None, - prune_attr: Optional[str] = None, + resource_attr: Optional[str] = None, min_resource: Optional[float] = None, max_resource: Optional[float] = None, reduction_factor: Optional[float] = None, - report_intermediate_result: Optional[bool] = False, + scheduler: Optional = None, search_alg=None, verbose: Optional[int] = 2, 
local_dir: Optional[str] = None, @@ -205,21 +206,29 @@ def run( points_to_evaluate are 3.0 and 1.0 respectively and want to inform run() - prune_attr: A string of the attribute used for pruning. - Not necessarily in space. - When prune_attr is in space, it is a hyperparameter, e.g., - 'n_iters', and the best value is unknown. - When prune_attr is not in space, it is a resource dimension, - e.g., 'sample_size', and the peak performance is assumed - to be at the max_resource. - min_resource: A float of the minimal resource to use for the - prune_attr; only valid if prune_attr is not in space. - max_resource: A float of the maximal resource to use for the - prune_attr; only valid if prune_attr is not in space. + resource_attr: A string to specify the resource dimension used by + the scheduler via "scheduler". + min_resource: A float of the minimal resource to use for the resource_attr. + max_resource: A float of the maximal resource to use for the resource_attr. reduction_factor: A float of the reduction factor used for incremental pruning. - report_intermediate_result: A boolean of whether intermediate results - are reported. If so, early stopping and pruning can be used. + scheduler: A scheduler for executing the experiment. Can be None, 'flaml', + 'asha' or a custom instance of the TrialScheduler class. Default is None: + in this case when resource_attr is provided, the 'flaml' scheduler will be + used, otherwise no scheduler will be used. When set 'flaml', an + authentic scheduler implemented in FLAML will be used. It does not + require users to report intermediate results in training_function. + Find more details about this scheduler in this paper + https://arxiv.org/pdf/1911.04706.pdf. + When set 'asha', the input for arguments "resource_attr", + "min_resource", "max_resource" and "reduction_factor" will be passed + to ASHA's "time_attr", "max_t", "grace_period" and "reduction_factor" + respectively. 
You can also provide a self-defined scheduler instance + of the TrialScheduler class. When 'asha' or self-defined scheduler is + used, you usually need to report intermediate results in the training + function. Please find examples using different types of schedulers + and how to set up the corresponding training functions in + test/tune/test_scheduler.py. TODO: point to notebook examples. search_alg: An instance of BlendSearch as the search algorithm to be used. The same instance can be used for iterative tuning. e.g., @@ -295,6 +304,20 @@ def run( from ..searcher.blendsearch import BlendSearch if search_alg is None: + flaml_scheduler_resource_attr = ( + flaml_scheduler_min_resource + ) = flaml_scheduler_max_resource = flaml_scheduler_reduction_factor = None + if scheduler in (None, "flaml"): + + # when scheduler is set 'flaml', we will use a scheduler that is + # authentic to the search algorithms in flaml. After setting up + # the search algorithm accordingly, we need to set scheduler to + # None in case it is later used in the trial runner. 
+ flaml_scheduler_resource_attr = resource_attr + flaml_scheduler_min_resource = min_resource + flaml_scheduler_max_resource = max_resource + flaml_scheduler_reduction_factor = reduction_factor + scheduler = None search_alg = BlendSearch( metric=metric or DEFAULT_METRIC, mode=mode, @@ -305,10 +328,10 @@ def run( cat_hp_cost=cat_hp_cost, time_budget_s=time_budget_s, num_samples=num_samples, - prune_attr=prune_attr, - min_resource=min_resource, - max_resource=max_resource, - reduction_factor=reduction_factor, + resource_attr=flaml_scheduler_resource_attr, + min_resource=flaml_scheduler_min_resource, + max_resource=flaml_scheduler_max_resource, + reduction_factor=flaml_scheduler_reduction_factor, config_constraints=config_constraints, metric_constraints=metric_constraints, ) @@ -334,12 +357,11 @@ def run( searcher.set_search_properties(metric, mode, config, setting) else: searcher.set_search_properties(metric, mode, config) - scheduler = None - if report_intermediate_result: + if scheduler == "asha": params = {} - # scheduler resource_dimension=prune_attr - if prune_attr: - params["time_attr"] = prune_attr + # scheduler resource_dimension=resource_attr + if resource_attr: + params["time_attr"] = resource_attr if max_resource: params["max_t"] = max_resource if min_resource: diff --git a/flaml/version.py b/flaml/version.py index deded3247..3e2f46a3a 100644 --- a/flaml/version.py +++ b/flaml/version.py @@ -1 +1 @@ -__version__ = "0.8.2" +__version__ = "0.9.0" diff --git a/notebook/flaml_finetune_transformer.ipynb b/notebook/flaml_finetune_transformer.ipynb index 01718b9cc..beae8aed3 100644 --- a/notebook/flaml_finetune_transformer.ipynb +++ b/notebook/flaml_finetune_transformer.ipynb @@ -29,7 +29,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -38,7 +38,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -47,7 +47,7 @@ }, { 
"cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -56,18 +56,18 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "{'input_ids': [101, 2023, 2003, 1037, 3231, 102], 'attention_mask': [1, 1, 1, 1, 1, 1]}" ] }, + "execution_count": 4, "metadata": {}, - "execution_count": 5 + "output_type": "execute_result" } ], "source": [ @@ -83,7 +83,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -92,7 +92,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -101,14 +101,14 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stderr", + "output_type": "stream", "text": [ - "Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n" + "Reusing dataset glue (/home/ec2-user/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)\n" ] } ], @@ -118,7 +118,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -130,62 +130,68 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [ { - "output_type": "display_data", "data": { - "text/plain": "HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))", "application/vnd.jupyter.widget-view+json": { + "model_id": "0dcf9ca8ce024a2b832606a6a3219b17", "version_major": 2, - "version_minor": 0, - "model_id": "ecc66e6795f848e0a41e6cf1ce37bdf2" - } + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))" + ] 
}, - "metadata": {} + "metadata": {}, + "output_type": "display_data" }, { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "\n" ] }, { - "output_type": "display_data", "data": { - "text/plain": "HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))", "application/vnd.jupyter.widget-view+json": { + "model_id": "c58845729f0a4261830ad679891e7c77", "version_major": 2, - "version_minor": 0, - "model_id": "2d33fc70b80b403080ad8c0e77ed1891" - } + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))" + ] }, - "metadata": {} + "metadata": {}, + "output_type": "display_data" }, { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "\n" ] }, { - "output_type": "display_data", "data": { - "text/plain": "HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))", "application/vnd.jupyter.widget-view+json": { + "model_id": "9716d177a40748008cc6089e3d52a1d5", "version_major": 2, - "version_minor": 0, - "model_id": "d2ab3feb1a354187abb2dded0ead404f" - } + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))" + ] }, - "metadata": {} + "metadata": {}, + "output_type": "display_data" }, { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "\n" ] @@ -197,11 +203,10 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],\n", @@ -229,8 +234,9 @@ " 'sentence': \"Our friends won't buy this analysis, let alone the next one we propose.\"}" ] }, + "execution_count": 10, "metadata": {}, - "execution_count": 11 + "output_type": "execute_result" } ], "source": [ @@ -246,7 +252,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": {}, 
"outputs": [], "source": [ @@ -255,12 +261,12 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stderr", + "output_type": "stream", "text": [ "Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n", "- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", @@ -277,11 +283,10 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "DistilBertForSequenceClassification(\n", @@ -399,8 +404,9 @@ ")" ] }, + "execution_count": 13, "metadata": {}, - "execution_count": 14 + "output_type": "execute_result" } ], "source": [ @@ -416,7 +422,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -425,11 +431,10 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "Metric(name: \"glue\", features: {'predictions': Value(dtype='int64', id=None), 'references': Value(dtype='int64', id=None)}, usage: \"\"\"\n", @@ -477,8 +482,9 @@ "\"\"\", stored examples: 0)" ] }, + "execution_count": 15, "metadata": {}, - "execution_count": 16 + "output_type": "execute_result" } ], "source": [ @@ -487,7 +493,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -507,7 +513,7 @@ }, { "cell_type": "code", - 
"execution_count": 18, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -517,7 +523,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -529,7 +535,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -545,40 +551,70 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 20, "metadata": {}, "outputs": [ { + "name": "stdout", "output_type": "stream", - "name": "stderr", "text": [ - "/home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n warnings.warn('Was asked to gather along dimension 0, but all '\n" + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] }, { - "output_type": "display_data", - "data": { - "text/plain": "", - "text/html": "\n
\n \n \n \n [ 2/804 : < :, Epoch 0.00/3]\n
\n \n \n \n \n \n \n \n \n \n
StepTraining Loss

" - }, - "metadata": {} - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "/home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n warnings.warn('Was asked to gather along dimension 0, but all '\n" - ] - }, - { - "output_type": "execute_result", "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " \n", + " [1591/3207 1:03:06 < 1:04:11, 0.42 it/s, Epoch 1.49/3]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
5000.571000
10000.515400
15000.356100

" + ], "text/plain": [ - "TrainOutput(global_step=804, training_loss=0.3209413462017306, metrics={'train_runtime': 115.5328, 'train_samples_per_second': 6.959, 'total_flos': 238363718990580.0, 'epoch': 3.0, 'init_mem_cpu_alloc_delta': 2336600064, 'init_mem_gpu_alloc_delta': 268953088, 'init_mem_cpu_peaked_delta': 257929216, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 2381066240, 'train_mem_gpu_alloc_delta': 806788096, 'train_mem_cpu_peaked_delta': 186974208, 'train_mem_gpu_peaked_delta': 550790144})" + "" ] }, "metadata": {}, - "execution_count": 21 + "output_type": "display_data" } ], "source": [ @@ -586,6 +622,8 @@ ] }, { + "cell_type": "markdown", + "metadata": {}, "source": [ "## Hyperparameter Optimization\n", "\n", @@ -595,13 +633,11 @@ "### Step 1. Define training method\n", "\n", "We define a function `train_distilbert(config: dict)` that accepts a hyperparameter configuration dict `config`. The specific configs will be generated by flaml's search algorithm in a given search space.\n" - ], - "cell_type": "markdown", - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -656,6 +692,8 @@ ] }, { + "cell_type": "markdown", + "metadata": {}, "source": [ "### Step 2. Define the search\n", "\n", @@ -664,13 +702,11 @@ "- The `search_space` for our hyperparameters\n", "- The metric and the mode ('max' or 'min') for optimization\n", "- The constraints (`n_cpus`, `n_gpus`, `num_samples`, and `time_budget_s`)" - ], - "cell_type": "markdown", - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -687,7 +723,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -704,809 +740,141 @@ ] }, { + "cell_type": "markdown", + "metadata": {}, "source": [ "### Step 3. 
Launch with `flaml.tune.run`\n", "\n", "We are now ready to launch the tuning using `flaml.tune.run`:" - ], - "cell_type": "markdown", - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "metadata": {}, "outputs": [ { + "name": "stdout", "output_type": "stream", + "text": [ + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" + ] + }, + { "name": "stderr", + "output_type": "stream", "text": [ - "2021-05-07 02:35:57,130\tINFO services.py:1172 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265\u001b[39m\u001b[22m\n", - "2021-05-07 02:35:58,044\tWARNING function_runner.py:540 -- Function checkpointing is disabled. This may result in unexpected behavior when using checkpointing features or certain schedulers. To enable, set the train function arguments to be `func(config, checkpoint_dir=None)`.\n", - "Tuning started...\n" + "/home/ec2-user/miniconda3/envs/myflaml/lib/python3.8/site-packages/ray/_private/services.py:238: UserWarning: Not all Ray Dashboard dependencies were found. To use the dashboard please install Ray using `pip install ray[default]`. To disable this message, set RAY_DISABLE_IMPORT_WARNING env var to '1'.\n", + " warnings.warn(warning_message)\n", + "2021-12-01 23:35:54,348\tWARNING function_runner.py:558 -- Function checkpointing is disabled. 
This may result in unexpected behavior when using checkpointing features or certain schedulers. To enable, set the train function arguments to be `func(config, checkpoint_dir=None)`.\n" ] }, { - "output_type": "display_data", - "data": { - "text/plain": "", - "text/html": "== Status ==
Memory usage on this node: 26.0/251.6 GiB
Using FIFO scheduling algorithm.
Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)
Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58
Number of trials: 1/infinite (1 RUNNING)

" - }, - "metadata": {} + "name": "stdout", + "output_type": "stream", + "text": [ + "Tuning started...\n", + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" + ] + }, + { + "data": { + "text/html": [ + "== Status ==
Memory usage on this node: 4.3/7.7 GiB
Using FIFO scheduling algorithm.
Resources requested: 4.0/4 CPUs, 4.0/4 GPUs, 0.0/2.34 GiB heap, 0.0/1.17 GiB objects
Result logdir: /home/ec2-user/FLAML/notebook/logs/train_distilbert_2021-12-01_23-35-54
Number of trials: 1/infinite (1 RUNNING)

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "== Status ==
Memory usage on this node: 4.5/7.7 GiB
Using FIFO scheduling algorithm.
Resources requested: 4.0/4 CPUs, 4.0/4 GPUs, 0.0/2.34 GiB heap, 0.0/1.17 GiB objects
Result logdir: /home/ec2-user/FLAML/notebook/logs/train_distilbert_2021-12-01_23-35-54
Number of trials: 2/infinite (1 PENDING, 1 RUNNING)

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "== Status ==
Memory usage on this node: 4.6/7.7 GiB
Using FIFO scheduling algorithm.
Resources requested: 4.0/4 CPUs, 4.0/4 GPUs, 0.0/2.34 GiB heap, 0.0/1.17 GiB objects
Result logdir: /home/ec2-user/FLAML/notebook/logs/train_distilbert_2021-12-01_23-35-54
Number of trials: 2/infinite (1 PENDING, 1 RUNNING)

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" }, { - "output_type": "stream", "name": "stderr", + "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(pid=886303)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n", + "\u001b[2m\u001b[36m(pid=11344)\u001b[0m Reusing dataset glue (/home/ec2-user/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)\n", " 0%| | 0/9 [00:00", - "text/html": "== Status ==
Memory usage on this node: 30.9/251.6 GiB
Using FIFO scheduling algorithm.
Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)
Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58
Number of trials: 2/infinite (1 PENDING, 1 RUNNING)

" - }, - "metadata": {} - }, - { - "output_type": "stream", "name": "stdout", - "text": [ - "Trial train_distilbert_a0c303d0 completed. Last result: loss=0.5879864692687988,matthews_correlation=0.0\n", - "\u001b[2m\u001b[36m(pid=886302)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n", - " 0%| | 0/9 [00:00", - "text/html": "== Status ==
Memory usage on this node: 31.2/251.6 GiB
Using FIFO scheduling algorithm.
Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)
Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58
Number of trials: 3/infinite (1 PENDING, 1 RUNNING, 1 TERMINATED)

" - }, - "metadata": {} - }, - { "output_type": "stream", - "name": "stdout", "text": [ - "Trial train_distilbert_a0c303d1 completed. Last result: loss=0.6030182838439941,matthews_correlation=0.0\n", - "\u001b[2m\u001b[36m(pid=886305)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n", - " 0%| | 0/9 [00:00", - "text/html": "== Status ==
Memory usage on this node: 31.4/251.6 GiB
Using FIFO scheduling algorithm.
Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)
Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58
Number of trials: 4/infinite (1 PENDING, 1 RUNNING, 2 TERMINATED)

" - }, - "metadata": {} - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Trial train_distilbert_c39b2ef0 completed. Last result: loss=0.5865175724029541,matthews_correlation=0.0\n", - "\u001b[2m\u001b[36m(pid=886304)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n", - " 0%| | 0/9 [00:00", - "text/html": "== Status ==
Memory usage on this node: 31.7/251.6 GiB
Using FIFO scheduling algorithm.
Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)
Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58
Number of trials: 5/infinite (1 PENDING, 1 RUNNING, 3 TERMINATED)

" - }, - "metadata": {} - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Trial train_distilbert_f00776e2 completed. Last result: loss=0.5813134908676147,matthews_correlation=0.0\n", - "\u001b[2m\u001b[36m(pid=892770)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n", - " 0%| | 0/9 [00:00", - "text/html": "== Status ==
Memory usage on this node: 32.0/251.6 GiB
Using FIFO scheduling algorithm.
Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)
Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58
Number of trials: 6/infinite (1 PENDING, 1 RUNNING, 4 TERMINATED)

" - }, - "metadata": {} - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Trial train_distilbert_11ab3900 completed. Last result: loss=0.5855756998062134,matthews_correlation=0.0\n", - "\u001b[2m\u001b[36m(pid=897725)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n", - " 0%| | 0/9 [00:00", - "text/html": "== Status ==
Memory usage on this node: 30.9/251.6 GiB
Using FIFO scheduling algorithm.
Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)
Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58
Number of trials: 7/infinite (1 PENDING, 1 RUNNING, 5 TERMINATED)

" - }, - "metadata": {} - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Trial train_distilbert_353025b6 completed. Last result: loss=0.5316324830055237,matthews_correlation=0.38889272875750597\n", - "\u001b[2m\u001b[36m(pid=907288)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n", - " 0%| | 0/9 [00:00", - "text/html": "== Status ==
Memory usage on this node: 31.3/251.6 GiB
Using FIFO scheduling algorithm.
Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)
Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58
Number of trials: 8/infinite (1 PENDING, 1 RUNNING, 6 TERMINATED)

" - }, - "metadata": {} - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Trial train_distilbert_5728a1de completed. Last result: loss=0.5385054349899292,matthews_correlation=0.2805581766595423\n", - "\u001b[2m\u001b[36m(pid=908756)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n", - " 0%| | 0/9 [00:00", - "text/html": "== Status ==
Memory usage on this node: 31.6/251.6 GiB
Using FIFO scheduling algorithm.
Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)
Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58
Number of trials: 9/infinite (1 PENDING, 1 RUNNING, 7 TERMINATED)

" - }, - "metadata": {} - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Trial train_distilbert_9394c2e2 completed. Last result: loss=0.5391769409179688,matthews_correlation=0.3272948213494272\n", - "\u001b[2m\u001b[36m(pid=912284)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n", - " 0%| | 0/9 [00:00", - "text/html": "== Status ==
Memory usage on this node: 31.9/251.6 GiB
Using FIFO scheduling algorithm.
Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)
Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58
Number of trials: 10/infinite (1 PENDING, 1 RUNNING, 8 TERMINATED)

" - }, - "metadata": {} - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Trial train_distilbert_b6543fec completed. Last result: loss=0.5275164842605591,matthews_correlation=0.37917684067701946\n", - "\u001b[2m\u001b[36m(pid=914582)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n", - " 0%| | 0/9 [00:00", - "text/html": "== Status ==
Memory usage on this node: 31.0/251.6 GiB
Using FIFO scheduling algorithm.
Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)
Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58
Number of trials: 11/infinite (1 PENDING, 1 RUNNING, 9 TERMINATED)

" - }, - "metadata": {} - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Trial train_distilbert_0071f998 completed. Last result: loss=0.5162246823310852,matthews_correlation=0.417156672319181\n", - "\u001b[2m\u001b[36m(pid=918301)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n", - " 0%| | 0/9 [00:00", - "text/html": "== Status ==
Memory usage on this node: 31.2/251.6 GiB
Using FIFO scheduling algorithm.
Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)
Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58
Number of trials: 12/infinite (1 PENDING, 1 RUNNING, 10 TERMINATED)

" - }, - "metadata": {} - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Trial train_distilbert_2f830be6 completed. Last result: loss=0.5516289472579956,matthews_correlation=0.06558874629318973\n", - "\u001b[2m\u001b[36m(pid=920414)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n", - " 0%| | 0/9 [00:00", - "text/html": "== Status ==
Memory usage on this node: 31.7/251.6 GiB
Using FIFO scheduling algorithm.
Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)
Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58
Number of trials: 13/infinite (1 PENDING, 1 RUNNING, 11 TERMINATED)

" - }, - "metadata": {} - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Trial train_distilbert_7ce03f12 completed. Last result: loss=0.523731529712677,matthews_correlation=0.45354879777314566\n", - "\u001b[2m\u001b[36m(pid=925520)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n", - " 0%| | 0/9 [00:00", - "text/html": "== Status ==
Memory usage on this node: 32.3/251.6 GiB
Using FIFO scheduling algorithm.
Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)
Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58
Number of trials: 14/infinite (1 PENDING, 1 RUNNING, 12 TERMINATED)

" - }, - "metadata": {} - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Trial train_distilbert_aaab0508 completed. Last result: loss=0.5112878680229187,matthews_correlation=0.4508496945113286\n", - "\u001b[2m\u001b[36m(pid=929827)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n", - " 0%| | 0/9 [00:00", - "text/html": "== Status ==
Memory usage on this node: 31.2/251.6 GiB
Using FIFO scheduling algorithm.
Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)
Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58
Number of trials: 15/infinite (1 PENDING, 1 RUNNING, 13 TERMINATED)

" - }, - "metadata": {} - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Trial train_distilbert_14262454 completed. Last result: loss=0.5350601673126221,matthews_correlation=0.40085080763525827\n", - "\u001b[2m\u001b[36m(pid=934238)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n", - " 0%| | 0/9 [00:00", - "text/html": "== Status ==
Memory usage on this node: 31.8/251.6 GiB
Using FIFO scheduling algorithm.
Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)
Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58
Number of trials: 16/infinite (1 PENDING, 1 RUNNING, 14 TERMINATED)

" - }, - "metadata": {} - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Trial train_distilbert_6d211fe6 completed. Last result: loss=0.609851062297821,matthews_correlation=0.5268023551875569\n", - "\u001b[2m\u001b[36m(pid=942628)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n", - " 0%| | 0/9 [00:00", - "text/html": "== Status ==
Memory usage on this node: 31.1/251.6 GiB
Using FIFO scheduling algorithm.
Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)
Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58
Number of trials: 17/infinite (1 PENDING, 1 RUNNING, 15 TERMINATED)

" - }, - "metadata": {} - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Trial train_distilbert_c980bae4 completed. Last result: loss=0.5422758460044861,matthews_correlation=0.32496815807366203\n", - "\u001b[2m\u001b[36m(pid=945904)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n", - " 0%| | 0/9 [00:00", - "text/html": "== Status ==
Memory usage on this node: 32.2/251.6 GiB
Using FIFO scheduling algorithm.
Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)
Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58
Number of trials: 18/infinite (1 PENDING, 1 RUNNING, 16 TERMINATED)

" - }, - "metadata": {} - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Trial train_distilbert_6d0d29d6 completed. Last result: loss=0.9238015413284302,matthews_correlation=0.5494735380761103\n", - "\u001b[2m\u001b[36m(pid=973869)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n", - " 0%| | 0/9 [00:00", - "text/html": "== Status ==
Memory usage on this node: 31.2/251.6 GiB
Using FIFO scheduling algorithm.
Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)
Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58
Number of trials: 19/infinite (1 PENDING, 1 RUNNING, 17 TERMINATED)

" - }, - "metadata": {} - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Trial train_distilbert_b16ea82a completed. Last result: loss=0.5334658622741699,matthews_correlation=0.4513069078434825\n", - "\u001b[2m\u001b[36m(pid=978003)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n", - " 0%| | 0/9 [00:00", - "text/html": "== Status ==
Memory usage on this node: 31.2/251.6 GiB
Using FIFO scheduling algorithm.
Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)
Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58
Number of trials: 20/infinite (1 PENDING, 1 RUNNING, 18 TERMINATED)

" - }, - "metadata": {} - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Trial train_distilbert_eddf7cc0 completed. Last result: loss=0.9832845330238342,matthews_correlation=0.5699304939602442\n", - "\u001b[2m\u001b[36m(pid=1000417)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n", - " 0%| | 0/9 [00:00", - "text/html": "== Status ==
Memory usage on this node: 31.4/251.6 GiB
Using FIFO scheduling algorithm.
Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)
Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58
Number of trials: 21/infinite (1 PENDING, 1 RUNNING, 19 TERMINATED)

" - }, - "metadata": {} - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Trial train_distilbert_43008974 completed. Last result: loss=0.8574612736701965,matthews_correlation=0.5200220944545176\n", - "\u001b[2m\u001b[36m(pid=1022436)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n", - " 0%| | 0/9 [00:00", - "text/html": "== Status ==
Memory usage on this node: 32.0/251.6 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/4 CPUs, 0/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)
Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58
Number of trials: 22/infinite (22 TERMINATED)

" - }, - "metadata": {} - }, - { - "output_type": "display_data", - "data": { - "text/plain": "", - "text/html": "== Status ==
Memory usage on this node: 32.0/251.6 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/4 CPUs, 0/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)
Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58
Number of trials: 22/infinite (22 TERMINATED)
\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
Trial name status loc adam_beta1 adam_beta2 adam_epsilon learning_rate num_train_epochs iter total time (s) loss matthews_correlation
train_distilbert_a0c303d0TERMINATED 0.939079 0.991865 7.96945e-08 5.61152e-06 1 1 55.69090.587986 0
train_distilbert_a0c303d1TERMINATED 0.811036 0.997214 2.05111e-09 2.05134e-06 1.44427 1 71.76630.603018 0
train_distilbert_c39b2ef0TERMINATED 0.909395 0.993715 1e-07 5.26543e-06 1 1 53.76190.586518 0
train_distilbert_f00776e2TERMINATED 0.968763 0.990019 4.38943e-08 5.98035e-06 1.02723 1 56.83820.581313 0
train_distilbert_11ab3900TERMINATED 0.962198 0.991838 7.09296e-08 5.06608e-06 1 1 54.02310.585576 0
train_distilbert_353025b6TERMINATED 0.91596 0.991892 8.95426e-08 6.21568e-06 2.15443 1 98.32330.531632 0.388893
train_distilbert_5728a1deTERMINATED 0.926933 0.993146 1e-07 1.00902e-05 1 1 55.37260.538505 0.280558
train_distilbert_9394c2e2TERMINATED 0.928106 0.990614 4.49975e-08 3.45674e-06 2.72935 1 121.388 0.539177 0.327295
train_distilbert_b6543fecTERMINATED 0.876896 0.992098 1e-07 7.01176e-06 1.59538 1 76.02440.527516 0.379177
train_distilbert_0071f998TERMINATED 0.955024 0.991687 7.39776e-08 5.50998e-06 2.90939 1 126.871 0.516225 0.417157
train_distilbert_2f830be6TERMINATED 0.886931 0.989628 7.6127e-08 4.37646e-06 1.53338 1 73.89340.551629 0.0655887
train_distilbert_7ce03f12TERMINATED 0.984053 0.993956 8.70144e-08 7.82557e-06 4.08775 1 174.027 0.523732 0.453549
train_distilbert_aaab0508TERMINATED 0.940707 0.993946 1e-07 8.91979e-06 3.40243 1 146.249 0.511288 0.45085
train_distilbert_14262454TERMINATED 0.99 0.991696 4.60093e-08 4.83405e-06 3.4954 1 152.008 0.53506 0.400851
train_distilbert_6d211fe6TERMINATED 0.959277 0.994556 5.40791e-08 1.17333e-05 6.64995 1 271.444 0.609851 0.526802
train_distilbert_c980bae4TERMINATED 0.99 0.993355 1e-07 5.21929e-06 2.51275 1 111.799 0.542276 0.324968
train_distilbert_6d0d29d6TERMINATED 0.965773 0.995182 9.9752e-08 1.15549e-05 13.694 1 527.944 0.923802 0.549474
train_distilbert_b16ea82aTERMINATED 0.952781 0.993931 2.93182e-08 1.19145e-05 3.2293 1 139.844 0.533466 0.451307
train_distilbert_eddf7cc0TERMINATED 0.99 0.997109 8.13498e-08 1.28515e-05 15.5807 1 614.789 0.983285 0.56993
train_distilbert_43008974TERMINATED 0.929089 0.993258 1e-07 1.03892e-05 12.0357 1 474.387 0.857461 0.520022
train_distilbert_b3408a4eTERMINATED 0.99 0.993809 4.67441e-08 1.10418e-05 11.9165 1 474.126 0.828205 0.526164
train_distilbert_cfbfb220TERMINATED 0.979454 0.9999 1e-07 1.49578e-05 20.3715


" - }, - "metadata": {} - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "2021-05-07 03:42:30,035\tINFO tune.py:450 -- Total run time: 3992.00 seconds (3991.90 seconds for the tuning loop).\n" + "\u001b[2m\u001b[36m(pid=11344)\u001b[0m huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "\u001b[2m\u001b[36m(pid=11344)\u001b[0m To disable this warning, you can either:\n", + "\u001b[2m\u001b[36m(pid=11344)\u001b[0m \t- Avoid using `tokenizers` before the fork if possible\n", + "\u001b[2m\u001b[36m(pid=11344)\u001b[0m \t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", + "\u001b[2m\u001b[36m(pid=11344)\u001b[0m huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "\u001b[2m\u001b[36m(pid=11344)\u001b[0m To disable this warning, you can either:\n", + "\u001b[2m\u001b[36m(pid=11344)\u001b[0m \t- Avoid using `tokenizers` before the fork if possible\n", + "\u001b[2m\u001b[36m(pid=11344)\u001b[0m \t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] } ], @@ -1525,8 +893,7 @@ " metric=HP_METRIC,\n", " mode=MODE,\n", " low_cost_partial_config={\"num_train_epochs\": 1}),\n", - " report_intermediate_result=False,\n", - " # uncomment the following if report_intermediate_result = True\n", + " # uncomment the following if scheduler = 'auto',\n", " # max_resource=max_num_epoch, min_resource=1,\n", " resources_per_trial={\"gpu\": num_gpus, \"cpu\": num_cpus},\n", " local_dir='logs/',\n", @@ -1540,14 +907,17 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ - "n_trials=22\ntime=3999.769361972809\nBest model eval matthews_correlation: 0.5699\nBest model 
parameters: {'num_train_epochs': 15.580684188655825, 'learning_rate': 1.2851507818900338e-05, 'adam_epsilon': 8.134982521948352e-08, 'adam_beta1': 0.99, 'adam_beta2': 0.9971094424784387}\n" + "n_trials=22\n", + "time=3999.769361972809\n", + "Best model eval matthews_correlation: 0.5699\n", + "Best model parameters: {'num_train_epochs': 15.580684188655825, 'learning_rate': 1.2851507818900338e-05, 'adam_epsilon': 8.134982521948352e-08, 'adam_beta1': 0.99, 'adam_beta2': 0.9971094424784387}\n" ] } ], @@ -1561,6 +931,8 @@ ] }, { + "cell_type": "markdown", + "metadata": {}, "source": [ "## Next Steps\n", "\n", @@ -1568,15 +940,17 @@ "\n", "- Huggingface provides _Callbacks_ which can be used to insert the `flaml.tune.report` call inside the training loop\n", "- Make sure to set `do_eval=True` in the `TrainingArguments` provided to `Trainer` and adjust the evaluation frequency accordingly" - ], - "cell_type": "markdown", - "metadata": {} + ] } ], "metadata": { + "interpreter": { + "hash": "1cfcceddaeccda27c3cce104660d474924e2ba82887c0e8e481b6ede3743c483" + }, "kernelspec": { - "name": "python385jvsc74a57bd031f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6", - "display_name": "Python 3.8.5 64-bit" + "display_name": "Python 3.8.5 64-bit", + "language": "python", + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -1588,7 +962,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.8.12" }, "metadata": { "interpreter": { @@ -1598,4 +972,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/notebook/flaml_pytorch_cifar10.ipynb b/notebook/flaml_pytorch_cifar10.ipynb index ec89f929b..7e096607b 100644 --- a/notebook/flaml_pytorch_cifar10.ipynb +++ b/notebook/flaml_pytorch_cifar10.ipynb @@ -286,7 +286,7 @@ "metadata": {}, "outputs": [], "source": [ - "time_budget_s = 600 # time budget in seconds\n", + "time_budget_s = 3600 # time budget in seconds\n", 
"gpus_per_trial = 0.5 # number of gpus for each trial; 0.5 means two training jobs can share one gpu\n", "num_samples = 500 # maximal number of trials\n", "np.random.seed(7654321)" @@ -315,7 +315,7 @@ " low_cost_partial_config={\"num_epochs\": 1},\n", " max_resource=max_num_epoch,\n", " min_resource=1,\n", - " report_intermediate_result=True, # only set to True when intermediate results are reported by tune.report\n", + " scheduler=\"asha\", # need to use tune.report to report intermediate results in training_function \n", " resources_per_trial={\"cpu\": 1, \"gpu\": gpus_per_trial},\n", " local_dir='logs/',\n", " num_samples=num_samples,\n", @@ -325,24 +325,9 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "#trials=44\n", - "time=1193.913584947586\n", - "Best trial config: {'l1': 8, 'l2': 8, 'lr': 0.0008818671030627281, 'num_epochs': 55.9513429004283, 'batch_size': 3}\n", - "Best trial final validation loss: 1.0694482081472874\n", - "Best trial final validation accuracy: 0.6389\n", - "Files already downloaded and verified\n", - "Files already downloaded and verified\n", - "Best trial test set accuracy: 0.6294\n" - ] - } - ], + "outputs": [], "source": [ "print(f\"#trials={len(result.trials)}\")\n", "print(f\"time={time.time()-start_time}\")\n", @@ -390,7 +375,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.12" + "version": "3.8.12" }, "metadata": { "interpreter": { diff --git a/test/automl/test_forecast.py b/test/automl/test_forecast.py index f582d1119..5befb5307 100644 --- a/test/automl/test_forecast.py +++ b/test/automl/test_forecast.py @@ -71,7 +71,7 @@ def test_forecast_automl(budget=5): ) = get_output_from_log(filename=settings["log_file_name"], time_budget=budget) for config in config_history: print(config) - print(automl.prune_attr) + print(automl.resource_attr) 
print(automl.max_resource) print(automl.min_resource) @@ -210,7 +210,7 @@ def test_multivariate_forecast_num(budget=5): ) = get_output_from_log(filename=settings["log_file_name"], time_budget=budget) for config in config_history: print(config) - print(automl.prune_attr) + print(automl.resource_attr) print(automl.max_resource) print(automl.min_resource) @@ -341,7 +341,7 @@ def test_multivariate_forecast_cat(budget=5): ) = get_output_from_log(filename=settings["log_file_name"], time_budget=budget) for config in config_history: print(config) - print(automl.prune_attr) + print(automl.resource_attr) print(automl.max_resource) print(automl.min_resource) diff --git a/test/automl/test_notebook_example.py b/test/automl/test_notebook_example.py index 515d569b1..33c9c15ce 100644 --- a/test/automl/test_notebook_example.py +++ b/test/automl/test_notebook_example.py @@ -64,7 +64,7 @@ def test_automl(budget=5, dataset_format="dataframe", hpo_method=None): ) = get_output_from_log(filename=settings["log_file_name"], time_budget=6) for config in config_history: print(config) - print(automl.prune_attr) + print(automl.resource_attr) print(automl.max_resource) print(automl.min_resource) diff --git a/test/automl/test_python_log.py b/test/automl/test_python_log.py index c367600c1..3681fcd45 100644 --- a/test/automl/test_python_log.py +++ b/test/automl/test_python_log.py @@ -80,7 +80,7 @@ class TestLogging(unittest.TestCase): low_cost_partial_config=low_cost_partial_config, points_to_evaluate=automl.points_to_evaluate, cat_hp_cost=automl.cat_hp_cost, - prune_attr=automl.prune_attr, + resource_attr=automl.resource_attr, min_resource=automl.min_resource, max_resource=automl.max_resource, config_constraints=[ diff --git a/test/automl/test_xgboost2d.py b/test/automl/test_xgboost2d.py index 2c17850a0..992420cd4 100644 --- a/test/automl/test_xgboost2d.py +++ b/test/automl/test_xgboost2d.py @@ -71,7 +71,7 @@ def test_simple(method=None): low_cost_partial_config=automl.low_cost_partial_config, 
points_to_evaluate=automl.points_to_evaluate, cat_hp_cost=automl.cat_hp_cost, - prune_attr=automl.prune_attr, + resource_attr=automl.resource_attr, min_resource=automl.min_resource, max_resource=automl.max_resource, time_budget_s=automl._state.time_budget, diff --git a/test/tune/test_pytorch_cifar10.py b/test/tune/test_pytorch_cifar10.py index 3087f4548..2151bf281 100644 --- a/test/tune/test_pytorch_cifar10.py +++ b/test/tune/test_pytorch_cifar10.py @@ -239,7 +239,7 @@ def cifar10_main( low_cost_partial_config={"num_epochs": 1}, max_resource=max_num_epochs, min_resource=1, - report_intermediate_result=True, + scheduler="asha", resources_per_trial={"cpu": 1, "gpu": gpus_per_trial}, local_dir="logs/", num_samples=num_samples, diff --git a/test/tune/test_scheduler.py b/test/tune/test_scheduler.py new file mode 100644 index 000000000..906d03619 --- /dev/null +++ b/test/tune/test_scheduler.py @@ -0,0 +1,157 @@ +"""Require: pip install flaml[test,ray] +""" +from logging import raiseExceptions +from flaml.scheduler.trial_scheduler import TrialScheduler +import numpy as np +from flaml import tune +import time + + +def rand_vector_unit_sphere(dim): + """this function allows you to generate + points that uniformly distribute on + the (dim-1)-sphere. 
+ """ + vec = np.random.normal(0, 1, dim) + mag = np.linalg.norm(vec) + return vec / mag + + +def simple_obj(config, resource=10000): + config_value_vector = np.array([config["x"], config["y"], config["z"]]) + score_sequence = [] + for i in range(resource): + a = rand_vector_unit_sphere(3) + a[2] = abs(a[2]) + point_projection = np.dot(config_value_vector, a) + score_sequence.append(point_projection) + score_avg = np.mean(np.array(score_sequence)) + score_std = np.std(np.array(score_sequence)) + score_lb = score_avg - 1.96 * score_std / np.sqrt(resource) + tune.report(samplesize=resource, sphere_projection=score_lb) + + +def obj_w_intermediate_report(resource, config): + config_value_vector = np.array([config["x"], config["y"], config["z"]]) + score_sequence = [] + for i in range(resource): + a = rand_vector_unit_sphere(3) + a[2] = abs(a[2]) + point_projection = np.dot(config_value_vector, a) + score_sequence.append(point_projection) + if (i + 1) % 100 == 0: + score_avg = np.mean(np.array(score_sequence)) + score_std = np.std(np.array(score_sequence)) + score_lb = score_avg - 1.96 * score_std / np.sqrt(i + 1) + tune.report(samplesize=i + 1, sphere_projection=score_lb) + + +def obj_w_suggested_resource(resource_attr, config): + resource = config[resource_attr] + simple_obj(config, resource) + + +def test_scheduler(scheduler=None): + from functools import partial + + resource_attr = "samplesize" + max_resource = 10000 + + # specify the objective functions + if scheduler is None: + evaluation_obj = simple_obj + elif scheduler == "flaml": + evaluation_obj = partial(obj_w_suggested_resource, resource_attr) + elif scheduler == "asha" or isinstance(scheduler, TrialScheduler): + evaluation_obj = partial(obj_w_intermediate_report, max_resource) + else: + try: + from ray.tune.schedulers import TrialScheduler as RayTuneTrialScheduler + except ImportError: + print( + "skip this condition, which may require TrialScheduler from ray tune, \ + as ray tune cannot be imported." 
+ ) + return + if isinstance(scheduler, RayTuneTrialScheduler): + evaluation_obj = partial(obj_w_intermediate_report, max_resource) + else: + raise ValueError + + analysis = tune.run( + evaluation_obj, + config={ + "x": tune.uniform(5, 20), + "y": tune.uniform(0, 10), + "z": tune.uniform(0, 10), + }, + metric="sphere_projection", + mode="max", + verbose=1, + resource_attr=resource_attr, + scheduler=scheduler, + max_resource=max_resource, + min_resource=100, + reduction_factor=2, + time_budget_s=1, + num_samples=500, + ) + + print("Best hyperparameters found were: ", analysis.best_config) + # print(analysis.get_best_trial) + return analysis.best_config + + +def test_no_scheduler(): + best_config = test_scheduler() + print("No scheduler, test error:", abs(10 / 2 - best_config["z"] / 2)) + + +def test_asha_scheduler(): + try: + from ray.tune.schedulers import ASHAScheduler + except ImportError: + print("skip the test as ray tune cannot be imported.") + return + best_config = test_scheduler(scheduler="asha") + print("Auto ASHA scheduler, test error:", abs(10 / 2 - best_config["z"] / 2)) + + +def test_custom_scheduler(): + try: + from ray.tune.schedulers import HyperBandScheduler + except ImportError: + print("skip the test as ray tune cannot be imported.") + return + my_scheduler = HyperBandScheduler( + time_attr="samplesize", max_t=1000, reduction_factor=2 + ) + best_config = test_scheduler(scheduler=my_scheduler) + print("Custom ASHA scheduler, test error:", abs(10 / 2 - best_config["z"] / 2)) + + +def test_custom_scheduler_default_time_attr(): + try: + from ray.tune.schedulers import ASHAScheduler + except ImportError: + print("skip the test as ray tune cannot be imported.") + return + my_scheduler = ASHAScheduler(max_t=10) + best_config = test_scheduler(scheduler=my_scheduler) + print( + "Custom ASHA scheduler (with ASHA default time attr), test error:", + abs(10 / 2 - best_config["z"] / 2), + ) + + +def test_flaml_scheduler(): + best_config = 
test_scheduler(scheduler="flaml") + print("FLAML scheduler, test error", abs(10 / 2 - best_config["z"] / 2)) + + +if __name__ == "__main__": + test_no_scheduler() + test_asha_scheduler() + test_custom_scheduler() + test_custom_scheduler_default_time_attr() + test_flaml_scheduler() diff --git a/test/tune/test_tune.py b/test/tune/test_tune.py index d4bfbd24b..ab7232a08 100644 --- a/test/tune/test_tune.py +++ b/test/tune/test_tune.py @@ -83,7 +83,7 @@ def _test_xgboost(method="BlendSearch"): mode="min", max_resource=max_iter, min_resource=1, - report_intermediate_result=True, + scheduler="asha", # You can add "gpu": 0.1 to allocate GPUs resources_per_trial={"cpu": 1}, local_dir="logs/",