diff --git a/flaml/automl.py b/flaml/automl.py index b6f1c434c..be0fb18ca 100644 --- a/flaml/automl.py +++ b/flaml/automl.py @@ -845,7 +845,7 @@ class AutoML: if eval_method == 'auto' or self._state.X_val is not None: eval_method = self._decide_eval_method(time_budget) self._state.eval_method = eval_method - if not mlflow or not mlflow.active_run() and not logger.handler: + if (not mlflow or not mlflow.active_run()) and not logger.handlers: # Add the console handler. _ch = logging.StreamHandler() _ch.setFormatter(logger_formatter) @@ -1074,7 +1074,7 @@ class AutoML: search_state.best_config, estimator, search_state.sample_size) - if mlflow is not None: + if mlflow is not None and mlflow.active_run(): with mlflow.start_run(nested=True) as run: mlflow.log_metric('iter_counter', self._iter_per_learner[estimator]) diff --git a/flaml/searcher/blendsearch.py b/flaml/searcher/blendsearch.py index 62701e435..278a53ce9 100644 --- a/flaml/searcher/blendsearch.py +++ b/flaml/searcher/blendsearch.py @@ -25,6 +25,8 @@ class BlendSearch(Searcher): '''class for BlendSearch algorithm ''' + cost_attr = "time_total_s" # cost attribute in result + def __init__(self, metric: Optional[str] = None, mode: Optional[str] = None, @@ -193,7 +195,7 @@ class BlendSearch(Searcher): self._search_thread_pool[self._thread_count] = SearchThread( self._ls.mode, self._ls.create(config, result[self._metric], cost=result[ - "time_total_s"]) + self.cost_attr]) ) thread_id = self._thread_count self._thread_count += 1 @@ -393,7 +395,89 @@ class BlendSearch(Searcher): return True -class CFO(BlendSearch): +try: + from nni.tuner import Tuner as NNITuner + from nni.utils import extract_scalar_reward + try: + from ray.tune import (uniform, quniform, choice, randint, qrandint, randn, + qrandn, loguniform, qloguniform) + except: + from .sample import (uniform, quniform, choice, randint, qrandint, randn, + qrandn, loguniform, qloguniform) + + class BlendSearchTuner(BlendSearch, NNITuner): + '''Tuner class for NNI + ''' + + def receive_trial_result(self, parameter_id, parameters, value, + **kwargs): + ''' + Receive trial's final result. + parameter_id: int + parameters: object created by 'generate_parameters()' + value: final metrics of the trial, including default metric + ''' + result = {} + for key, value in parameters: + result['config/'+key] = value + reward = extract_scalar_reward(value) + result[self._metric] = reward + # if nni does not report training cost, + # using sequence as an approximation. + # if no sequence, using a constant 1 + result[self.cost_attr] = value.get(self.cost_attr, value.get( + 'sequence', 1)) + self.on_trial_complete(str(parameter_id), result) + ... + + def generate_parameters(self, parameter_id, **kwargs) -> Dict: + ''' + Returns a set of trial (hyper-)parameters, as a serializable object + parameter_id: int + ''' + return self.suggest(str(parameter_id)) + ... + + def update_search_space(self, search_space): + ''' + Tuners are advised to support updating search space at run-time. + If a tuner can only set search space once before generating first hyper-parameters, + it should explicitly document this behaviour. 
+ search_space: JSON object created by experiment owner + ''' + config = {} + for key, value in search_space: + v = value.get("_value") + _type = value['_type'] + if _type == 'choice': + config[key] = choice(v) + elif _type == 'randint': + config[key] = randint(v[0], v[1]-1) + elif _type == 'uniform': + config[key] = uniform(v[0], v[1]) + elif _type == 'quniform': + config[key] = quniform(v[0], v[1], v[2]) + elif _type == 'loguniform': + config[key] = loguniform(v[0], v[1]) + elif _type == 'qloguniform': + config[key] = qloguniform(v[0], v[1], v[2]) + elif _type == 'normal': + config[key] = randn(v[1], v[2]) + elif _type == 'qnormal': + config[key] = qrandn(v[1], v[2], v[3]) + else: + raise ValueError( + f'unsupported type in search_space {_type}') + self._ls.set_search_properties(None, None, config) + if self._gs is not None: + self._gs.set_search_properties(None, None, config) + self._init_search() + +except: + class BlendSearchTuner(BlendSearch): pass + + +class CFO(BlendSearchTuner): ''' class for CFO algorithm ''' @@ -416,3 +500,5 @@ class CFO(BlendSearch): ''' create thread condition ''' return len(self._search_thread_pool) < 2 + + diff --git a/flaml/searcher/flow2.py b/flaml/searcher/flow2.py index 3784ef957..681956867 100644 --- a/flaml/searcher/flow2.py +++ b/flaml/searcher/flow2.py @@ -9,9 +9,10 @@ try: from ray.tune.suggest import Searcher from ray.tune.suggest.variant_generator import generate_variants from ray.tune import sample + from ray.tune.utils.util import flatten_dict, unflatten_dict except ImportError: from .suggestion import Searcher - from .variant_generator import generate_variants + from .variant_generator import generate_variants, flatten_dict, unflatten_dict from ..tune import sample @@ -86,6 +87,7 @@ class FLOW2(Searcher): elif mode == "min": self.metric_op = 1. self.space = space or {} + self.space = flatten_dict(self.space, prevent_delimiter=True) self._random = np.random.RandomState(seed) self._seed = seed if not init_config: @@ -95,7 +97,8 @@ class FLOW2(Searcher): "consider providing init values for cost-related hps via " "'init_config'." 
) - self.init_config = self.best_config = init_config + self.init_config = init_config + self.best_config = flatten_dict(init_config) self.cat_hp_cost = cat_hp_cost self.prune_attr = prune_attr self.min_resource = min_resource @@ -171,7 +174,7 @@ class FLOW2(Searcher): # logger.info(self._resource) else: self._resource = None self.incumbent = {} - self.incumbent = self.normalize(self.init_config) + self.incumbent = self.normalize(self.best_config) # flattened self.best_obj = self.cost_incumbent = None self.dim = len(self._tunable_keys) # total # tunable dimensions self._direction_tried = None @@ -247,7 +250,7 @@ class FLOW2(Searcher): if key not in self._unordered_cat_hp: if upper and lower: u, l = upper[key], lower[key] - gauss_std = u-l + gauss_std = u-l or self.STEPSIZE # allowed bound u += self.STEPSIZE l -= self.STEPSIZE @@ -261,11 +264,11 @@ class FLOW2(Searcher): normalized[key] = max(l, min(u, normalized[key] + delta)) # use best config for unordered cat choice config = self.denormalize(normalized) - self._reset_times += 1 else: # first time init_config, or other configs, take as is config = partial_config.copy() - + if partial_config == self.init_config: self._reset_times += 1 + config = flatten_dict(config) for key, value in self.space.items(): if key not in config: config[key] = value @@ -277,13 +280,13 @@ class FLOW2(Searcher): if self._resource: config[self.prune_attr] = self.min_resource - return config + return unflatten_dict(config) def create(self, init_config: Dict, obj: float, cost: float) -> Searcher: flow2 = FLOW2(init_config, self.metric, self.mode, self._cat_hp_cost, - self.space, self.prune_attr, self.min_resource, - self.max_resource, self.resource_multiple_factor, - self._seed+1) + unflatten_dict(self.space), self.prune_attr, + self.min_resource, self.max_resource, + self.resource_multiple_factor, self._seed+1) flow2.best_obj = obj * self.metric_op # minimize internally flow2.cost_incumbent = cost return flow2 @@ -292,7 +295,7 @@ class FLOW2(Searcher): ''' normalize each dimension in config to [0,1] ''' config_norm = {} - for key, value in config.items(): + for key, value in flatten_dict(config).items(): if key in self.space: # domain: sample.Categorical/Integer/Float/Function domain = self.space[key] @@ -426,7 +429,7 @@ class FLOW2(Searcher): obj = result.get(self._metric) if obj: obj *= self.metric_op - if obj < self.best_obj: + if self.best_obj is None or obj < self.best_obj: self.best_obj, self.best_config = obj, self._configs[ trial_id] self.incumbent = self.normalize(self.best_config) @@ -437,7 +440,8 @@ class FLOW2(Searcher): self._cost_complete4incumbent = 0 self._num_allowed4incumbent = 2 * self.dim self._proposed_by.clear() - if self._K > 0: + if self._K > 0: + # self._oldK must have been set when self._K>0 self.step *= np.sqrt(self._K/self._oldK) if self.step > self.step_ub: self.step = self.step_ub self._iter_best_config = self.trial_count @@ -474,7 +478,7 @@ class FLOW2(Searcher): obj = result.get(self._metric) if obj: obj *= self.metric_op - if obj < self.best_obj: + if self.best_obj is None or obj < self.best_obj: self.best_obj = obj config = self._configs[trial_id] if self.best_config != config: @@ -533,7 +537,7 @@ class FLOW2(Searcher): config = self.denormalize(move) self._proposed_by[trial_id] = self.incumbent self._configs[trial_id] = config - return config + return unflatten_dict(config) def _project(self, config): ''' project normalized config in the feasible region and set prune_attr @@ -553,6 +557,7 @@ class FLOW2(Searcher): def 
config_signature(self, config) -> tuple: ''' return the signature tuple of a config ''' + config = flatten_dict(config) value_list = [] for key in self._space_keys: if key in config: diff --git a/flaml/searcher/search_thread.py b/flaml/searcher/search_thread.py index ed280ff46..84cb2f9f1 100644 --- a/flaml/searcher/search_thread.py +++ b/flaml/searcher/search_thread.py @@ -20,6 +20,7 @@ class SearchThread: ''' cost_attr = 'time_total_s' + eps = 1e-10 def __init__(self, mode: str = "min", search_alg: Optional[Searcher] = None): @@ -70,7 +71,7 @@ class SearchThread: # calculate speed; use 0 for invalid speed temporarily if self.obj_best2 > self.obj_best1: self.speed = (self.obj_best2 - self.obj_best1) / ( - self.cost_total - self.cost_best2) + self.cost_total - self.cost_best2 + self.eps) else: self.speed = 0 def on_trial_complete(self, trial_id: str, result: Optional[Dict] = None, diff --git a/flaml/searcher/variant_generator.py b/flaml/searcher/variant_generator.py index d1a80e0b2..5427604ee 100644 --- a/flaml/searcher/variant_generator.py +++ b/flaml/searcher/variant_generator.py @@ -28,6 +28,46 @@ from ..tune.sample import Categorical, Domain, Function logger = logging.getLogger(__name__) +def flatten_dict(dt, delimiter="/", prevent_delimiter=False): + dt = copy.deepcopy(dt) + if prevent_delimiter and any(delimiter in key for key in dt): + # Raise if delimiter is any of the keys + raise ValueError( + "Found delimiter `{}` in key when trying to flatten array." + "Please avoid using the delimiter in your specification.") + while any(isinstance(v, dict) for v in dt.values()): + remove = [] + add = {} + for key, value in dt.items(): + if isinstance(value, dict): + for subkey, v in value.items(): + if prevent_delimiter and delimiter in subkey: + # Raise if delimiter is in any of the subkeys + raise ValueError( + "Found delimiter `{}` in key when trying to " + "flatten array. Please avoid using the delimiter " + "in your specification.") + add[delimiter.join([key, str(subkey)])] = v + remove.append(key) + dt.update(add) + for k in remove: + del dt[k] + return dt + + +def unflatten_dict(dt, delimiter="/"): + """Unflatten dict. 
Does not support unflattening lists.""" + dict_type = type(dt) + out = dict_type() + for key, val in dt.items(): + path = key.split(delimiter) + item = out + for k in path[:-1]: + item = item.setdefault(k, dict_type()) + item[path[-1]] = val + return out + + class TuneError(Exception): """General error class raised by ray.tune.""" pass diff --git a/flaml/tune/tune.py b/flaml/tune/tune.py index 7ce35e85d..ad8bebf0a 100644 --- a/flaml/tune/tune.py +++ b/flaml/tune/tune.py @@ -17,6 +17,8 @@ logger = logging.getLogger(__name__) _use_ray = True _runner = None _verbose = 0 +_running_trial = None +_training_iteration = 0 class ExperimentAnalysis(EA): @@ -68,6 +70,8 @@ def report(_metric=None, **kwargs): ''' global _use_ray global _verbose + global _running_trial + global _training_iteration if _use_ray: from ray import tune return tune.report(_metric, **kwargs) @@ -77,6 +81,12 @@ def report(_metric=None, **kwargs): logger.info(f"result: {kwargs}") if _metric: result['_default_anonymous_metric'] = _metric trial = _runner.running_trial + if _running_trial == trial: + _training_iteration += 1 + else: + _training_iteration = 0 + _running_trial = trial + result["training_iteration"] = _training_iteration result['config'] = trial.config for key, value in trial.config.items(): result['config/'+key] = value @@ -213,7 +223,7 @@ def run(training_function, import os os.makedirs(local_dir, exist_ok=True) logger.addHandler(logging.FileHandler(local_dir+'/tune_'+str( - datetime.datetime.now())+'.log')) + datetime.datetime.now()).replace(':', '-')+'.log')) if verbose<=2: logger.setLevel(logging.INFO) else: diff --git a/flaml/version.py b/flaml/version.py index fe404ae57..01ef12070 100644 --- a/flaml/version.py +++ b/flaml/version.py @@ -1 +1 @@ -__version__ = "0.2.5" +__version__ = "0.2.6" diff --git a/notebook/flaml_finetune_transformer.ipynb b/notebook/flaml_finetune_transformer.ipynb index 9bea4ef61..3e3943053 100644 --- a/notebook/flaml_finetune_transformer.ipynb +++ b/notebook/flaml_finetune_transformer.ipynb @@ -11,7 +11,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 102, "metadata": {}, "outputs": [], "source": [ @@ -27,7 +27,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 103, "metadata": {}, "outputs": [], "source": [ @@ -36,7 +36,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 104, "metadata": {}, "outputs": [], "source": [ @@ -45,7 +45,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 105, "metadata": {}, "outputs": [], "source": [ @@ -54,18 +54,18 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 106, "metadata": {}, "outputs": [ { + "output_type": "execute_result", "data": { "text/plain": [ "{'input_ids': [101, 2023, 2003, 1037, 3231, 102], 'attention_mask': [1, 1, 1, 1, 1, 1]}" ] }, - "execution_count": 10, "metadata": {}, - "output_type": "execute_result" + "execution_count": 106 } ], "source": [ @@ -81,7 +81,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 107, "metadata": {}, "outputs": [], "source": [ @@ -90,7 +90,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 108, "metadata": {}, "outputs": [], "source": [ @@ -99,16 +99,14 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 109, "metadata": {}, "outputs": [ { - "name": "stderr", "output_type": "stream", + "name": "stderr", "text": [ - "Reusing dataset glue 
(/home/amin/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)\n", - "/home/amin/miniconda/lib/python3.7/site-packages/torch/cuda/__init__.py:52: UserWarning: CUDA initialization: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx (Triggered internally at /pytorch/c10/cuda/CUDAFunctions.cpp:100.)\n", - " return torch._C._cuda_getDeviceCount() > 0\n" + "Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)\n" ] } ], @@ -118,7 +116,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 110, "metadata": {}, "outputs": [], "source": [ @@ -130,70 +128,16 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 111, "metadata": {}, "outputs": [ { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "5bd7b23a478043eaaf6e14e119143fcd", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", "output_type": "stream", + "name": "stderr", "text": [ - "\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "d7b648c2dbdc4fb9907e43da7db8af9a", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "36a9d6e62dbe462d94b1769f36fbd0f3", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" + "Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-c3dd50f05994d4a5.arrow\n", + "Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-f2290a23c3c6f190.arrow\n", + "Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-6868a7b57fb52895.arrow\n" ] } ], @@ -203,10 +147,11 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 112, "metadata": {}, "outputs": [ { + "output_type": "execute_result", "data": { "text/plain": [ "{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],\n", @@ -234,9 +179,8 @@ " 'sentence': \"Our friends won't buy this analysis, let alone the next one we propose.\"}" ] }, - "execution_count": 19, "metadata": {}, - "output_type": "execute_result" + "execution_count": 112 } ], "source": [ @@ -252,7 +196,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 113, "metadata": {}, "outputs": [], "source": [ @@ -261,36 +205,15 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 114, "metadata": {}, "outputs": [ { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": 
"35b76e51b5c8406fae416fcdc3dd885e", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", "output_type": "stream", - "text": [ - "\n" - ] - }, - { "name": "stderr", - "output_type": "stream", "text": [ "Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n", - "- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).\n", + "- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", "Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" @@ -304,10 +227,11 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 115, "metadata": {}, "outputs": [ { + "output_type": "execute_result", "data": { "text/plain": [ "DistilBertForSequenceClassification(\n", @@ -425,9 +349,8 @@ ")" ] }, - "execution_count": 31, "metadata": {}, - "output_type": "execute_result" + "execution_count": 115 } ], "source": [ @@ -443,7 +366,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 116, "metadata": {}, "outputs": [], "source": [ @@ -452,31 +375,60 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 117, "metadata": {}, "outputs": [ { + "output_type": "execute_result", "data": { "text/plain": [ "Metric(name: \"glue\", features: {'predictions': Value(dtype='int64', id=None), 'references': Value(dtype='int64', id=None)}, usage: \"\"\"\n", "Compute GLUE evaluation metric associated to each GLUE dataset.\n", "Args:\n", - " predictions: list of translations to score.\n", + " predictions: list of predictions to score.\n", " Each translation should be tokenized into a list of tokens.\n", " references: list of lists of references for each translation.\n", " Each reference should be tokenized into a list of tokens.\n", "Returns: depending on the GLUE subset, one or several of:\n", " \"accuracy\": Accuracy\n", - " \"f1\": F1\n", + " \"f1\": F1 score\n", " \"pearson\": Pearson Correlation\n", " \"spearmanr\": Spearman Correlation\n", " \"matthews_correlation\": Matthew Correlation\n", + "Examples:\n", + "\n", + " >>> glue_metric = datasets.load_metric('glue', 'sst2') # 'sst2' or any of [\"mnli\", \"mnli_mismatched\", \"mnli_matched\", \"qnli\", \"rte\", \"wnli\", \"hans\"]\n", + " 
>>> references = [0, 1]\n", + " >>> predictions = [0, 1]\n", + " >>> results = glue_metric.compute(predictions=predictions, references=references)\n", + " >>> print(results)\n", + " {'accuracy': 1.0}\n", + "\n", + " >>> glue_metric = datasets.load_metric('glue', 'mrpc') # 'mrpc' or 'qqp'\n", + " >>> references = [0, 1]\n", + " >>> predictions = [0, 1]\n", + " >>> results = glue_metric.compute(predictions=predictions, references=references)\n", + " >>> print(results)\n", + " {'accuracy': 1.0, 'f1': 1.0}\n", + "\n", + " >>> glue_metric = datasets.load_metric('glue', 'stsb')\n", + " >>> references = [0., 1., 2., 3., 4., 5.]\n", + " >>> predictions = [0., 1., 2., 3., 4., 5.]\n", + " >>> results = glue_metric.compute(predictions=predictions, references=references)\n", + " >>> print({\"pearson\": round(results[\"pearson\"], 2), \"spearmanr\": round(results[\"spearmanr\"], 2)})\n", + " {'pearson': 1.0, 'spearmanr': 1.0}\n", + "\n", + " >>> glue_metric = datasets.load_metric('glue', 'cola')\n", + " >>> references = [0, 1]\n", + " >>> predictions = [0, 1]\n", + " >>> results = glue_metric.compute(predictions=predictions, references=references)\n", + " >>> print(results)\n", + " {'matthews_correlation': 1.0}\n", "\"\"\", stored examples: 0)" ] }, - "execution_count": 25, "metadata": {}, - "output_type": "execute_result" + "execution_count": 117 } ], "source": [ @@ -485,10 +437,11 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 118, "metadata": {}, "outputs": [], "source": [ + "import numpy as np\n", "def compute_metrics(eval_pred):\n", " predictions, labels = eval_pred\n", " predictions = np.argmax(predictions, axis=1)\n", @@ -504,7 +457,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 119, "metadata": {}, "outputs": [], "source": [ @@ -514,7 +467,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 120, "metadata": {}, "outputs": [], "source": [ @@ -526,7 +479,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 121, "metadata": {}, "outputs": [], "source": [ @@ -542,46 +495,9 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 122, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
-      "| Step | Training Loss |\n",
-      "|---|"
-     ],
-     "text/plain": [
-      "Memory usage on this node: 11.7/251.8 GiB\nUsing FIFO scheduling algorithm.\nResources requested: 4/4 CPUs, 4/4 GPUs, 0.0/161.91 GiB heap, 0.0/50.63 GiB objects (0/1.0 accelerator_type:V100)\nResult logdir: /raid/chiw/FLAML/notebook/logs/train_distilbert_2021-02-24_13-56-21\nNumber of trials: 1/infinite (1 RUNNING)"
+ },
+ "metadata": {}
+ },
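(Aside, illustrative only and not part of the diff: the flatten_dict/unflatten_dict helpers added to flaml/searcher/variant_generator.py above are what let FLOW2 work on nested search spaces; a minimal usage sketch with hypothetical config keys.)

    from flaml.searcher.variant_generator import flatten_dict, unflatten_dict

    nested = {"optimizer": {"lr": 1e-3, "beta1": 0.9}, "seed": 1}
    flat = flatten_dict(nested)
    # flat == {"optimizer/lr": 0.001, "optimizer/beta1": 0.9, "seed": 1}
    assert unflatten_dict(flat) == nested  # the round trip restores the nesting
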
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "\u001b[2m\u001b[36m(pid=29589)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)\n",
+ "\u001b[2m\u001b[36m(pid=29589)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-bec756fc24993464.arrow\n",
+ "\u001b[2m\u001b[36m(pid=29589)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-3b411a778de4d998.arrow\n",
+ "\u001b[2m\u001b[36m(pid=29589)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-c7231adac87a0159.arrow\n",
+ "\u001b[2m\u001b[36m(pid=29589)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
+ "\u001b[2m\u001b[36m(pid=29589)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+ "\u001b[2m\u001b[36m(pid=29589)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+ "\u001b[2m\u001b[36m(pid=29589)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
+ "\u001b[2m\u001b[36m(pid=29589)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
+ "\u001b[2m\u001b[36m(pid=29589)\u001b[0m {'train_runtime': 37.2833, 'train_samples_per_second': 7.188, 'epoch': 1.0}\n",
+ "Trial train_distilbert_21b2c490 reported matthews_correlation=0.00 with parameters={'num_train_epochs': 1, 'learning_rate': 5.61151641533451e-06, 'adam_epsilon': 7.969454818643929e-08, 'adam_beta1': 0.9390788489441669, 'adam_beta2': 0.99186521389353}.\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": "
Memory usage on this node: 16.0/251.8 GiB
Using FIFO scheduling algorithm.
Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/161.91 GiB heap, 0.0/50.63 GiB objects (0/1.0 accelerator_type:V100)
Current best trial: 21b2c490 with matthews_correlation=0.0 and parameters={'num_train_epochs': 1, 'learning_rate': 5.61151641533451e-06, 'adam_epsilon': 7.969454818643929e-08, 'adam_beta1': 0.9390788489441669, 'adam_beta2': 0.99186521389353}
Result logdir: /raid/chiw/FLAML/notebook/logs/train_distilbert_2021-02-24_13-56-21
Number of trials: 2/infinite (1 PENDING, 1 RUNNING)
"
+ },
+ "metadata": {}
+ },
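(Aside, illustrative only: BlendSearchTuner.update_search_space in the diff above converts NNI-style search-space JSON into flaml/tune domains; the space below is hypothetical and the comments show what each '_type' maps to per that code.)

    # hypothetical NNI search space and the tune domains update_search_space builds from it
    nni_space = {
        "learning_rate": {"_type": "loguniform", "_value": [1e-6, 1e-3]},  # -> loguniform(1e-6, 1e-3)
        "num_train_epochs": {"_type": "quniform", "_value": [1, 10, 1]},   # -> quniform(1, 10, 1)
        "warmup_steps": {"_type": "randint", "_value": [0, 500]},          # -> randint(0, 499), note the v[1]-1 adjustment
        "scheduler": {"_type": "choice", "_value": ["linear", "cosine"]},  # -> choice(["linear", "cosine"])
    }
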
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Trial train_distilbert_21b2c490 completed. Last result: loss=0.5786514282226562,matthews_correlation=0.0\n",
+ "\u001b[2m\u001b[36m(pid=29589)\u001b[0m {'eval_loss': 0.5786514282226562, 'eval_matthews_correlation': 0.0, 'eval_runtime': 1.8133, 'eval_samples_per_second': 575.184, 'epoch': 1.0}\n",
+ "\u001b[2m\u001b[36m(pid=29588)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)\n",
+ "\u001b[2m\u001b[36m(pid=29588)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-bec756fc24993464.arrow\n",
+ "\u001b[2m\u001b[36m(pid=29588)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-3b411a778de4d998.arrow\n",
+ "\u001b[2m\u001b[36m(pid=29588)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-c7231adac87a0159.arrow\n",
+ "\u001b[2m\u001b[36m(pid=29588)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
+ "\u001b[2m\u001b[36m(pid=29588)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+ "\u001b[2m\u001b[36m(pid=29588)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+ "\u001b[2m\u001b[36m(pid=29588)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
+ "\u001b[2m\u001b[36m(pid=29588)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
+ "\u001b[2m\u001b[36m(pid=29588)\u001b[0m {'train_runtime': 205.6814, 'train_samples_per_second': 8.469, 'epoch': 6.5}\n",
+ "Trial train_distilbert_21b2c491 reported matthews_correlation=0.51 with parameters={'num_train_epochs': 6.496661243646011, 'learning_rate': 3.1345403715761375e-05, 'adam_epsilon': 1.2428131101359459e-08, 'adam_beta1': 0.9100859688137786, 'adam_beta2': 0.9850788361346603}.\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": "
Memory usage on this node: 16.5/251.8 GiB
Using FIFO scheduling algorithm.
Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/161.91 GiB heap, 0.0/50.63 GiB objects (0/1.0 accelerator_type:V100)
Current best trial: 21b2c491 with matthews_correlation=0.5093030018169853 and parameters={'num_train_epochs': 6.496661243646011, 'learning_rate': 3.1345403715761375e-05, 'adam_epsilon': 1.2428131101359459e-08, 'adam_beta1': 0.9100859688137786, 'adam_beta2': 0.9850788361346603}
Result logdir: /raid/chiw/FLAML/notebook/logs/train_distilbert_2021-02-24_13-56-21
Number of trials: 3/infinite (1 PENDING, 1 RUNNING, 1 TERMINATED)
"
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Trial train_distilbert_21b2c491 completed. Last result: loss=0.9910964965820312,matthews_correlation=0.5093030018169853\n",
+ "\u001b[2m\u001b[36m(pid=29588)\u001b[0m {'eval_loss': 0.9910964965820312, 'eval_matthews_correlation': 0.5093030018169853, 'eval_runtime': 1.8366, 'eval_samples_per_second': 567.883, 'epoch': 6.5}\n",
+ "\u001b[2m\u001b[36m(pid=29591)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)\n",
+ "\u001b[2m\u001b[36m(pid=29591)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-bec756fc24993464.arrow\n",
+ "\u001b[2m\u001b[36m(pid=29591)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-3b411a778de4d998.arrow\n",
+ "\u001b[2m\u001b[36m(pid=29591)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-c7231adac87a0159.arrow\n",
+ "\u001b[2m\u001b[36m(pid=29591)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
+ "\u001b[2m\u001b[36m(pid=29591)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+ "\u001b[2m\u001b[36m(pid=29591)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+ "\u001b[2m\u001b[36m(pid=29591)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
+ "\u001b[2m\u001b[36m(pid=29591)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
+ "\u001b[2m\u001b[36m(pid=29591)\u001b[0m {'train_runtime': 37.2801, 'train_samples_per_second': 7.189, 'epoch': 1.0}\n",
+ "Trial train_distilbert_3f0da820 reported matthews_correlation=0.00 with parameters={'num_train_epochs': 1.0, 'learning_rate': 5.265428651017862e-06, 'adam_epsilon': 1e-07, 'adam_beta1': 0.9093950363089345, 'adam_beta2': 0.9937145453421068}.\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": "
Memory usage on this node: 16.7/251.8 GiB
Using FIFO scheduling algorithm.
Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/161.91 GiB heap, 0.0/50.63 GiB objects (0/1.0 accelerator_type:V100)
Current best trial: 21b2c491 with matthews_correlation=0.5093030018169853 and parameters={'num_train_epochs': 6.496661243646011, 'learning_rate': 3.1345403715761375e-05, 'adam_epsilon': 1.2428131101359459e-08, 'adam_beta1': 0.9100859688137786, 'adam_beta2': 0.9850788361346603}
Result logdir: /raid/chiw/FLAML/notebook/logs/train_distilbert_2021-02-24_13-56-21
Number of trials: 4/infinite (1 PENDING, 1 RUNNING, 2 TERMINATED)
"
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Trial train_distilbert_3f0da820 completed. Last result: loss=0.5775065422058105,matthews_correlation=0.0\n",
+ "\u001b[2m\u001b[36m(pid=29591)\u001b[0m {'eval_loss': 0.5775065422058105, 'eval_matthews_correlation': 0.0, 'eval_runtime': 1.7547, 'eval_samples_per_second': 594.388, 'epoch': 1.0}\n",
+ "\u001b[2m\u001b[36m(pid=29590)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)\n",
+ "\u001b[2m\u001b[36m(pid=29590)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-bec756fc24993464.arrow\n",
+ "\u001b[2m\u001b[36m(pid=29590)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-3b411a778de4d998.arrow\n",
+ "\u001b[2m\u001b[36m(pid=29590)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-c7231adac87a0159.arrow\n",
+ "\u001b[2m\u001b[36m(pid=29590)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
+ "\u001b[2m\u001b[36m(pid=29590)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+ "\u001b[2m\u001b[36m(pid=29590)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+ "\u001b[2m\u001b[36m(pid=29590)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
+ "\u001b[2m\u001b[36m(pid=29590)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
+ "\u001b[2m\u001b[36m(pid=29590)\u001b[0m {'train_runtime': 197.3016, 'train_samples_per_second': 8.591, 'epoch': 6.32}\n",
+ "Trial train_distilbert_c1106c22 reported matthews_correlation=0.55 with parameters={'num_train_epochs': 6.324445967486241, 'learning_rate': 2.9412189965562634e-05, 'adam_epsilon': 2.256452443236495e-08, 'adam_beta1': 0.880402156178546, 'adam_beta2': 0.9869155143904086}.\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": "
Memory usage on this node: 15.9/251.8 GiB
Using FIFO scheduling algorithm.
Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/161.91 GiB heap, 0.0/50.63 GiB objects (0/1.0 accelerator_type:V100)
Current best trial: c1106c22 with matthews_correlation=0.5451837431775948 and parameters={'num_train_epochs': 6.324445967486241, 'learning_rate': 2.9412189965562634e-05, 'adam_epsilon': 2.256452443236495e-08, 'adam_beta1': 0.880402156178546, 'adam_beta2': 0.9869155143904086}
Result logdir: /raid/chiw/FLAML/notebook/logs/train_distilbert_2021-02-24_13-56-21
Number of trials: 5/infinite (1 PENDING, 1 RUNNING, 3 TERMINATED)
"
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Trial train_distilbert_c1106c22 completed. Last result: loss=0.8939734101295471,matthews_correlation=0.5451837431775948\n",
+ "\u001b[2m\u001b[36m(pid=29590)\u001b[0m {'eval_loss': 0.8939734101295471, 'eval_matthews_correlation': 0.5451837431775948, 'eval_runtime': 1.8277, 'eval_samples_per_second': 570.669, 'epoch': 6.32}\n",
+ "\u001b[2m\u001b[36m(pid=8754)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)\n",
+ "\u001b[2m\u001b[36m(pid=8754)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-bec756fc24993464.arrow\n",
+ "\u001b[2m\u001b[36m(pid=8754)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-3b411a778de4d998.arrow\n",
+ "\u001b[2m\u001b[36m(pid=8754)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-c7231adac87a0159.arrow\n",
+ "\u001b[2m\u001b[36m(pid=8754)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
+ "\u001b[2m\u001b[36m(pid=8754)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+ "\u001b[2m\u001b[36m(pid=8754)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+ "\u001b[2m\u001b[36m(pid=8754)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
+ "\u001b[2m\u001b[36m(pid=8754)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
+ "\u001b[2m\u001b[36m(pid=8754)\u001b[0m {'train_runtime': 105.8952, 'train_samples_per_second': 7.847, 'epoch': 3.1}\n",
+ "Trial train_distilbert_de95f5e6 reported matthews_correlation=0.48 with parameters={'num_train_epochs': 3.097601049860023, 'learning_rate': 3.015866216468612e-05, 'adam_epsilon': 6.092346813998939e-09, 'adam_beta1': 0.9628888910610184, 'adam_beta2': 0.9832186589335725}.\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": "
Memory usage on this node: 16.3/251.8 GiB
Using FIFO scheduling algorithm.
Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/161.91 GiB heap, 0.0/50.63 GiB objects (0/1.0 accelerator_type:V100)
Current best trial: c1106c22 with matthews_correlation=0.5451837431775948 and parameters={'num_train_epochs': 6.324445967486241, 'learning_rate': 2.9412189965562634e-05, 'adam_epsilon': 2.256452443236495e-08, 'adam_beta1': 0.880402156178546, 'adam_beta2': 0.9869155143904086}
Result logdir: /raid/chiw/FLAML/notebook/logs/train_distilbert_2021-02-24_13-56-21
Number of trials: 6/infinite (1 PENDING, 1 RUNNING, 4 TERMINATED)
"
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Trial train_distilbert_de95f5e6 completed. Last result: loss=0.5720887780189514,matthews_correlation=0.48369222635456827\n",
+ "\u001b[2m\u001b[36m(pid=8754)\u001b[0m {'eval_loss': 0.5720887780189514, 'eval_matthews_correlation': 0.48369222635456827, 'eval_runtime': 1.8561, 'eval_samples_per_second': 561.936, 'epoch': 3.1}\n",
+ "\u001b[2m\u001b[36m(pid=12777)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)\n",
+ "\u001b[2m\u001b[36m(pid=12777)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-bec756fc24993464.arrow\n",
+ "\u001b[2m\u001b[36m(pid=12777)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-3b411a778de4d998.arrow\n",
+ "\u001b[2m\u001b[36m(pid=12777)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-c7231adac87a0159.arrow\n",
+ "\u001b[2m\u001b[36m(pid=12777)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
+ "\u001b[2m\u001b[36m(pid=12777)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+ "\u001b[2m\u001b[36m(pid=12777)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+ "\u001b[2m\u001b[36m(pid=12777)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
+ "\u001b[2m\u001b[36m(pid=12777)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
+ "\u001b[2m\u001b[36m(pid=12777)\u001b[0m {'train_runtime': 330.1466, 'train_samples_per_second': 8.732, 'epoch': 10.76}\n",
+ "Trial train_distilbert_5bb0a1fc reported matthews_correlation=0.53 with parameters={'num_train_epochs': 10.755455977982155, 'learning_rate': 5.858103269448852e-05, 'adam_epsilon': 5.045085830072572e-08, 'adam_beta1': 0.845137019185222, 'adam_beta2': 0.9882166289933315}.\n",
+ "\u001b[2m\u001b[36m(pid=12777)\u001b[0m {'eval_loss': 1.5075323581695557, 'eval_matthews_correlation': 0.5282404248888111, 'eval_runtime': 1.7504, 'eval_samples_per_second': 595.853, 'epoch': 10.76}\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": "
Memory usage on this node: 15.9/251.8 GiB
Using FIFO scheduling algorithm.
Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/161.91 GiB heap, 0.0/50.63 GiB objects (0/1.0 accelerator_type:V100)
Current best trial: c1106c22 with matthews_correlation=0.5451837431775948 and parameters={'num_train_epochs': 6.324445967486241, 'learning_rate': 2.9412189965562634e-05, 'adam_epsilon': 2.256452443236495e-08, 'adam_beta1': 0.880402156178546, 'adam_beta2': 0.9869155143904086}
Result logdir: /raid/chiw/FLAML/notebook/logs/train_distilbert_2021-02-24_13-56-21
Number of trials: 7/infinite (1 PENDING, 1 RUNNING, 5 TERMINATED)
"
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Trial train_distilbert_5bb0a1fc completed. Last result: loss=1.5075323581695557,matthews_correlation=0.5282404248888111\n",
+ "\u001b[2m\u001b[36m(pid=39770)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)\n",
+ "\u001b[2m\u001b[36m(pid=39770)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-bec756fc24993464.arrow\n",
+ "\u001b[2m\u001b[36m(pid=39770)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-3b411a778de4d998.arrow\n",
+ "\u001b[2m\u001b[36m(pid=39770)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-c7231adac87a0159.arrow\n",
+ "\u001b[2m\u001b[36m(pid=39770)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
+ "\u001b[2m\u001b[36m(pid=39770)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+ "\u001b[2m\u001b[36m(pid=39770)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+ "\u001b[2m\u001b[36m(pid=39770)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
+ "\u001b[2m\u001b[36m(pid=39770)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
+ "\u001b[2m\u001b[36m(pid=39770)\u001b[0m {'train_runtime': 182.3796, 'train_samples_per_second': 8.724, 'epoch': 5.94}\n",
+ "Trial train_distilbert_a247fb2e reported matthews_correlation=0.54 with parameters={'num_train_epochs': 5.933063389003551, 'learning_rate': 1.845204084769373e-05, 'adam_epsilon': 1.372505378696326e-08, 'adam_beta1': 0.8534841230874768, 'adam_beta2': 0.9858475457825921}.\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": "
Memory usage on this node: 16.4/251.8 GiB
Using FIFO scheduling algorithm.
Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/161.91 GiB heap, 0.0/50.63 GiB objects (0/1.0 accelerator_type:V100)
Current best trial: c1106c22 with matthews_correlation=0.5451837431775948 and parameters={'num_train_epochs': 6.324445967486241, 'learning_rate': 2.9412189965562634e-05, 'adam_epsilon': 2.256452443236495e-08, 'adam_beta1': 0.880402156178546, 'adam_beta2': 0.9869155143904086}
Result logdir: /raid/chiw/FLAML/notebook/logs/train_distilbert_2021-02-24_13-56-21
Number of trials: 8/infinite (1 PENDING, 1 RUNNING, 6 TERMINATED)
"
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Trial train_distilbert_a247fb2e completed. Last result: loss=0.6974263191223145,matthews_correlation=0.5399503104637741\n",
+ "\u001b[2m\u001b[36m(pid=39770)\u001b[0m {'eval_loss': 0.6974263191223145, 'eval_matthews_correlation': 0.5399503104637741, 'eval_runtime': 1.8585, 'eval_samples_per_second': 561.204, 'epoch': 5.94}\n",
+ "\u001b[2m\u001b[36m(pid=7123)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)\n",
+ "\u001b[2m\u001b[36m(pid=7123)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-bec756fc24993464.arrow\n",
+ "\u001b[2m\u001b[36m(pid=7123)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-3b411a778de4d998.arrow\n",
+ "\u001b[2m\u001b[36m(pid=7123)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-c7231adac87a0159.arrow\n",
+ "\u001b[2m\u001b[36m(pid=7123)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
+ "\u001b[2m\u001b[36m(pid=7123)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+ "\u001b[2m\u001b[36m(pid=7123)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+ "\u001b[2m\u001b[36m(pid=7123)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
+ "\u001b[2m\u001b[36m(pid=7123)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
+ "\u001b[2m\u001b[36m(pid=7123)\u001b[0m {'train_runtime': 189.7562, 'train_samples_per_second': 8.59, 'epoch': 6.08}\n",
+ "Trial train_distilbert_6e9e8ec2 reported matthews_correlation=0.52 with parameters={'num_train_epochs': 6.078693989748608, 'learning_rate': 1.8357895987910622e-05, 'adam_epsilon': 1.5849146381322022e-08, 'adam_beta1': 0.8904370071918882, 'adam_beta2': 0.9844583428325462}.\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": "
Memory usage on this node: 17.1/251.8 GiB
Using FIFO scheduling algorithm.
Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/161.91 GiB heap, 0.0/50.63 GiB objects (0/1.0 accelerator_type:V100)
Current best trial: c1106c22 with matthews_correlation=0.5451837431775948 and parameters={'num_train_epochs': 6.324445967486241, 'learning_rate': 2.9412189965562634e-05, 'adam_epsilon': 2.256452443236495e-08, 'adam_beta1': 0.880402156178546, 'adam_beta2': 0.9869155143904086}
Result logdir: /raid/chiw/FLAML/notebook/logs/train_distilbert_2021-02-24_13-56-21
Number of trials: 9/infinite (1 PENDING, 1 RUNNING, 7 TERMINATED)
"
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Trial train_distilbert_6e9e8ec2 completed. Last result: loss=0.7202959656715393,matthews_correlation=0.5185394246694179\n",
+ "\u001b[2m\u001b[36m(pid=7123)\u001b[0m {'eval_loss': 0.7202959656715393, 'eval_matthews_correlation': 0.5185394246694179, 'eval_runtime': 1.6051, 'eval_samples_per_second': 649.814, 'epoch': 6.08}\n",
+ "\u001b[2m\u001b[36m(pid=14798)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)\n",
+ "\u001b[2m\u001b[36m(pid=14798)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-bec756fc24993464.arrow\n",
+ "\u001b[2m\u001b[36m(pid=14798)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-3b411a778de4d998.arrow\n",
+ "\u001b[2m\u001b[36m(pid=14798)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-c7231adac87a0159.arrow\n",
+ "\u001b[2m\u001b[36m(pid=14798)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
+ "\u001b[2m\u001b[36m(pid=14798)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+ "\u001b[2m\u001b[36m(pid=14798)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+ "\u001b[2m\u001b[36m(pid=14798)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
+ "\u001b[2m\u001b[36m(pid=14798)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
+ "\u001b[2m\u001b[36m(pid=14798)\u001b[0m {'train_runtime': 329.789, 'train_samples_per_second': 8.448, 'epoch': 10.4}\n",
+ "Trial train_distilbert_e30fd860 reported matthews_correlation=0.54 with parameters={'num_train_epochs': 10.39182109947885, 'learning_rate': 6.762356226483751e-05, 'adam_epsilon': 5.0195217227379364e-08, 'adam_beta1': 0.8951148565195837, 'adam_beta2': 0.9914274194005184}.\n",
+ "\u001b[2m\u001b[36m(pid=14798)\u001b[0m {'eval_loss': 1.505250334739685, 'eval_matthews_correlation': 0.5353569722427551, 'eval_runtime': 1.8314, 'eval_samples_per_second': 569.522, 'epoch': 10.4}\n",
+ "\u001b[2m\u001b[36m(pid=14798)\u001b[0m \n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": "
Memory usage on this node: 15.9/251.8 GiB
Using FIFO scheduling algorithm.
Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/161.91 GiB heap, 0.0/50.63 GiB objects (0/1.0 accelerator_type:V100)
Current best trial: c1106c22 with matthews_correlation=0.5451837431775948 and parameters={'num_train_epochs': 6.324445967486241, 'learning_rate': 2.9412189965562634e-05, 'adam_epsilon': 2.256452443236495e-08, 'adam_beta1': 0.880402156178546, 'adam_beta2': 0.9869155143904086}
Result logdir: /raid/chiw/FLAML/notebook/logs/train_distilbert_2021-02-24_13-56-21
Number of trials: 10/infinite (1 PENDING, 1 RUNNING, 8 TERMINATED)
"
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Trial train_distilbert_e30fd860 completed. Last result: loss=1.505250334739685,matthews_correlation=0.5353569722427551\n",
+ "\u001b[2m\u001b[36m(pid=27867)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)\n",
+ "\u001b[2m\u001b[36m(pid=27867)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-bec756fc24993464.arrow\n",
+ "\u001b[2m\u001b[36m(pid=27867)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-3b411a778de4d998.arrow\n",
+ "\u001b[2m\u001b[36m(pid=27867)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-c7231adac87a0159.arrow\n",
+ "\u001b[2m\u001b[36m(pid=27867)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
+ "\u001b[2m\u001b[36m(pid=27867)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+ "\u001b[2m\u001b[36m(pid=27867)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+ "\u001b[2m\u001b[36m(pid=27867)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
+ "\u001b[2m\u001b[36m(pid=27867)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
+ "\u001b[2m\u001b[36m(pid=27867)\u001b[0m {'train_runtime': 259.759, 'train_samples_per_second': 9.078, 'epoch': 8.8}\n",
+ "Trial train_distilbert_5bddb1ae reported matthews_correlation=0.55 with parameters={'num_train_epochs': 8.797715187430134, 'learning_rate': 2.72412577596775e-05, 'adam_epsilon': 7.4151444539151255e-09, 'adam_beta1': 0.869942964703411, 'adam_beta2': 0.9852670758817403}.\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": "
Memory usage on this node: 16.8/251.8 GiB
Using FIFO scheduling algorithm.
Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/161.91 GiB heap, 0.0/50.63 GiB objects (0/1.0 accelerator_type:V100)
Current best trial: 5bddb1ae with matthews_correlation=0.5492247863049868 and parameters={'num_train_epochs': 8.797715187430134, 'learning_rate': 2.72412577596775e-05, 'adam_epsilon': 7.4151444539151255e-09, 'adam_beta1': 0.869942964703411, 'adam_beta2': 0.9852670758817403}
Result logdir: /raid/chiw/FLAML/notebook/logs/train_distilbert_2021-02-24_13-56-21
Number of trials: 11/infinite (1 PENDING, 1 RUNNING, 9 TERMINATED)
"
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Trial train_distilbert_5bddb1ae completed. Last result: loss=1.0900800228118896,matthews_correlation=0.5492247863049868\n",
+ "\u001b[2m\u001b[36m(pid=27867)\u001b[0m {'eval_loss': 1.0900800228118896, 'eval_matthews_correlation': 0.5492247863049868, 'eval_runtime': 1.6198, 'eval_samples_per_second': 643.889, 'epoch': 8.8}\n",
+ "\u001b[2m\u001b[36m(pid=38727)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)\n",
+ "\u001b[2m\u001b[36m(pid=38727)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-bec756fc24993464.arrow\n",
+ "\u001b[2m\u001b[36m(pid=38727)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-3b411a778de4d998.arrow\n",
+ "\u001b[2m\u001b[36m(pid=38727)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-c7231adac87a0159.arrow\n",
+ "\u001b[2m\u001b[36m(pid=38727)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
+ "\u001b[2m\u001b[36m(pid=38727)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+ "\u001b[2m\u001b[36m(pid=38727)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+ "\u001b[2m\u001b[36m(pid=38727)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
+ "\u001b[2m\u001b[36m(pid=38727)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
+ "\u001b[2m\u001b[36m(pid=38727)\u001b[0m {'train_runtime': 251.169, 'train_samples_per_second': 8.544, 'epoch': 8.01}\n",
+ "Trial train_distilbert_27da6108 reported matthews_correlation=0.55 with parameters={'num_train_epochs': 8.005678804316002, 'learning_rate': 1.931832460928058e-05, 'adam_epsilon': 6.696984191794608e-08, 'adam_beta1': 0.9116736888940158, 'adam_beta2': 0.9869397626562693}.\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": "
Memory usage on this node: 16.1/251.8 GiB
Using FIFO scheduling algorithm.
Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/161.91 GiB heap, 0.0/50.63 GiB objects (0/1.0 accelerator_type:V100)
Current best trial: 27da6108 with matthews_correlation=0.550740569901542 and parameters={'num_train_epochs': 8.005678804316002, 'learning_rate': 1.931832460928058e-05, 'adam_epsilon': 6.696984191794608e-08, 'adam_beta1': 0.9116736888940158, 'adam_beta2': 0.9869397626562693}
Result logdir: /raid/chiw/FLAML/notebook/logs/train_distilbert_2021-02-24_13-56-21
Number of trials: 12/infinite (1 PENDING, 1 RUNNING, 10 TERMINATED)
"
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Trial train_distilbert_27da6108 completed. Last result: loss=0.8646725416183472,matthews_correlation=0.550740569901542\n",
+ "\u001b[2m\u001b[36m(pid=38727)\u001b[0m {'eval_loss': 0.8646725416183472, 'eval_matthews_correlation': 0.550740569901542, 'eval_runtime': 1.7453, 'eval_samples_per_second': 597.588, 'epoch': 8.01}\n",
+ "\u001b[2m\u001b[36m(pid=8698)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)\n",
+ "\u001b[2m\u001b[36m(pid=8698)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-bec756fc24993464.arrow\n",
+ "\u001b[2m\u001b[36m(pid=8698)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-3b411a778de4d998.arrow\n",
+ "\u001b[2m\u001b[36m(pid=8698)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-c7231adac87a0159.arrow\n",
+ "\u001b[2m\u001b[36m(pid=8698)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
+ "\u001b[2m\u001b[36m(pid=8698)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+ "\u001b[2m\u001b[36m(pid=8698)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+ "\u001b[2m\u001b[36m(pid=8698)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
+ "\u001b[2m\u001b[36m(pid=8698)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
+ "\u001b[2m\u001b[36m(pid=8698)\u001b[0m {'train_runtime': 150.7963, 'train_samples_per_second': 8.641, 'epoch': 4.86}\n",
+ "Trial train_distilbert_ca4167f2 reported matthews_correlation=0.55 with parameters={'num_train_epochs': 4.8609021804212205, 'learning_rate': 3.0765755916918634e-05, 'adam_epsilon': 3.2784085089990583e-09, 'adam_beta1': 0.9001311340399742, 'adam_beta2': 0.9865549219923857}.\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": "
Memory usage on this node: 16.7/251.8 GiB
Using FIFO scheduling algorithm.
Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/161.91 GiB heap, 0.0/50.63 GiB objects (0/1.0 accelerator_type:V100)
Current best trial: 27da6108 with matthews_correlation=0.550740569901542 and parameters={'num_train_epochs': 8.005678804316002, 'learning_rate': 1.931832460928058e-05, 'adam_epsilon': 6.696984191794608e-08, 'adam_beta1': 0.9116736888940158, 'adam_beta2': 0.9869397626562693}
Result logdir: /raid/chiw/FLAML/notebook/logs/train_distilbert_2021-02-24_13-56-21
Number of trials: 13/infinite (1 PENDING, 1 RUNNING, 11 TERMINATED)
"
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Trial train_distilbert_ca4167f2 completed. Last result: loss=0.7426601052284241,matthews_correlation=0.5474713423103301\n",
+ "\u001b[2m\u001b[36m(pid=8698)\u001b[0m {'eval_loss': 0.7426601052284241, 'eval_matthews_correlation': 0.5474713423103301, 'eval_runtime': 1.6955, 'eval_samples_per_second': 615.172, 'epoch': 4.86}\n",
+ "\u001b[2m\u001b[36m(pid=26401)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)\n",
+ "\u001b[2m\u001b[36m(pid=26401)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-bec756fc24993464.arrow\n",
+ "\u001b[2m\u001b[36m(pid=26401)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-3b411a778de4d998.arrow\n",
+ "\u001b[2m\u001b[36m(pid=26401)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-c7231adac87a0159.arrow\n",
+ "\u001b[2m\u001b[36m(pid=26401)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
+ "\u001b[2m\u001b[36m(pid=26401)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+ "\u001b[2m\u001b[36m(pid=26401)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+ "\u001b[2m\u001b[36m(pid=26401)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
+ "\u001b[2m\u001b[36m(pid=26401)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
+ "\u001b[2m\u001b[36m(pid=26401)\u001b[0m {'train_runtime': 168.574, 'train_samples_per_second': 8.56, 'epoch': 5.38}\n",
+ "Trial train_distilbert_6776ad66 reported matthews_correlation=0.50 with parameters={'num_train_epochs': 5.381515555130151, 'learning_rate': 1.4923436298344364e-05, 'adam_epsilon': 4.718609673277113e-08, 'adam_beta1': 0.8855356638050199, 'adam_beta2': 0.9817714112199931}.\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": "
Memory usage on this node: 15.7/251.8 GiB
Using FIFO scheduling algorithm.
Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/161.91 GiB heap, 0.0/50.63 GiB objects (0/1.0 accelerator_type:V100)
Current best trial: 27da6108 with matthews_correlation=0.550740569901542 and parameters={'num_train_epochs': 8.005678804316002, 'learning_rate': 1.931832460928058e-05, 'adam_epsilon': 6.696984191794608e-08, 'adam_beta1': 0.9116736888940158, 'adam_beta2': 0.9869397626562693}
Result logdir: /raid/chiw/FLAML/notebook/logs/train_distilbert_2021-02-24_13-56-21
Number of trials: 14/infinite (1 PENDING, 1 RUNNING, 12 TERMINATED)
"
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "\u001b[2m\u001b[36m(pid=26401)\u001b[0m {'eval_loss': 0.6062898635864258, 'eval_matthews_correlation': 0.5039642659976749, 'eval_runtime': 1.8481, 'eval_samples_per_second': 564.358, 'epoch': 5.38}\n",
+ "Trial train_distilbert_6776ad66 completed. Last result: loss=0.6062898635864258,matthews_correlation=0.5039642659976749\n",
+ "\u001b[2m\u001b[36m(pid=36494)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)\n",
+ "\u001b[2m\u001b[36m(pid=36494)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-bec756fc24993464.arrow\n",
+ "\u001b[2m\u001b[36m(pid=36494)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-3b411a778de4d998.arrow\n",
+ "\u001b[2m\u001b[36m(pid=36494)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-c7231adac87a0159.arrow\n",
+ "\u001b[2m\u001b[36m(pid=36494)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
+ "\u001b[2m\u001b[36m(pid=36494)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+ "\u001b[2m\u001b[36m(pid=36494)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+ "\u001b[2m\u001b[36m(pid=36494)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
+ "\u001b[2m\u001b[36m(pid=36494)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
+ "\u001b[2m\u001b[36m(pid=36494)\u001b[0m {'train_runtime': 267.304, 'train_samples_per_second': 8.694, 'epoch': 8.67}\n",
+ "Trial train_distilbert_c904a63c reported matthews_correlation=0.54 with parameters={'num_train_epochs': 8.670157213614129, 'learning_rate': 3.589310669581693e-05, 'adam_epsilon': 1e-07, 'adam_beta1': 0.9159421419473668, 'adam_beta2': 0.9870278515925665}.\n",
+ "\u001b[2m\u001b[36m(pid=36494)\u001b[0m {'eval_loss': 1.15528404712677, 'eval_matthews_correlation': 0.541934635424655, 'eval_runtime': 1.8046, 'eval_samples_per_second': 577.975, 'epoch': 8.67}\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": "
Memory usage on this node: 16.4/251.8 GiB
Using FIFO scheduling algorithm.
Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/161.91 GiB heap, 0.0/50.63 GiB objects (0/1.0 accelerator_type:V100)
Current best trial: 27da6108 with matthews_correlation=0.550740569901542 and parameters={'num_train_epochs': 8.005678804316002, 'learning_rate': 1.931832460928058e-05, 'adam_epsilon': 6.696984191794608e-08, 'adam_beta1': 0.9116736888940158, 'adam_beta2': 0.9869397626562693}
Result logdir: /raid/chiw/FLAML/notebook/logs/train_distilbert_2021-02-24_13-56-21
Number of trials: 15/infinite (1 PENDING, 1 RUNNING, 13 TERMINATED)
"
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Trial train_distilbert_c904a63c completed. Last result: loss=1.15528404712677,matthews_correlation=0.541934635424655\n",
+ "\u001b[2m\u001b[36m(pid=7128)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)\n",
+ "\u001b[2m\u001b[36m(pid=7128)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-bec756fc24993464.arrow\n",
+ "\u001b[2m\u001b[36m(pid=7128)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-3b411a778de4d998.arrow\n",
+ "\u001b[2m\u001b[36m(pid=7128)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-c7231adac87a0159.arrow\n",
+ "\u001b[2m\u001b[36m(pid=7128)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
+ "\u001b[2m\u001b[36m(pid=7128)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+ "\u001b[2m\u001b[36m(pid=7128)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+ "\u001b[2m\u001b[36m(pid=7128)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
+ "\u001b[2m\u001b[36m(pid=7128)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
+ "\u001b[2m\u001b[36m(pid=7128)\u001b[0m {'train_runtime': 401.1267, 'train_samples_per_second': 8.808, 'epoch': 13.18}\n",
+ "Trial train_distilbert_34cd23b2 reported matthews_correlation=0.54 with parameters={'num_train_epochs': 13.180325143440442, 'learning_rate': 1.1392631517503339e-05, 'adam_epsilon': 8.551227707433237e-08, 'adam_beta1': 0.8917360114521684, 'adam_beta2': 0.9933954023113967}.\n",
+ "\u001b[2m\u001b[36m(pid=7128)\u001b[0m {'eval_loss': 0.9118097424507141, 'eval_matthews_correlation': 0.5361146089547957, 'eval_runtime': 1.6269, 'eval_samples_per_second': 641.089, 'epoch': 13.18}\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": "
Memory usage on this node: 16.4/251.8 GiB
Using FIFO scheduling algorithm.
Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/161.91 GiB heap, 0.0/50.63 GiB objects (0/1.0 accelerator_type:V100)
Current best trial: 27da6108 with matthews_correlation=0.550740569901542 and parameters={'num_train_epochs': 8.005678804316002, 'learning_rate': 1.931832460928058e-05, 'adam_epsilon': 6.696984191794608e-08, 'adam_beta1': 0.9116736888940158, 'adam_beta2': 0.9869397626562693}
Result logdir: /raid/chiw/FLAML/notebook/logs/train_distilbert_2021-02-24_13-56-21
Number of trials: 16/infinite (1 PENDING, 1 RUNNING, 14 TERMINATED)
"
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Trial train_distilbert_34cd23b2 completed. Last result: loss=0.9118097424507141,matthews_correlation=0.5361146089547957\n",
+ "\u001b[2m\u001b[36m(pid=23493)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)\n",
+ "\u001b[2m\u001b[36m(pid=23493)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-bec756fc24993464.arrow\n",
+ "\u001b[2m\u001b[36m(pid=23493)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-3b411a778de4d998.arrow\n",
+ "\u001b[2m\u001b[36m(pid=23493)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-c7231adac87a0159.arrow\n",
+ "\u001b[2m\u001b[36m(pid=23493)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
+ "\u001b[2m\u001b[36m(pid=23493)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+ "\u001b[2m\u001b[36m(pid=23493)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+ "\u001b[2m\u001b[36m(pid=23493)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
+ "\u001b[2m\u001b[36m(pid=23493)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
+ "\u001b[2m\u001b[36m(pid=23493)\u001b[0m {'train_runtime': 261.9267, 'train_samples_per_second': 8.548, 'epoch': 8.35}\n",
+ "Trial train_distilbert_dbc01c60 reported matthews_correlation=0.53 with parameters={'num_train_epochs': 8.351740081197375, 'learning_rate': 4.14474164779562e-05, 'adam_epsilon': 2.5536744573294183e-08, 'adam_beta1': 0.9010345773126118, 'adam_beta2': 0.98213801095907}.\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": "
Memory usage on this node: 16.0/251.8 GiB
Using FIFO scheduling algorithm.
Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/161.91 GiB heap, 0.0/50.63 GiB objects (0/1.0 accelerator_type:V100)
Current best trial: 27da6108 with matthews_correlation=0.550740569901542 and parameters={'num_train_epochs': 8.005678804316002, 'learning_rate': 1.931832460928058e-05, 'adam_epsilon': 6.696984191794608e-08, 'adam_beta1': 0.9116736888940158, 'adam_beta2': 0.9869397626562693}
Result logdir: /raid/chiw/FLAML/notebook/logs/train_distilbert_2021-02-24_13-56-21
Number of trials: 17/infinite (1 PENDING, 1 RUNNING, 15 TERMINATED)
"
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Trial train_distilbert_dbc01c60 completed. Last result: loss=1.270609974861145,matthews_correlation=0.5331291095663535\n",
+ "\u001b[2m\u001b[36m(pid=23493)\u001b[0m {'eval_loss': 1.270609974861145, 'eval_matthews_correlation': 0.5331291095663535, 'eval_runtime': 1.7863, 'eval_samples_per_second': 583.876, 'epoch': 8.35}\n",
+ "\u001b[2m\u001b[36m(pid=33982)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)\n",
+ "\u001b[2m\u001b[36m(pid=33982)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-bec756fc24993464.arrow\n",
+ "\u001b[2m\u001b[36m(pid=33982)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-3b411a778de4d998.arrow\n",
+ "\u001b[2m\u001b[36m(pid=33982)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-c7231adac87a0159.arrow\n",
+ "\u001b[2m\u001b[36m(pid=33982)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
+ "\u001b[2m\u001b[36m(pid=33982)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+ "\u001b[2m\u001b[36m(pid=33982)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+ "\u001b[2m\u001b[36m(pid=33982)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
+ "\u001b[2m\u001b[36m(pid=33982)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
+ "\u001b[2m\u001b[36m(pid=33982)\u001b[0m {'train_runtime': 307.947, 'train_samples_per_second': 8.501, 'epoch': 9.77}\n",
+ "2021-02-24 15:01:18,861\tINFO stopper.py:193 -- Reached timeout of 3600 seconds. Stopping all trials.\n",
+ "Trial train_distilbert_d1e00f7e reported matthews_correlation=0.50 with parameters={'num_train_epochs': 9.768470529742105, 'learning_rate': 7.278242504625585e-06, 'adam_epsilon': 9.024121328462365e-08, 'adam_beta1': 0.9568651413276459, 'adam_beta2': 0.9898624818542463}.\n",
+ "\u001b[2m\u001b[36m(pid=33982)\u001b[0m {'eval_loss': 0.6356746554374695, 'eval_matthews_correlation': 0.502884728860933, 'eval_runtime': 1.7441, 'eval_samples_per_second': 598.03, 'epoch': 9.77}\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": "
Memory usage on this node: 15.9/251.8 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/4 CPUs, 0/4 GPUs, 0.0/161.91 GiB heap, 0.0/50.63 GiB objects (0/1.0 accelerator_type:V100)
Current best trial: 27da6108 with matthews_correlation=0.550740569901542 and parameters={'num_train_epochs': 8.005678804316002, 'learning_rate': 1.931832460928058e-05, 'adam_epsilon': 6.696984191794608e-08, 'adam_beta1': 0.9116736888940158, 'adam_beta2': 0.9869397626562693}
Result logdir: /raid/chiw/FLAML/notebook/logs/train_distilbert_2021-02-24_13-56-21
Number of trials: 18/infinite (18 TERMINATED)
"
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": "
Memory usage on this node: 15.9/251.8 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/4 CPUs, 0/4 GPUs, 0.0/161.91 GiB heap, 0.0/50.63 GiB objects (0/1.0 accelerator_type:V100)
Current best trial: 27da6108 with matthews_correlation=0.550740569901542 and parameters={'num_train_epochs': 8.005678804316002, 'learning_rate': 1.931832460928058e-05, 'adam_epsilon': 6.696984191794608e-08, 'adam_beta1': 0.9116736888940158, 'adam_beta2': 0.9869397626562693}
Result logdir: /raid/chiw/FLAML/notebook/logs/train_distilbert_2021-02-24_13-56-21
Number of trials: 18/infinite (18 TERMINATED)

Trial name                 status      loc   adam_beta1   adam_beta2   adam_epsilon   learning_rate   num_train_epochs   iter   total time (s)   loss       matthews_correlation
train_distilbert_21b2c490  TERMINATED        0.939079     0.991865     7.96945e-08    5.61152e-06     1                  1      46.9698          0.578651   0
train_distilbert_21b2c491  TERMINATED        0.910086     0.985079     1.24281e-08    3.13454e-05     6.49666            1      215.872          0.991096   0.509303
train_distilbert_3f0da820  TERMINATED        0.909395     0.993715     1e-07          5.26543e-06     1                  1      47.3068          0.577507   0
train_distilbert_c1106c22  TERMINATED        0.880402     0.986916     2.25645e-08    2.94122e-05     6.32445            1      207.618          0.893973   0.545184
train_distilbert_de95f5e6  TERMINATED        0.962889     0.983219     6.09235e-09    3.01587e-05     3.0976             1      115.872          0.572089   0.483692
train_distilbert_5bb0a1fc  TERMINATED        0.845137     0.988217     5.04509e-08    5.8581e-05      10.7555            1      340.281          1.50753    0.52824
train_distilbert_a247fb2e  TERMINATED        0.853484     0.985848     1.37251e-08    1.8452e-05      5.93306            1      192.779          0.697426   0.53995
train_distilbert_6e9e8ec2  TERMINATED        0.890437     0.984458     1.58491e-08    1.83579e-05     6.07869            1      200.122          0.720296   0.518539
train_distilbert_e30fd860  TERMINATED        0.895115     0.991427     5.01952e-08    6.76236e-05     10.3918            1      339.615          1.50525    0.535357
train_distilbert_5bddb1ae  TERMINATED        0.869943     0.985267     7.41514e-09    2.72413e-05     8.79772            1      269.864          1.09008    0.549225
train_distilbert_27da6108  TERMINATED        0.911674     0.98694      6.69698e-08    1.93183e-05     8.00568            1      261.261          0.864673   0.550741
train_distilbert_ca4167f2  TERMINATED        0.900131     0.986555     3.27841e-09    3.07658e-05     4.8609             1      161.146          0.74266    0.547471
train_distilbert_6776ad66  TERMINATED        0.885536     0.981771     4.71861e-08    1.49234e-05     5.38152            1      178.269          0.60629    0.503964
train_distilbert_c904a63c  TERMINATED        0.915942     0.987028     1e-07          3.58931e-05     8.67016            1      277.56           1.15528    0.541935
train_distilbert_34cd23b2  TERMINATED        0.891736     0.993395     8.55123e-08    1.13926e-05     13.1803            1      410.4            0.91181    0.536115
train_distilbert_dbc01c60  TERMINATED        0.901035     0.982138     2.55367e-08    4.14474e-05     8.35174            1      272.136          1.27061    0.533129
train_distilbert_d1e00f7e  TERMINATED        0.956865     0.989862     9.02412e-08    7.27824e-06     9.76847            1      317.557          0.635675   0.502885
train_distilbert_759d8c04  TERMINATED        0.852308     0.986484     3.65877e-08    2.91155e-05     4.12326
"
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "2021-02-24 15:01:18,957\tINFO tune.py:448 -- Total run time: 3897.00 seconds (3896.97 seconds for the tuning loop).\n"
+ ]
+ }
+ ],
"source": [
"import time\n",
"import ray\n",
"start_time = time.time()\n",
+ "ray.shutdown()\n",
"ray.init(num_cpus=num_cpus, num_gpus=num_gpus)\n",
"\n",
"print(\"Tuning started...\")\n",
@@ -734,7 +1153,7 @@
" report_intermediate_result=False,\n",
" # uncomment the following if report_intermediate_result = True\n",
" # max_resource=max_num_epoch, min_resource=1,\n",
- " resources_per_trial={\"gpu\": 1},\n",
+ " resources_per_trial={\"gpu\": num_gpus, \"cpu\": num_cpus},\n",
" local_dir='logs/',\n",
" num_samples=num_samples,\n",
" time_budget_s=time_budget_s,\n",
@@ -746,9 +1165,17 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 127,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "n_trials=18\ntime=3903.5583679676056\nBest model eval matthews_correlation: 0.5507\nBest model parameters: {'num_train_epochs': 8.005678804316002, 'learning_rate': 1.931832460928058e-05, 'adam_epsilon': 6.696984191794608e-08, 'adam_beta1': 0.9116736888940158, 'adam_beta2': 0.9869397626562693}\n"
+ ]
+ }
+ ],
"source": [
"best_trial = analysis.get_best_trial(HP_METRIC, MODE, \"all\")\n",
"metric = best_trial.metric_analysis[HP_METRIC][MODE]\n",
@@ -765,7 +1192,7 @@
"Notice that we only reported the metric with `flaml.tune.report` at the end of full training loop. It is possible to enable reporting of intermediate performance - allowing early stopping - as follows:\n",
"\n",
"- Huggingface provides _Callbacks_ which can be used to insert the `flaml.tune.report` call inside the training loop\n",
- "- Make sure to set `do_eval=True` in the `TrainingArguments` provided to `Trainer` and adjust theevaluation frequency accordingly"
+ "- Make sure to set `do_eval=True` in the `TrainingArguments` provided to `Trainer` and adjust the evaluation frequency accordingly"
],
"cell_type": "markdown",
"metadata": {}
@@ -774,12 +1201,8 @@
"metadata": {
"kernelspec": {
"name": "python3",
- "display_name": "Python 3.7.7 64-bit ('flaml': conda)",
- "metadata": {
- "interpreter": {
- "hash": "bfcd9a6a9254a5e160761a1fd7a9e444f011592c6770d9f4180dde058a9df5dd"
- }
- }
+ "display_name": "Python 3",
+ "language": "python"
},
"language_info": {
"codemirror_mode": {
@@ -791,7 +1214,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.7.7-final"
+ "version": "3.7.9-final"
}
},
"nbformat": 4,
diff --git a/notebook/flaml_xgboost.ipynb b/notebook/flaml_xgboost.ipynb
new file mode 100644
index 000000000..425414daf
--- /dev/null
+++ b/notebook/flaml_xgboost.ipynb
@@ -0,0 +1,556 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "Copyright (c) 2020-2021 Microsoft Corporation. All rights reserved. \n",
+ "\n",
+ "Licensed under the MIT License.\n",
+ "\n",
+ "# Tune XGBoost with FLAML Library\n",
+ "\n",
+ "\n",
+ "## 1. Introduction\n",
+ "\n",
+ "FLAML is a Python library (https://github.com/microsoft/FLAML) designed to automatically produce accurate machine learning models \n",
+ "with low computational cost. It is fast and cheap. The simple and lightweight design makes it easy \n",
+ "to use and extend, such as adding new learners. FLAML can \n",
+ "- serve as an economical AutoML engine,\n",
+ "- be used as a fast hyperparameter tuning tool, or \n",
+ "- be embedded in self-tuning software that requires low latency & resource in repetitive\n",
+ " tuning tasks.\n",
+ "\n",
+ "In this notebook, we demonstrate how to use FLAML library to tune hyperparameters of XGBoost with a regression example.\n",
+ "\n",
+ "FLAML requires `Python>=3.6`. To run this notebook example, please install flaml with the `notebook` option:\n",
+ "```bash\n",
+ "pip install flaml[notebook]\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!pip install flaml[notebook];"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "## 2. Regression Example\n",
+ "### Load data and preprocess\n",
+ "\n",
+ "Download [houses dataset](https://www.openml.org/d/537) from OpenML. The task is to predict median price of the house in the region based on demographic composition and a state of housing market in the region."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "slideshow": {
+ "slide_type": "subslide"
+ },
+ "tags": []
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "load dataset from ./openml_ds537.pkl\nDataset name: houses\nX_train.shape: (15480, 8), y_train.shape: (15480,);\nX_test.shape: (5160, 8), y_test.shape: (5160,)\n"
+ ]
+ }
+ ],
+ "source": [
+ "from flaml.data import load_openml_dataset\n",
+ "X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id = 537, data_dir = './')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "### Run FLAML\n",
+ "In the FLAML automl run configuration, users can specify the task type, time budget, error metric, learner list, whether to subsample, resampling strategy type, and so on. All these arguments have default values which will be used if users do not provide them. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "''' import AutoML class from flaml package '''\n",
+ "from flaml import AutoML\n",
+ "automl = AutoML()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "settings = {\n",
+ " \"time_budget\": 60, # total running time in seconds\n",
+ " \"metric\": 'r2', # primary metrics for regression can be chosen from: ['mae','mse','r2']\n",
+ " \"estimator_list\": ['xgboost'], # list of ML learners; we tune xgboost in this example\n",
+ " \"task\": 'regression', # task type \n",
+ " \"log_file_name\": 'houses_experiment.log', # flaml log file\n",
+ "}"
+ ]
+ },
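A minimal sketch (not part of the notebook, whose own fit cell follows later and may pass additional arguments) of how such a settings dict is typically unpacked into the AutoML fit call:

# Hedged sketch of the usual FLAML fit invocation with the settings defined above.
automl.fit(X_train=X_train, y_train=y_train, **settings)
print('Best hyperparameters:', automl.best_config)
print('Best validation r2:', 1 - automl.best_loss)  # the minimized error metric is 1-r2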
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ },
+ "tags": []
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "[flaml.automl: 02-23 14:54:34] {853} INFO - Evaluation method: cv\n",
+ "INFO - Evaluation method: cv\n",
+ "[flaml.automl: 02-23 14:54:34] {577} INFO - Using RepeatedKFold\n",
+ "INFO - Using RepeatedKFold\n",
+ "[flaml.automl: 02-23 14:54:34] {874} INFO - Minimizing error metric: 1-r2\n",
+ "INFO - Minimizing error metric: 1-r2\n",
+ "[flaml.automl: 02-23 14:54:34] {894} INFO - List of ML learners in AutoML Run: ['xgboost']\n",
+ "INFO - List of ML learners in AutoML Run: ['xgboost']\n",
+ "[flaml.automl: 02-23 14:54:34] {953} INFO - iteration 0 current learner xgboost\n",
+ "INFO - iteration 0 current learner xgboost\n",
+ "[flaml.automl: 02-23 14:54:35] {1107} INFO - at 1.3s,\tbest xgboost's error=2.1267,\tbest xgboost's error=2.1267\n",
+ "INFO - at 1.3s,\tbest xgboost's error=2.1267,\tbest xgboost's error=2.1267\n",
+ "[flaml.automl: 02-23 14:54:35] {953} INFO - iteration 1 current learner xgboost\n",
+ "INFO - iteration 1 current learner xgboost\n",
+ "[flaml.automl: 02-23 14:54:35] {1107} INFO - at 1.4s,\tbest xgboost's error=2.1267,\tbest xgboost's error=2.1267\n",
+ "INFO - at 1.4s,\tbest xgboost's error=2.1267,\tbest xgboost's error=2.1267\n",
+ "[flaml.automl: 02-23 14:54:35] {953} INFO - iteration 2 current learner xgboost\n",
+ "INFO - iteration 2 current learner xgboost\n",
+ "[flaml.automl: 02-23 14:54:36] {1107} INFO - at 1.5s,\tbest xgboost's error=0.4565,\tbest xgboost's error=0.4565\n",
+ "INFO - at 1.5s,\tbest xgboost's error=0.4565,\tbest xgboost's error=0.4565\n",
+ "[flaml.automl: 02-23 14:54:36] {953} INFO - iteration 3 current learner xgboost\n",
+ "INFO - iteration 3 current learner xgboost\n",
+ "[flaml.automl: 02-23 14:54:36] {1107} INFO - at 1.6s,\tbest xgboost's error=0.4565,\tbest xgboost's error=0.4565\n",
+ "INFO - at 1.6s,\tbest xgboost's error=0.4565,\tbest xgboost's error=0.4565\n",
+ "[flaml.automl: 02-23 14:54:36] {953} INFO - iteration 4 current learner xgboost\n",
+ "INFO - iteration 4 current learner xgboost\n",
+ "[flaml.automl: 02-23 14:54:36] {1107} INFO - at 1.9s,\tbest xgboost's error=0.2697,\tbest xgboost's error=0.2697\n",
+ "INFO - at 1.9s,\tbest xgboost's error=0.2697,\tbest xgboost's error=0.2697\n",
+ "[flaml.automl: 02-23 14:54:36] {953} INFO - iteration 5 current learner xgboost\n",
+ "INFO - iteration 5 current learner xgboost\n",
+ "[flaml.automl: 02-23 14:54:36] {1107} INFO - at 2.1s,\tbest xgboost's error=0.2278,\tbest xgboost's error=0.2278\n",
+ "INFO - at 2.1s,\tbest xgboost's error=0.2278,\tbest xgboost's error=0.2278\n",
+ "[flaml.automl: 02-23 14:54:36] {953} INFO - iteration 6 current learner xgboost\n",
+ "INFO - iteration 6 current learner xgboost\n",
+ "[flaml.automl: 02-23 14:54:36] {1107} INFO - at 2.2s,\tbest xgboost's error=0.2278,\tbest xgboost's error=0.2278\n",
+ "INFO - at 2.2s,\tbest xgboost's error=0.2278,\tbest xgboost's error=0.2278\n",
+ "[flaml.automl: 02-23 14:54:36] {953} INFO - iteration 7 current learner xgboost\n",
+ "INFO - iteration 7 current learner xgboost\n",
+ "[flaml.automl: 02-23 14:54:36] {1107} INFO - at 2.5s,\tbest xgboost's error=0.2228,\tbest xgboost's error=0.2228\n",
+ "INFO - at 2.5s,\tbest xgboost's error=0.2228,\tbest xgboost's error=0.2228\n",
+ "[flaml.automl: 02-23 14:54:36] {953} INFO - iteration 8 current learner xgboost\n",
+ "INFO - iteration 8 current learner xgboost\n",
+ "[flaml.automl: 02-23 14:54:37] {1107} INFO - at 2.6s,\tbest xgboost's error=0.2228,\tbest xgboost's error=0.2228\n",
+ "INFO - at 2.6s,\tbest xgboost's error=0.2228,\tbest xgboost's error=0.2228\n",
+ "[flaml.automl: 02-23 14:54:37] {953} INFO - iteration 9 current learner xgboost\n",
+ "INFO - iteration 9 current learner xgboost\n",
+ "[flaml.automl: 02-23 14:54:37] {1107} INFO - at 2.8s,\tbest xgboost's error=0.2228,\tbest xgboost's error=0.2228\n",
+ "INFO - at 2.8s,\tbest xgboost's error=0.2228,\tbest xgboost's error=0.2228\n",
+ "[flaml.automl: 02-23 14:54:37] {953} INFO - iteration 10 current learner xgboost\n",
+ "INFO - iteration 10 current learner xgboost\n",
+ "[flaml.automl: 02-23 14:54:37] {1107} INFO - at 3.0s,\tbest xgboost's error=0.2228,\tbest xgboost's error=0.2228\n",
+ "INFO - at 3.0s,\tbest xgboost's error=0.2228,\tbest xgboost's error=0.2228\n",
+ "[flaml.automl: 02-23 14:54:37] {953} INFO - iteration 11 current learner xgboost\n",
+ "INFO - iteration 11 current learner xgboost\n",
+ "[flaml.automl: 02-23 14:54:38] {1107} INFO - at 3.6s,\tbest xgboost's error=0.2228,\tbest xgboost's error=0.2228\n",
+ "INFO - at 3.6s,\tbest xgboost's error=0.2228,\tbest xgboost's error=0.2228\n",
+ "[flaml.automl: 02-23 14:54:38] {953} INFO - iteration 12 current learner xgboost\n",
+ "INFO - iteration 12 current learner xgboost\n",
+ "[flaml.automl: 02-23 14:54:38] {1107} INFO - at 4.1s,\tbest xgboost's error=0.2228,\tbest xgboost's error=0.2228\n",
+ "INFO - at 4.1s,\tbest xgboost's error=0.2228,\tbest xgboost's error=0.2228\n",
+ "[flaml.automl: 02-23 14:54:38] {953} INFO - iteration 13 current learner xgboost\n",
+ "INFO - iteration 13 current learner xgboost\n",
+ "[flaml.automl: 02-23 14:54:38] {1107} INFO - at 4.2s,\tbest xgboost's error=0.2228,\tbest xgboost's error=0.2228\n",
+ "INFO - at 4.2s,\tbest xgboost's error=0.2228,\tbest xgboost's error=0.2228\n",
+ "[flaml.automl: 02-23 14:54:38] {953} INFO - iteration 14 current learner xgboost\n",
+ "INFO - iteration 14 current learner xgboost\n",
+ "[flaml.automl: 02-23 14:54:39] {1107} INFO - at 4.9s,\tbest xgboost's error=0.1814,\tbest xgboost's error=0.1814\n",
+ "INFO - at 4.9s,\tbest xgboost's error=0.1814,\tbest xgboost's error=0.1814\n",
+ "[flaml.automl: 02-23 14:54:39] {953} INFO - iteration 15 current learner xgboost\n",
+ "INFO - iteration 15 current learner xgboost\n",
+ "[flaml.automl: 02-23 14:54:39] {1107} INFO - at 5.2s,\tbest xgboost's error=0.1814,\tbest xgboost's error=0.1814\n",
+ "INFO - at 5.2s,\tbest xgboost's error=0.1814,\tbest xgboost's error=0.1814\n",
+ "[flaml.automl: 02-23 14:54:39] {953} INFO - iteration 16 current learner xgboost\n",
+ "INFO - iteration 16 current learner xgboost\n",
+ "[flaml.automl: 02-23 14:54:46] {1107} INFO - at 12.3s,\tbest xgboost's error=0.1813,\tbest xgboost's error=0.1813\n",
+ "INFO - at 12.3s,\tbest xgboost's error=0.1813,\tbest xgboost's error=0.1813\n",
+ "[flaml.automl: 02-23 14:54:46] {953} INFO - iteration 17 current learner xgboost\n",
+ "INFO - iteration 17 current learner xgboost\n",
+ "[flaml.automl: 02-23 14:54:51] {1107} INFO - at 17.5s,\tbest xgboost's error=0.1642,\tbest xgboost's error=0.1642\n",
+ "INFO - at 17.5s,\tbest xgboost's error=0.1642,\tbest xgboost's error=0.1642\n",
+ "[flaml.automl: 02-23 14:54:51] {953} INFO - iteration 18 current learner xgboost\n",
+ "INFO - iteration 18 current learner xgboost\n",
+ "[flaml.automl: 02-23 14:55:04] {1107} INFO - at 30.4s,\tbest xgboost's error=0.1642,\tbest xgboost's error=0.1642\n",
+ "INFO - at 30.4s,\tbest xgboost's error=0.1642,\tbest xgboost's error=0.1642\n",
+ "[flaml.automl: 02-23 14:55:04] {953} INFO - iteration 19 current learner xgboost\n",
+ "INFO - iteration 19 current learner xgboost\n",
+ "[flaml.automl: 02-23 14:55:06] {1107} INFO - at 32.1s,\tbest xgboost's error=0.1642,\tbest xgboost's error=0.1642\n",
+ "INFO - at 32.1s,\tbest xgboost's error=0.1642,\tbest xgboost's error=0.1642\n",
+ "[flaml.automl: 02-23 14:55:06] {953} INFO - iteration 20 current learner xgboost\n",
+ "INFO - iteration 20 current learner xgboost\n",
+ "[flaml.automl: 02-23 14:55:10] {1107} INFO - at 35.7s,\tbest xgboost's error=0.1642,\tbest xgboost's error=0.1642\n",
+ "INFO - at 35.7s,\tbest xgboost's error=0.1642,\tbest xgboost's error=0.1642\n",
+ "[flaml.automl: 02-23 14:55:10] {953} INFO - iteration 21 current learner xgboost\n",
+ "INFO - iteration 21 current learner xgboost\n",
+ "[flaml.automl: 02-23 14:55:11] {1107} INFO - at 36.7s,\tbest xgboost's error=0.1642,\tbest xgboost's error=0.1642\n",
+ "INFO - at 36.7s,\tbest xgboost's error=0.1642,\tbest xgboost's error=0.1642\n",
+ "[flaml.automl: 02-23 14:55:11] {953} INFO - iteration 22 current learner xgboost\n",
+ "INFO - iteration 22 current learner xgboost\n",
+ "[flaml.automl: 02-23 14:55:34] {1107} INFO - at 59.7s,\tbest xgboost's error=0.1601,\tbest xgboost's error=0.1601\n",
+ "INFO - at 59.7s,\tbest xgboost's error=0.1601,\tbest xgboost's error=0.1601\n",
+ "[flaml.automl: 02-23 14:55:34] {1148} INFO - selected model: