* xgboost notebook

* finetuning notebook

* finetuning test

* experimental nni support

* support nested search space

* log file name

* record training_iteration

* eps

* reset times

* std set to default step size if 0
Author: Chi Wang
Date: 2021-02-28 12:43:43 -08:00
Committed by: GitHub
Parent: 6ff0ed434b
Commit: 7bd231e497

12 changed files with 1370 additions and 220 deletions


@@ -15,6 +15,17 @@ try:
         Trainer,
         TrainingArguments,
     )
+    MODEL_CHECKPOINT = "distilbert-base-uncased"
+    TASK = "cola"
+    NUM_LABELS = 2
+    COLUMN_NAME = "sentence"
+    METRIC_NAME = "matthews_correlation"
+    # HP_METRIC, MODE = "loss", "min"
+    HP_METRIC, MODE = "matthews_correlation", "max"
+    # Define tokenize method
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT, use_fast=True)
 except:
     print("pip install torch transformers datasets flaml[blendsearch,ray]")
@@ -25,24 +36,21 @@ logger.setLevel(logging.INFO)

 import flaml

-MODEL_CHECKPOINT = "distilbert-base-uncased"
-TASK = "cola"
-NUM_LABELS = 2
-COLUMN_NAME = "sentence"
-METRIC_NAME = "matthews_correlation"
-# HP_METRIC, MODE = "loss", "min"
-HP_METRIC, MODE = "matthews_correlation", "max"

 def train_distilbert(config: dict):
-    # Define tokenize method
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT, use_fast=True)
+    metric = load_metric("glue", TASK)

     def tokenize(examples):
         return tokenizer(examples[COLUMN_NAME], truncation=True)

+    def compute_metrics(eval_pred):
+        predictions, labels = eval_pred
+        predictions = np.argmax(predictions, axis=1)
+        return metric.compute(predictions=predictions, references=labels)
+
+    # Load CoLA dataset and apply tokenizer
     cola_raw = load_dataset("glue", TASK)
     cola_encoded = cola_raw.map(tokenize, batched=True)
     train_dataset, eval_dataset = cola_encoded["train"], cola_encoded["validation"]
@@ -50,13 +58,6 @@ def train_distilbert(config: dict):
         MODEL_CHECKPOINT, num_labels=NUM_LABELS
     )
-    metric = load_metric("glue", TASK)
-
-    def compute_metrics(eval_pred):
-        predictions, labels = eval_pred
-        predictions = np.argmax(predictions, axis=1)
-        return metric.compute(predictions=predictions, references=labels)
-
     training_args = TrainingArguments(
         output_dir='.',
         do_eval=False,
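The TrainingArguments call is truncated by the hunk; the pattern is that each hyperparameter sampled into config gets forwarded to a TrainingArguments field. A sketch of that wiring, where only num_train_epochs is confirmed by the search space below and the other keys are hypothetical:

from transformers import TrainingArguments

def build_training_args(config: dict) -> TrainingArguments:
    # num_train_epochs appears in the search space below; the
    # learning_rate and batch-size keys are hypothetical examples.
    return TrainingArguments(
        output_dir='.',
        do_eval=False,
        num_train_epochs=config["num_train_epochs"],
        learning_rate=config.get("learning_rate", 5e-5),
        per_device_train_batch_size=config.get("batch_size", 16),
    )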
@@ -91,7 +92,7 @@ def _test_distillbert(method='BlendSearch'):
     max_num_epoch = 64
     num_samples = -1
-    time_budget_s = 10800
+    time_budget_s = 3600

     search_space = {
         # You can mix constants with search space objects.
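The search_space dict is cut off here; its opening comment describes mixing plain constants with Ray Tune sampling objects. An illustration of that pattern with placeholder entries (not the commit's actual space):

from ray import tune

search_space = {
    "learning_rate": tune.loguniform(1e-6, 1e-4),  # sampled per trial
    "num_train_epochs": tune.loguniform(1, 64),    # sampled per trial
    "seed": 42,                       # a constant passes through unchanged
}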
@@ -123,7 +124,7 @@ def _test_distillbert(method='BlendSearch'):
         from flaml import BlendSearch
         algo = BlendSearch(points_to_evaluate=[{
             "num_train_epochs": 1,
-            }])
+        }])
     elif 'Dragonfly' == method:
         from ray.tune.suggest.dragonfly import DragonflySearch
         algo = DragonflySearch()
@@ -139,7 +140,7 @@ def _test_distillbert(method='BlendSearch'):
         algo = ZOOptSearch(budget=num_samples)
     elif 'Ax' == method:
         from ray.tune.suggest.ax import AxSearch
-        algo = AxSearch()
+        algo = AxSearch(max_concurrent=3)
     elif 'HyperOpt' == method:
         from ray.tune.suggest.hyperopt import HyperOptSearch
         algo = HyperOptSearch()
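This hunk raises Ax's parallelism with max_concurrent=3. For searchers that do not accept that argument directly, Ray Tune of this era ships a generic wrapper; a sketch of that pattern (not part of this commit):

from ray.tune.suggest import ConcurrencyLimiter
from ray.tune.suggest.hyperopt import HyperOptSearch

algo = HyperOptSearch()
# Cap the number of trials the searcher proposes in parallel.
algo = ConcurrencyLimiter(algo, max_concurrent=3)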
@@ -154,8 +155,7 @@ def _test_distillbert(method='BlendSearch'):
         train_distilbert,
         metric=HP_METRIC,
         mode=MODE,
-        # You can add "gpu": 1 to allocate GPUs
-        resources_per_trial={"gpu": 1},
+        resources_per_trial={"gpu": 4, "cpu": 4},
         config=search_space, local_dir='test/logs/',
         num_samples=num_samples, time_budget_s=time_budget_s,
         keep_checkpoints_num=1, checkpoint_score_attr=HP_METRIC,
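Once tune.run finishes, the winning trial comes out of the returned analysis object; the xgboost test below logs best_trial.config the same way. A sketch against the Ray Tune 1.x API:

def report_best(analysis, hp_metric="matthews_correlation", mode="max"):
    # scope="all" considers every reported iteration of every trial,
    # not just each trial's final result.
    best_trial = analysis.get_best_trial(hp_metric, mode, "all")
    print("best config:", best_trial.config)
    print("best %s:" % hp_metric,
          best_trial.metric_analysis[hp_metric][mode])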


@@ -49,7 +49,6 @@ def _test_xgboost(method='BlendSearch'):
     else:
         from ray import tune
     search_space = {
-        # You can mix constants with search space objects.
         "max_depth": tune.randint(1, 8) if method in [
             "BlendSearch", "BOHB", "Optuna"] else tune.randint(1, 9),
         "min_child_weight": tune.choice([1, 2, 3]),
@@ -154,6 +153,33 @@ def _test_xgboost(method='BlendSearch'):
     logger.info(f"Best model parameters: {best_trial.config}")


+def test_nested():
+    from flaml import tune
+    search_space = {
+        # test nested search space
+        "cost_related": {
+            "a": tune.randint(1, 8),
+        },
+        "b": tune.uniform(0.5, 1.0),
+    }
+
+    def simple_func(config):
+        tune.report(
+            metric=(config["cost_related"]["a"]-4)**2 * (config["b"]-0.7)**2)
+
+    analysis = tune.run(
+        simple_func,
+        init_config={
+            "cost_related": {"a": 1,}
+        },
+        metric="metric",
+        mode="min",
+        config=search_space,
+        local_dir='logs/',
+        num_samples=-1,
+        time_budget_s=1)
+
+
 def test_xgboost_bs():
     _test_xgboost()
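test_nested's objective has an easy-to-check optimum, which is what makes it a good smoke test for the new nested search space support: the product of squares is zero whenever a = 4 or b = 0.7, and reading config["cost_related"]["a"] exercises the nested lookup. A standalone check:

def objective(config):
    # Mirrors simple_func's metric, minus the tune.report call.
    return (config["cost_related"]["a"] - 4) ** 2 * (config["b"] - 0.7) ** 2

print(objective({"cost_related": {"a": 1}, "b": 0.5}))  # 0.36 at the init point
print(objective({"cost_related": {"a": 4}, "b": 0.5}))  # 0.0, an optimum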