random search (#213)

* random search as a child class of CFO

* random search in sequential search of AutoML

* time to find best model as a property of AutoML
Author: Chi Wang
Date: 2021-09-19 11:19:23 -07:00 (committed by GitHub)
Parent: 0ba58e0ace
Commit: f3e50136e8
6 changed files with 122 additions and 59 deletions
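The three items above combine as in the short sketch below. It is not part of the commit: the searcher import path, the constructor arguments, the "hpo_method" setting, and the time_to_find_best_model property are taken from the tests in this diff, while the iris data, the toy search space, and the 5-second budget are illustrative assumptions.

    # Sketch only (not part of this commit); toy data, space, and budget are assumptions.
    from sklearn.datasets import load_iris
    from flaml import AutoML, tune
    from flaml.searcher.blendsearch import RandomSearch

    # 1) RandomSearch: a searcher implemented as a child class of CFO.
    searcher = RandomSearch(
        space={"a": tune.uniform(6, 8), "b": tune.loguniform(1e-4, 1e-2)},
        points_to_evaluate=[{"a": 7, "b": 1e-3}],
    )
    print(searcher.suggest("t1"))  # returns a config dict

    # 2) Random search inside AutoML's sequential search, via the hpo_method setting.
    X, y = load_iris(return_X_y=True)
    automl = AutoML()
    automl.fit(
        X_train=X,
        y_train=y,
        task="classification",
        time_budget=5,
        hpo_method="random",  # the same switch exercised by test_random() in the diff below
    )

    # 3) The time at which the best model was found, now exposed as a property.
    print("time taken to find best model:", automl.time_to_find_best_model)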


@@ -1,50 +1,66 @@
 from openml.exceptions import OpenMLServerException
-def test_automl(budget=5, dataset_format='dataframe', hpo_method=None):
+def test_automl(budget=5, dataset_format="dataframe", hpo_method=None):
     from flaml.data import load_openml_dataset
     try:
         X_train, X_test, y_train, y_test = load_openml_dataset(
-            dataset_id=1169, data_dir='test/', dataset_format=dataset_format)
+            dataset_id=1169, data_dir="test/", dataset_format=dataset_format
+        )
     except OpenMLServerException:
         print("OpenMLServerException raised")
         return
-    ''' import AutoML class from flaml package '''
+    """ import AutoML class from flaml package """
     from flaml import AutoML
     automl = AutoML()
     settings = {
         "time_budget": budget, # total running time in seconds
-        "metric": 'accuracy', # primary metrics can be chosen from: ['accuracy','roc_auc','roc_auc_ovr','roc_auc_ovo','f1','log_loss','mae','mse','r2']
-        "task": 'classification', # task type
-        "log_file_name": 'airlines_experiment.log', # flaml log file
-        "seed": 7654321, # random seed
-        'hpo_method': hpo_method
+        "metric": "accuracy", # primary metrics can be chosen from: ['accuracy','roc_auc','roc_auc_ovr','roc_auc_ovo','f1','log_loss','mae','mse','r2']
+        "task": "classification", # task type
+        "log_file_name": "airlines_experiment.log", # flaml log file
+        "seed": 7654321, # random seed
+        "hpo_method": hpo_method,
     }
-    '''The main flaml automl API'''
+    """The main flaml automl API"""
     automl.fit(X_train=X_train, y_train=y_train, **settings)
-    ''' retrieve best config and best learner'''
-    print('Best ML leaner:', automl.best_estimator)
-    print('Best hyperparmeter config:', automl.best_config)
-    print('Best accuracy on validation data: {0:.4g}'.format(1 - automl.best_loss))
-    print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))
+    """ retrieve best config and best learner """
+    print("Best ML leaner:", automl.best_estimator)
+    print("Best hyperparmeter config:", automl.best_config)
+    print("Best accuracy on validation data: {0:.4g}".format(1 - automl.best_loss))
+    print(
+        "Training duration of best run: {0:.4g} s".format(automl.best_config_train_time)
+    )
     print(automl.model.estimator)
-    ''' pickle and save the automl object '''
+    print("time taken to find best model:", automl.time_to_find_best_model)
+    """ pickle and save the automl object """
     import pickle
-    with open('automl.pkl', 'wb') as f:
+    with open("automl.pkl", "wb") as f:
         pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)
-    ''' compute predictions of testing dataset '''
+    """ compute predictions of testing dataset """
     y_pred = automl.predict(X_test)
-    print('Predicted labels', y_pred)
-    print('True labels', y_test)
+    print("Predicted labels", y_pred)
+    print("True labels", y_test)
     y_pred_proba = automl.predict_proba(X_test)[:, 1]
-    ''' compute different metric values on testing dataset'''
+    """ compute different metric values on testing dataset """
     from flaml.ml import sklearn_metric_loss_score
-    print('accuracy', '=', 1 - sklearn_metric_loss_score('accuracy', y_pred, y_test))
-    print('roc_auc', '=', 1 - sklearn_metric_loss_score('roc_auc', y_pred_proba, y_test))
-    print('log_loss', '=', sklearn_metric_loss_score('log_loss', y_pred_proba, y_test))
+    print("accuracy", "=", 1 - sklearn_metric_loss_score("accuracy", y_pred, y_test))
+    print(
+        "roc_auc", "=", 1 - sklearn_metric_loss_score("roc_auc", y_pred_proba, y_test)
+    )
+    print("log_loss", "=", sklearn_metric_loss_score("log_loss", y_pred_proba, y_test))
     from flaml.data import get_output_from_log
-    time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \
-        get_output_from_log(filename=settings['log_file_name'], time_budget=60)
+    (
+        time_history,
+        best_valid_loss_history,
+        valid_loss_history,
+        config_history,
+        metric_history,
+    ) = get_output_from_log(filename=settings["log_file_name"], time_budget=60)
     for config in config_history:
         print(config)
     print(automl.prune_attr)
@@ -53,37 +69,40 @@ def test_automl(budget=5, dataset_format='dataframe', hpo_method=None):
 def test_automl_array():
-    test_automl(5, 'array', 'bs')
+    test_automl(5, "array", "bs")
 def test_mlflow():
     import subprocess
     import sys
     subprocess.check_call([sys.executable, "-m", "pip", "install", "mlflow"])
     import mlflow
     from flaml.data import load_openml_task
     try:
         X_train, X_test, y_train, y_test = load_openml_task(
-            task_id=7592, data_dir='test/')
+            task_id=7592, data_dir="test/"
+        )
     except OpenMLServerException:
         print("OpenMLServerException raised")
         return
-    ''' import AutoML class from flaml package '''
+    """ import AutoML class from flaml package """
     from flaml import AutoML
     automl = AutoML()
     settings = {
         "time_budget": 5, # total running time in seconds
-        "metric": 'accuracy', # primary metrics can be chosen from: ['accuracy','roc_auc','roc_auc_ovr','roc_auc_ovo','f1','log_loss','mae','mse','r2']
-        "estimator_list": ['lgbm', 'rf', 'xgboost'], # list of ML learners
-        "task": 'classification', # task type
+        "metric": "accuracy", # primary metrics can be chosen from: ['accuracy','roc_auc','roc_auc_ovr','roc_auc_ovo','f1','log_loss','mae','mse','r2']
+        "estimator_list": ["lgbm", "rf", "xgboost"], # list of ML learners
+        "task": "classification", # task type
         "sample": False, # whether to subsample training data
-        "log_file_name": 'adult.log', # flaml log file
+        "log_file_name": "adult.log", # flaml log file
     }
     mlflow.set_experiment("flaml")
     with mlflow.start_run():
-        '''The main flaml automl API'''
-        automl.fit(
-            X_train=X_train, y_train=y_train, **settings)
+        """The main flaml automl API"""
+        automl.fit(X_train=X_train, y_train=y_train, **settings)
     # subprocess.check_call([sys.executable, "-m", "pip", "uninstall", "mlflow"])
+    automl._mem_thres = 0
+    print(automl.trainable(automl.points_to_evaluate[0]))


@@ -12,58 +12,63 @@ dataset = "credit-g"
 class XGBoost2D(XGBoostSklearnEstimator):
     @classmethod
     def search_space(cls, data_size, task):
         upper = min(32768, int(data_size))
         return {
-            'n_estimators': {
-                'domain': tune.lograndint(lower=4, upper=upper),
-                'low_cost_init_value': 4,
+            "n_estimators": {
+                "domain": tune.lograndint(lower=4, upper=upper),
+                "low_cost_init_value": 4,
             },
-            'max_leaves': {
-                'domain': tune.lograndint(lower=4, upper=upper),
-                'low_cost_init_value': 4,
+            "max_leaves": {
+                "domain": tune.lograndint(lower=4, upper=upper),
+                "low_cost_init_value": 4,
             },
         }
 def test_simple(method=None):
     automl = AutoML()
-    automl.add_learner(learner_name='XGBoost2D',
-                       learner_class=XGBoost2D)
+    automl.add_learner(learner_name="XGBoost2D", learner_class=XGBoost2D)
     automl_settings = {
-        "estimator_list": ['XGBoost2D'],
-        "task": 'classification',
+        "estimator_list": ["XGBoost2D"],
+        "task": "classification",
         "log_file_name": f"test/xgboost2d_{dataset}_{method}.log",
         "n_jobs": 1,
         "hpo_method": method,
         "log_type": "all",
         "retrain_full": "budget",
         "keep_search_state": True,
-        "time_budget": 1
+        "time_budget": 1,
     }
     from sklearn.externals._arff import ArffException
     try:
         X, y = fetch_openml(name=dataset, return_X_y=True)
     except (ArffException, ValueError):
         from sklearn.datasets import load_wine
         X, y = load_wine(return_X_y=True)
     X_train, X_test, y_train, y_test = train_test_split(
-        X, y, test_size=0.33, random_state=42)
+        X, y, test_size=0.33, random_state=42
+    )
     automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
     print(automl.estimator_list)
     print(automl.search_space)
     print(automl.points_to_evaluate)
     config = automl.best_config.copy()
-    config['learner'] = automl.best_estimator
+    config["learner"] = automl.best_estimator
     automl.trainable(config)
     from flaml import tune
     from flaml.automl import size
     from functools import partial
     analysis = tune.run(
-        automl.trainable, automl.search_space, metric='val_loss', mode="min",
+        automl.trainable,
+        automl.search_space,
+        metric="val_loss",
+        mode="min",
         low_cost_partial_config=automl.low_cost_partial_config,
         points_to_evaluate=automl.points_to_evaluate,
         cat_hp_cost=automl.cat_hp_cost,
@@ -71,8 +76,10 @@ def test_simple(method=None):
         min_resource=automl.min_resource,
         max_resource=automl.max_resource,
         time_budget_s=automl._state.time_budget,
-        config_constraints=[(partial(size, automl._state), '<=', automl._mem_thres)],
-        metric_constraints=automl.metric_constraints, num_samples=5)
+        config_constraints=[(partial(size, automl._state), "<=", automl._mem_thres)],
+        metric_constraints=automl.metric_constraints,
+        num_samples=5,
+    )
     print(analysis.trials[-1])
@@ -80,6 +87,10 @@ def test_optuna():
     test_simple(method="optuna")
+def test_random():
+    test_simple(method="random")
 def test_grid():
     test_simple(method="grid")


@@ -1,4 +1,3 @@
-from flaml.searcher.blendsearch import CFO
 import numpy as np
 try:
@@ -8,8 +7,9 @@ try:
     from ray.tune import sample
 except (ImportError, AssertionError):
     from flaml.tune import sample
 from flaml.searcher.suggestion import OptunaSearch, Searcher, ConcurrencyLimiter
-from flaml.searcher.blendsearch import BlendSearch
+from flaml.searcher.blendsearch import BlendSearch, CFO, RandomSearch
 def define_search_space(trial):
     trial.suggest_float("a", 6, 8)
@@ -135,3 +135,14 @@ except (ImportError, AssertionError):
             },
         }
     )
+    np.random.seed(7654321)
+    searcher = RandomSearch(
+        space=config,
+        points_to_evaluate=[{"a": 7, "b": 1e-3}, {"a": 6, "b": 3e-4}],
+    )
+    print(searcher.suggest("t1"))
+    print(searcher.suggest("t2"))
+    print(searcher.suggest("t3"))
+    print(searcher.suggest("t4"))
+    searcher.on_trial_complete({"t1"}, {})
+    searcher.on_trial_result({"t2"}, {})