automl fit with starting points (#141)

* add starting point in fit

* add estimator best config

* add test

* add doc string

* when there are multiple points_to_evaluate in CFO, use the best one to start local search; after that use low cost partial config as the start point; then, remove the points whose performance is worse than the converged, and start local search from the remaining ones ordered by their performance.

Co-authored-by: Qingyun Wu <qingyunwu@Qingyuns-MacBook-Pro-2.local>
Co-authored-by: Chi Wang <wang.chi@microsoft.com>
This commit is contained in:
Qingyun Wu
2021-07-31 16:39:31 -04:00
committed by GitHub
parent 15fd8adac4
commit e24265ee5d
7 changed files with 230 additions and 48 deletions

View File

@@ -154,10 +154,10 @@ class TestAutoML(unittest.TestCase):
def test_preprocess(self):
automl = AutoML()
X = pd.DataFrame({
'f1': [1, -2, 3, -4, 5, -6, -7, 8, -9, -10, -11, -12, -13, -14],
'f2': [3., 16., 10., 12., 3., 14., 11., 12., 5., 14., 20., 16., 15., 11.,],
'f3': ['a', 'b', 'a', 'c', 'c', 'b', 'b', 'b', 'b', 'a', 'b', 'e', 'e', 'a'],
'f4': [True, True, False, True, True, False, False, False, True, True, False, False, True, True],
'f1': [1, -2, 3, -4, 5, -6, -7, 8, -9, -10, -11, -12, -13, -14],
'f2': [3., 16., 10., 12., 3., 14., 11., 12., 5., 14., 20., 16., 15., 11.],
'f3': ['a', 'b', 'a', 'c', 'c', 'b', 'b', 'b', 'b', 'a', 'b', 'e', 'e', 'a'],
'f4': [True, True, False, True, True, False, False, False, True, True, False, False, True, True],
})
y = pd.Series([0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1])
@@ -476,6 +476,53 @@ class TestAutoML(unittest.TestCase):
print(automl_experiment.best_loss)
print(automl_experiment.best_config_train_time)
def test_fit_w_starting_point(self, as_frame=True):
automl_experiment = AutoML()
automl_settings = {
"time_budget": 3,
"metric": 'accuracy',
"task": 'classification',
"log_file_name": "test/iris.log",
"log_training_metric": True,
"n_jobs": 1,
"model_history": True,
}
X_train, y_train = load_iris(return_X_y=True, as_frame=as_frame)
if as_frame:
# test drop column
X_train.columns = range(X_train.shape[1])
X_train[X_train.shape[1]] = np.zeros(len(y_train))
automl_experiment.fit(X_train=X_train, y_train=y_train,
**automl_settings)
automl_val_accuracy = 1.0 - automl_experiment.best_loss
print('Best ML leaner:', automl_experiment.best_estimator)
print('Best hyperparmeter config:', automl_experiment.best_config)
print('Best accuracy on validation data: {0:.4g}'.format(automl_val_accuracy))
print('Training duration of best run: {0:.4g} s'.format(automl_experiment.best_config_train_time))
starting_points = automl_experiment.best_config_per_estimator
print('starting_points', starting_points)
automl_settings_resume = {
"time_budget": 2,
"metric": 'accuracy',
"task": 'classification',
"log_file_name": "test/iris_resume.log",
"log_training_metric": True,
"n_jobs": 1,
"model_history": True,
"log_type": 'all',
"starting_points": starting_points,
}
new_automl_experiment = AutoML()
new_automl_experiment.fit(X_train=X_train, y_train=y_train,
**automl_settings_resume)
new_automl_val_accuracy = 1.0 - new_automl_experiment.best_loss
print('Best ML leaner:', new_automl_experiment.best_estimator)
print('Best hyperparmeter config:', new_automl_experiment.best_config)
print('Best accuracy on validation data: {0:.4g}'.format(new_automl_val_accuracy))
print('Training duration of best run: {0:.4g} s'.format(new_automl_experiment.best_config_train_time))
if __name__ == "__main__":
unittest.main()

View File

@@ -163,7 +163,7 @@ def _test_xgboost(method='BlendSearch'):
def test_nested():
from flaml import tune
from flaml import tune, CFO
search_space = {
# test nested search space
"cost_related": {
@@ -178,6 +178,27 @@ def test_nested():
tune.report(obj=obj)
tune.report(obj=obj, ab=config["cost_related"]["a"] * config["b"])
analysis = tune.run(
simple_func,
search_alg=CFO(
space=search_space, metric="obj", mode="min",
low_cost_partial_config={
"cost_related": {"a": 1}
},
points_to_evaluate=[
{"b": .99, "cost_related": {"a": 3}},
{"b": .99, "cost_related": {"a": 2}},
{"cost_related": {"a": 8}}
],
metric_constraints=[("ab", "<=", 4)]),
local_dir='logs/',
num_samples=-1,
time_budget_s=.1)
best_trial = analysis.get_best_trial()
logger.info(f"CFO best config: {best_trial.config}")
logger.info(f"CFO best result: {best_trial.last_result}")
analysis = tune.run(
simple_func,
config=search_space,
@@ -189,11 +210,11 @@ def test_nested():
metric_constraints=[("ab", "<=", 4)],
local_dir='logs/',
num_samples=-1,
time_budget_s=1)
time_budget_s=.1)
best_trial = analysis.get_best_trial()
logger.info(f"Best config: {best_trial.config}")
logger.info(f"Best result: {best_trial.last_result}")
logger.info(f"BlendSearch best config: {best_trial.config}")
logger.info(f"BlendSearch best result: {best_trial.last_result}")
def test_xgboost_bs():