warning -> info for low cost partial config (#231)
* warning -> info for low cost partial config #195, #110
* when n_estimators < 0, use trained_estimator's
* log debug info
* test random seed
* remove "objective"; avoid ZeroDivisionError
* hp config to estimator params
* check type of searcher
* default n_jobs
* try import
* Update searchalgo_auto.py
* CLASSIFICATION
* auto_augment flag
* min_sample_size
* make catboost optional
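The title refers to FLAML's low-cost partial config: a set of cheap initial hyperparameter values the tuner starts from, which this change logs at info rather than warning level. A minimal, illustrative sketch of the feature (not taken from this commit; the objective function and the "score"/"cost" keys below are made up):

from flaml import tune

def evaluate(config):
    # toy objective: pretend larger n_estimators is more expensive to evaluate
    return {"score": (config["x"] - 8) ** 2, "cost": config["n_estimators"]}

analysis = tune.run(
    evaluate,
    config={
        "n_estimators": tune.lograndint(lower=4, upper=1000),
        "x": tune.randint(lower=1, upper=32),
    },
    # cheap starting point; this is the low cost partial config being logged
    low_cost_partial_config={"n_estimators": 4},
    metric="score",
    mode="min",
    num_samples=20,
)
print(analysis.best_config)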
@@ -2,7 +2,12 @@ import unittest

 import numpy as np
 import scipy.sparse
-from sklearn.datasets import load_boston, load_iris, load_wine, load_breast_cancer
+from sklearn.datasets import (
+    fetch_california_housing,
+    load_iris,
+    load_wine,
+    load_breast_cancer,
+)

 import pandas as pd
 from datetime import datetime
@@ -17,59 +22,37 @@ from flaml.training_log import training_log_reader


 class MyRegularizedGreedyForest(SKLearnEstimator):
-    def __init__(
-        self,
-        task="binary",
-        n_jobs=1,
-        max_leaf=4,
-        n_iter=1,
-        n_tree_search=1,
-        opt_interval=1,
-        learning_rate=1.0,
-        min_samples_leaf=1,
-        **params
-    ):
+    def __init__(self, task="binary", **config):

-        super().__init__(task, **params)
+        super().__init__(task, **config)

-        if "regression" in task:
-            self.estimator_class = RGFRegressor
-        else:
+        if task in ("binary", "multi"):
             self.estimator_class = RGFClassifier

-        # round integer hyperparameters
-        self.params = {
-            "n_jobs": n_jobs,
-            "max_leaf": int(round(max_leaf)),
-            "n_iter": int(round(n_iter)),
-            "n_tree_search": int(round(n_tree_search)),
-            "opt_interval": int(round(opt_interval)),
-            "learning_rate": learning_rate,
-            "min_samples_leaf": int(round(min_samples_leaf)),
-        }
+        else:
+            self.estimator_class = RGFRegressor

     @classmethod
     def search_space(cls, data_size, task):
         space = {
             "max_leaf": {
-                "domain": tune.qloguniform(lower=4, upper=data_size, q=1),
+                "domain": tune.lograndint(lower=4, upper=data_size),
                 "init_value": 4,
             },
             "n_iter": {
-                "domain": tune.qloguniform(lower=1, upper=data_size, q=1),
+                "domain": tune.lograndint(lower=1, upper=data_size),
                 "init_value": 1,
             },
             "n_tree_search": {
-                "domain": tune.qloguniform(lower=1, upper=32768, q=1),
+                "domain": tune.lograndint(lower=1, upper=32768),
                 "init_value": 1,
             },
             "opt_interval": {
-                "domain": tune.qloguniform(lower=1, upper=10000, q=1),
+                "domain": tune.lograndint(lower=1, upper=10000),
                 "init_value": 100,
             },
             "learning_rate": {"domain": tune.loguniform(lower=0.01, upper=20.0)},
             "min_samples_leaf": {
-                "domain": tune.qloguniform(lower=1, upper=20, q=1),
+                "domain": tune.lograndint(lower=1, upper=20),
                 "init_value": 20,
             },
         }
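For orientation, a sketch of how a custom estimator like the one above is typically registered with FLAML (not part of the diff; the learner name "rgf", the dataset, and the three-second budget are arbitrary):

from sklearn.datasets import load_iris
from flaml import AutoML

automl = AutoML()
# make the custom learner available under the name "rgf"
automl.add_learner(learner_name="rgf", learner_class=MyRegularizedGreedyForest)
X, y = load_iris(return_X_y=True)
automl.fit(
    X_train=X,
    y_train=y,
    task="classification",
    estimator_list=["rgf"],  # restrict the search to the custom learner
    time_budget=3,
    n_jobs=1,
)
print(automl.best_config)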
@@ -97,15 +80,15 @@ def logregobj(preds, dtrain):
 class MyXGB1(XGBoostEstimator):
     """XGBoostEstimator with logregobj as the objective function"""

-    def __init__(self, **params):
-        super().__init__(objective=logregobj, **params)
+    def __init__(self, **config):
+        super().__init__(objective=logregobj, **config)


 class MyXGB2(XGBoostEstimator):
     """XGBoostEstimator with 'reg:squarederror' as the objective function"""

-    def __init__(self, **params):
-        super().__init__(objective="reg:squarederror", **params)
+    def __init__(self, **config):
+        super().__init__(objective="reg:squarederror", **config)


 class MyLargeLGBM(LGBMEstimator):
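The hunk header references logregobj, the custom objective passed to MyXGB1. For context, this is the standard logistic-regression objective from XGBoost examples (a sketch following that convention, not copied from the file): it returns the gradient and hessian of the log loss with respect to the raw margin predictions.

import numpy as np

def logregobj(preds, dtrain):
    labels = dtrain.get_label()
    preds = 1.0 / (1.0 + np.exp(-preds))  # sigmoid of the raw margins
    grad = preds - labels                 # first-order derivative of the log loss
    hess = preds * (1.0 - preds)          # second-order derivative
    return grad, hess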
@@ -266,7 +249,7 @@ class TestAutoML(unittest.TestCase):
             "n_splits": 3,
             "metric": "accuracy",
             "log_training_metric": True,
-            "verbose": 1,
+            "verbose": 4,
             "ensemble": True,
         }
         automl.fit(X, y, **automl_settings)
@@ -281,7 +264,7 @@ class TestAutoML(unittest.TestCase):
             "n_splits": 3,
             "metric": "accuracy",
             "log_training_metric": True,
-            "verbose": 1,
+            "verbose": 4,
             "ensemble": True,
         }
         automl.fit(X, y, **automl_settings)
@@ -296,7 +279,7 @@ class TestAutoML(unittest.TestCase):
             "n_splits": 3,
             "metric": "accuracy",
             "log_training_metric": True,
-            "verbose": 1,
+            "verbose": 4,
             "ensemble": True,
         }
         automl.fit(X, y, **automl_settings)
@@ -311,7 +294,7 @@ class TestAutoML(unittest.TestCase):
             "n_splits": 3,
             "metric": "accuracy",
             "log_training_metric": True,
-            "verbose": 1,
+            "verbose": 4,
             "ensemble": True,
         }
         automl.fit(X, y, **automl_settings)
@@ -525,7 +508,7 @@ class TestAutoML(unittest.TestCase):
             "n_jobs": 1,
             "model_history": True,
         }
-        X_train, y_train = load_boston(return_X_y=True)
+        X_train, y_train = fetch_california_housing(return_X_y=True)
         n = int(len(y_train) * 9 // 10)
         automl_experiment.fit(
             X_train=X_train[:n],
@@ -648,7 +631,7 @@ class TestAutoML(unittest.TestCase):
             "n_concurrent_trials": 2,
             "hpo_method": hpo_method,
         }
-        X_train, y_train = load_boston(return_X_y=True)
+        X_train, y_train = fetch_california_housing(return_X_y=True)
         try:
             automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
             print(automl_experiment.predict(X_train))
@@ -861,8 +844,8 @@ class TestAutoML(unittest.TestCase):
         automl_experiment = AutoML()
         automl_settings = {
             "time_budget": 3,
-            "metric": 'accuracy',
-            "task": 'classification',
+            "metric": "accuracy",
+            "task": "classification",
             "log_file_name": "test/iris.log",
             "log_training_metric": True,
             "n_jobs": 1,
@@ -873,16 +856,19 @@ class TestAutoML(unittest.TestCase):
         # test drop column
         X_train.columns = range(X_train.shape[1])
         X_train[X_train.shape[1]] = np.zeros(len(y_train))
-        automl_experiment.fit(X_train=X_train, y_train=y_train,
-                              **automl_settings)
+        automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
         automl_val_accuracy = 1.0 - automl_experiment.best_loss
-        print('Best ML leaner:', automl_experiment.best_estimator)
-        print('Best hyperparmeter config:', automl_experiment.best_config)
-        print('Best accuracy on validation data: {0:.4g}'.format(automl_val_accuracy))
-        print('Training duration of best run: {0:.4g} s'.format(automl_experiment.best_config_train_time))
+        print("Best ML leaner:", automl_experiment.best_estimator)
+        print("Best hyperparmeter config:", automl_experiment.best_config)
+        print("Best accuracy on validation data: {0:.4g}".format(automl_val_accuracy))
+        print(
+            "Training duration of best run: {0:.4g} s".format(
+                automl_experiment.best_config_train_time
+            )
+        )

         starting_points = {}
-        log_file_name = automl_settings['log_file_name']
+        log_file_name = automl_settings["log_file_name"]
         with training_log_reader(log_file_name) as reader:
             for record in reader.records():
                 config = record.config
@@ -893,25 +879,28 @@ class TestAutoML(unittest.TestCase):
         max_iter = sum([len(s) for k, s in starting_points.items()])
         automl_settings_resume = {
             "time_budget": 2,
-            "metric": 'accuracy',
-            "task": 'classification',
+            "metric": "accuracy",
+            "task": "classification",
             "log_file_name": "test/iris_resume_all.log",
             "log_training_metric": True,
             "n_jobs": 1,
             "max_iter": max_iter,
             "model_history": True,
-            "log_type": 'all',
+            "log_type": "all",
             "starting_points": starting_points,
             "append_log": True,
         }
         new_automl_experiment = AutoML()
-        new_automl_experiment.fit(X_train=X_train, y_train=y_train,
-                                  **automl_settings_resume)
+        new_automl_experiment.fit(
+            X_train=X_train, y_train=y_train, **automl_settings_resume
+        )

         new_automl_val_accuracy = 1.0 - new_automl_experiment.best_loss
         # print('Best ML leaner:', new_automl_experiment.best_estimator)
         # print('Best hyperparmeter config:', new_automl_experiment.best_config)
-        print('Best accuracy on validation data: {0:.4g}'.format(new_automl_val_accuracy))
+        print(
+            "Best accuracy on validation data: {0:.4g}".format(new_automl_val_accuracy)
+        )
         # print('Training duration of best run: {0:.4g} s'.format(new_automl_experiment.best_config_train_time))
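The resume test above rebuilds starting_points from a previous run's log. A condensed sketch of that idea (assuming, as in flaml.training_log, that each record carries a learner name and a config; the log path is the one used in the test):

from flaml.training_log import training_log_reader

starting_points = {}
with training_log_reader("test/iris.log") as reader:
    for record in reader.records():
        # group every logged config under the learner that produced it
        starting_points.setdefault(record.learner, []).append(record.config)
max_iter = sum(len(configs) for configs in starting_points.values())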
@@ -1,6 +1,6 @@
 from flaml.tune.space import unflatten_hierarchical
 from flaml import AutoML
-from sklearn.datasets import load_boston
+from sklearn.datasets import fetch_california_housing
 import os
 import unittest
 import logging
@@ -9,7 +9,6 @@ import io


 class TestLogging(unittest.TestCase):
-
     def test_logging_level(self):

         from flaml import logger, logger_formatter
@@ -30,8 +29,8 @@ class TestLogging(unittest.TestCase):
             automl = AutoML()
             automl_settings = {
                 "time_budget": 1,
-                "metric": 'rmse',
-                "task": 'regression',
+                "metric": "rmse",
+                "task": "regression",
                 "log_file_name": training_log,
                 "log_training_metric": True,
                 "n_jobs": 1,
@@ -39,35 +38,42 @@ class TestLogging(unittest.TestCase):
                 "keep_search_state": True,
                 "learner_selector": "roundrobin",
             }
-            X_train, y_train = load_boston(return_X_y=True)
+            X_train, y_train = fetch_california_housing(return_X_y=True)
             n = len(y_train) >> 1
             print(automl.model, automl.classes_, automl.predict(X_train))
-            automl.fit(X_train=X_train[:n], y_train=y_train[:n],
-                       X_val=X_train[n:], y_val=y_train[n:],
-                       **automl_settings)
+            automl.fit(
+                X_train=X_train[:n],
+                y_train=y_train[:n],
+                X_val=X_train[n:],
+                y_val=y_train[n:],
+                **automl_settings
+            )
             logger.info(automl.search_space)
             logger.info(automl.low_cost_partial_config)
             logger.info(automl.points_to_evaluate)
             logger.info(automl.cat_hp_cost)
             import optuna as ot

             study = ot.create_study()
             from flaml.tune.space import define_by_run_func, add_cost_to_space

             sample = define_by_run_func(study.ask(), automl.search_space)
             logger.info(sample)
             logger.info(unflatten_hierarchical(sample, automl.search_space))
             add_cost_to_space(
-                automl.search_space, automl.low_cost_partial_config,
-                automl.cat_hp_cost
+                automl.search_space, automl.low_cost_partial_config, automl.cat_hp_cost
             )
             logger.info(automl.search_space["ml"].categories)
             config = automl.best_config.copy()
-            config['learner'] = automl.best_estimator
+            config["learner"] = automl.best_estimator
             automl.trainable({"ml": config})
             from flaml import tune, BlendSearch
             from flaml.automl import size
             from functools import partial

             search_alg = BlendSearch(
-                metric='val_loss', mode='min',
+                metric="val_loss",
+                mode="min",
                 space=automl.search_space,
                 low_cost_partial_config=automl.low_cost_partial_config,
                 points_to_evaluate=automl.points_to_evaluate,
@@ -75,19 +81,25 @@ class TestLogging(unittest.TestCase):
                 prune_attr=automl.prune_attr,
                 min_resource=automl.min_resource,
                 max_resource=automl.max_resource,
-                config_constraints=[(partial(size, automl._state), '<=', automl._mem_thres)],
-                metric_constraints=automl.metric_constraints)
+                config_constraints=[
+                    (partial(size, automl._state), "<=", automl._mem_thres)
+                ],
+                metric_constraints=automl.metric_constraints,
+            )
             analysis = tune.run(
-                automl.trainable, search_alg=search_alg, # verbose=2,
-                time_budget_s=1, num_samples=-1)
-            print(min(trial.last_result["val_loss"]
-                      for trial in analysis.trials))
-            config = analysis.trials[-1].last_result['config']['ml']
-            automl._state._train_with_config(config['learner'], config)
+                automl.trainable,
+                search_alg=search_alg, # verbose=2,
+                time_budget_s=1,
+                num_samples=-1,
+            )
+            print(min(trial.last_result["val_loss"] for trial in analysis.trials))
+            config = analysis.trials[-1].last_result["config"]["ml"]
+            automl._state._train_with_config(config["learner"], config)
             # Check if the log buffer is populated.
             self.assertTrue(len(buf.getvalue()) > 0)

             import pickle

-            with open('automl.pkl', 'wb') as f:
+            with open("automl.pkl", "wb") as f:
                 pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)
             print(automl.__version__)
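A brief follow-up to the pickling step above (not in the diff): the saved AutoML object can be reloaded and used for prediction; X_train here stands for whatever feature matrix is at hand.

import pickle

with open("automl.pkl", "rb") as f:
    restored_automl = pickle.load(f)
print(restored_automl.predict(X_train[:5]))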
@@ -2,15 +2,14 @@ import os
 import unittest
 from tempfile import TemporaryDirectory

-from sklearn.datasets import load_boston
+from sklearn.datasets import fetch_california_housing

 from flaml import AutoML
 from flaml.training_log import training_log_reader


 class TestTrainingLog(unittest.TestCase):
-
-    def test_training_log(self, path='test_training_log.log'):
+    def test_training_log(self, path="test_training_log.log"):

         with TemporaryDirectory() as d:
             filename = os.path.join(d, path)
@@ -19,8 +18,8 @@ class TestTrainingLog(unittest.TestCase):
             automl = AutoML()
             automl_settings = {
                 "time_budget": 1,
-                "metric": 'mse',
-                "task": 'regression',
+                "metric": "mse",
+                "task": "regression",
                 "log_file_name": filename,
                 "log_training_metric": True,
                 "mem_thres": 1024 * 1024,
@@ -31,10 +30,9 @@ class TestTrainingLog(unittest.TestCase):
                 "ensemble": True,
                 "keep_search_state": True,
             }
-            X_train, y_train = load_boston(return_X_y=True)
+            X_train, y_train = fetch_california_housing(return_X_y=True)
             automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
-            automl._state._train_with_config(
-                automl.best_estimator, automl.best_config)
+            automl._state._train_with_config(automl.best_estimator, automl.best_config)

             # Check if the training log file is populated.
             self.assertTrue(os.path.exists(filename))
@@ -49,11 +47,11 @@ class TestTrainingLog(unittest.TestCase):
             automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
             automl._selected.update(None, 0)
             automl = AutoML()
-            automl.fit(X_train=X_train, y_train=y_train, max_iter=0)
+            automl.fit(X_train=X_train, y_train=y_train, max_iter=0, task="regression")

     def test_illfilename(self):
         try:
-            self.test_training_log('/')
+            self.test_training_log("/")
         except IsADirectoryError:
             print("IsADirectoryError happens as expected in linux.")
         except PermissionError:
@@ -72,8 +72,9 @@ except (ImportError, AssertionError):
     searcher = BlendSearch(
         metric="m", global_search_alg=searcher, metric_constraints=[("c", "<", 1)]
     )
-    searcher.set_search_properties(metric="m2", config=config)
-    searcher.set_search_properties(config={"time_budget_s": 0})
+    searcher.set_search_properties(
+        metric="m2", config=config, setting={"time_budget_s": 0}
+    )
     c = searcher.suggest("t1")
     searcher.on_trial_complete("t1", {"config": c}, True)
     c = searcher.suggest("t2")
@@ -146,3 +147,11 @@ except (ImportError, AssertionError):
     print(searcher.suggest("t4"))
     searcher.on_trial_complete({"t1"}, {})
     searcher.on_trial_result({"t2"}, {})
+    np.random.seed(654321)
+    searcher = RandomSearch(
+        space=config,
+        points_to_evaluate=[{"a": 7, "b": 1e-3}, {"a": 6, "b": 3e-4}],
+    )
+    print(searcher.suggest("t1"))
+    print(searcher.suggest("t2"))
+    print(searcher.suggest("t3"))