Allow FLAML_sample_size in starting_points (#619)

* FLAML_sample_size

* clean up

* starting_points as a list

* catch AssertionError

* per estimator sample size

* import

* per estimator min_sample_size

* Update flaml/automl.py

Co-authored-by: Chi Wang <wang.chi@microsoft.com>

* Update test/automl/test_warmstart.py

Co-authored-by: Chi Wang <wang.chi@microsoft.com>

* add warnings

* adding more tests

* fix a bug in validating starting points

* improve test

* revise test

* revise test

* documentation about custom_hp

* doc and efficiency

* update test

Co-authored-by: Chi Wang <wang.chi@microsoft.com>
Qingyun Wu
2022-07-09 16:04:46 -04:00
committed by GitHub
parent 6cb6a2a19a
commit b7846048dc
4 changed files with 203 additions and 24 deletions
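For orientation, here is the headline feature in one snippet — a minimal sketch distilled from the new warm-start test below (hyperparameter values are illustrative, and X_train/y_train are assumed to be loaded already):

from flaml import AutoML

# A starting config may now carry a per-estimator "FLAML_sample_size"
# alongside ordinary hyperparameters.
starting_points = {
    "xgboost": {
        "n_estimators": 4,
        "max_leaves": 4,
        "FLAML_sample_size": 20000,  # begin this estimator at 20k training rows
    },
    "lrl1": None,  # no starting point for this estimator
}
automl = AutoML()
automl.fit(
    X_train,
    y_train,
    task="classification",
    time_budget=3,
    starting_points=starting_points,
)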

@@ -474,8 +474,11 @@ class TestMultiClass(unittest.TestCase):
         starting_points = {}
         log_file_name = automl_settings["log_file_name"]
         with training_log_reader(log_file_name) as reader:
+            sample_size = 1000
             for record in reader.records():
                 config = record.config
+                config["FLAML_sample_size"] = sample_size
+                sample_size += 1000
                 learner = record.learner
                 if learner not in starting_points:
                     starting_points[learner] = []
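Each estimator in the resulting dict maps to a list of starting points (the "starting_points as a list" item from the commit message), and each entry can carry its own FLAML_sample_size. The loop above yields a shape like this (estimator name and values illustrative):

# Warm-start dict assembled from the training log:
# estimator name -> list of configs, each with its own FLAML_sample_size.
starting_points = {
    "lgbm": [
        {"n_estimators": 4, "num_leaves": 4, "FLAML_sample_size": 1000},
        {"n_estimators": 8, "num_leaves": 8, "FLAML_sample_size": 2000},
    ],
}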

@@ -123,6 +123,102 @@ class TestWarmStart(unittest.TestCase):
         automl.fit(X_train, y_train)
         print(automl.best_config_per_estimator)
 
+    def test_FLAML_sample_size_in_starting_points(self):
+        from flaml.data import load_openml_dataset
+        from flaml import AutoML
+
+        X_train, X_test, y_train, y_test = load_openml_dataset(
+            dataset_id=1169, data_dir="./"
+        )
+        automl_settings = {
+            "time_budget": 3,
+            "task": "classification",
+        }
+        automl1 = AutoML()
+        print(len(y_train))
+        automl1.fit(X_train, y_train, **automl_settings)
+        print("automl1.best_config_per_estimator", automl1.best_config_per_estimator)
+
+        automl_settings["starting_points"] = automl1.best_config_per_estimator
+        automl2 = AutoML()
+        automl2.fit(X_train, y_train, **automl_settings)
+
+        # a starting point may carry FLAML_sample_size alongside hyperparameters
+        automl_settings["starting_points"] = {
+            "xgboost": {
+                "n_estimators": 4,
+                "max_leaves": 4,
+                "min_child_weight": 0.26208115308159446,
+                "learning_rate": 0.25912534572860507,
+                "subsample": 0.9266743941610592,
+                "colsample_bylevel": 1.0,
+                "colsample_bytree": 1.0,
+                "reg_alpha": 0.0013933617380144255,
+                "reg_lambda": 0.18096917948292954,
+                "FLAML_sample_size": 20000,
+            },
+            "xgb_limitdepth": None,
+            "lrl1": None,
+        }
+        from flaml import tune
+
+        automl_settings["custom_hp"] = {
+            "xgboost": {
+                "n_estimators": {
+                    "domain": tune.choice([10, 20]),
+                },
+            }
+        }
+        automl2 = AutoML()
+        automl2.fit(X_train, y_train, **automl_settings)
+
+        try:
+            import ray
+
+            automl_settings["n_concurrent_trials"] = 2
+        except ImportError:
+            automl_settings["n_concurrent_trials"] = 1
+        # set a different FLAML_sample_size for each estimator
+        automl_settings["starting_points"] = {
+            "catboost": {
+                "early_stopping_rounds": 10,
+                "learning_rate": 0.09999999999999996,
+                "n_estimators": 1,
+                "FLAML_sample_size": 10000,
+            },
+            "xgboost": {
+                "n_estimators": 4,
+                "max_leaves": 4,
+                "min_child_weight": 0.26208115308159446,
+                "learning_rate": 0.25912534572860507,
+                "subsample": 0.9266743941610592,
+                "colsample_bylevel": 1.0,
+                "colsample_bytree": 1.0,
+                "reg_alpha": 0.0013933617380144255,
+                "reg_lambda": 0.18096917948292954,
+                "FLAML_sample_size": 20000,
+            },
+            "xgb_limitdepth": None,
+            "lrl1": None,
+        }
+        automl3 = AutoML()
+        automl3.fit(X_train, y_train, **automl_settings)
+
+        # FLAML_sample_size in a starting point requires sampling to be enabled
+        automl_settings["sample"] = False
+        automl4 = AutoML()
+        try:
+            automl4.fit(
+                X_train,
+                y_train,
+                **automl_settings,
+            )
+            raise RuntimeError(
+                "When sample=False and a starting point contains FLAML_sample_size, "
+                "an AssertionError is expected but was not raised."
+            )
+        except AssertionError:
+            pass
 
 if __name__ == "__main__":
     unittest.main()
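For reference, the custom_hp argument exercised above overrides the search space per estimator. A minimal sketch; the init_value key follows FLAML's usual search-space convention and is an assumption here, not shown in this diff:

from flaml import AutoML, tune

# Override xgboost's n_estimators search space; "init_value" (assumed from
# FLAML's search-space convention) supplies the starting value for search.
custom_hp = {
    "xgboost": {
        "n_estimators": {
            "domain": tune.choice([10, 20]),
            "init_value": 10,
        },
    }
}
automl = AutoML()
automl.fit(X_train, y_train, task="classification", time_budget=3, custom_hp=custom_hp)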

@@ -57,15 +57,25 @@ def test_starting_point_not_in_search_space():
"learning_rate": {
"domain": tune.choice([1e-4, 1e-5]),
},
"per_device_train_batch_size": {
"domain": 2,
},
}
}
automl_settings["starting_points"] = "data:test/nlp/default/"
del automl_settings["fit_kwargs_by_estimator"][this_estimator_name]["model_path"]
automl.fit(X_train, y_train, **automl_settings)
assert (
len(automl._search_states[this_estimator_name].init_config) == 0
) # check that init config is not updated, but search space is updated
assert len(automl._search_states[this_estimator_name].init_config) == len(
automl._search_states[this_estimator_name]._search_space_domain
) - len(automl_settings["custom_hp"][this_estimator_name]), (
"The search space is updated with the custom_hp on {} hyperparameters of "
"the specified estimator without an initial value. Thus a valid init config "
"should only contain the cardinality of the search space minus {}".format(
len(automl_settings["custom_hp"][this_estimator_name]),
len(automl_settings["custom_hp"][this_estimator_name]),
)
)
assert (
automl._search_states[this_estimator_name].search_space["model_path"]
== "albert-base-v2"