Allow FLAML_sample_size in starting_points (#619)

* FLAML_sample_size

* clean up

* starting_points as a list

* catch AssertionError

* per estimator sample size

* import

* per estimator min_sample_size

* Update flaml/automl.py

Co-authored-by: Chi Wang <wang.chi@microsoft.com>

* Update test/automl/test_warmstart.py

Co-authored-by: Chi Wang <wang.chi@microsoft.com>

* add warnings

* adding more tests

* fix a bug in validating starting points

* improve test

* revise test

* revise test

* documentation about custom_hp

* doc and efficiency

* update test

Co-authored-by: Chi Wang <wang.chi@microsoft.com>
Qingyun Wu
2022-07-09 16:04:46 -04:00
committed by GitHub
parent 6cb6a2a19a
commit b7846048dc
4 changed files with 203 additions and 24 deletions
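For orientation, here is the headline feature in one snippet — a minimal sketch distilled from the new warm-start test below (hyperparameter values are illustrative, and X_train/y_train are assumed to be loaded already):

from flaml import AutoML

# A starting config may now carry a per-estimator "FLAML_sample_size"
# alongside ordinary hyperparameters.
starting_points = {
    "xgboost": {
        "n_estimators": 4,
        "max_leaves": 4,
        "FLAML_sample_size": 20000,  # begin this estimator at 20k training rows
    },
    "lrl1": None,  # no starting point for this estimator
}
automl = AutoML()
automl.fit(
    X_train,
    y_train,
    task="classification",
    time_budget=3,
    starting_points=starting_points,
)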

@@ -474,8 +474,11 @@ class TestMultiClass(unittest.TestCase):
         starting_points = {}
         log_file_name = automl_settings["log_file_name"]
         with training_log_reader(log_file_name) as reader:
+            sample_size = 1000
             for record in reader.records():
                 config = record.config
+                config["FLAML_sample_size"] = sample_size
+                sample_size += 1000
                 learner = record.learner
                 if learner not in starting_points:
                     starting_points[learner] = []
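Each estimator in the resulting dict maps to a list of starting points (the "starting_points as a list" item from the commit message), and each entry can carry its own FLAML_sample_size. The loop above yields a shape like this (estimator name and values illustrative):

# Warm-start dict assembled from the training log:
# estimator name -> list of configs, each with its own FLAML_sample_size.
starting_points = {
    "lgbm": [
        {"n_estimators": 4, "num_leaves": 4, "FLAML_sample_size": 1000},
        {"n_estimators": 8, "num_leaves": 8, "FLAML_sample_size": 2000},
    ],
}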

@@ -123,6 +123,102 @@ class TestWarmStart(unittest.TestCase):
         automl.fit(X_train, y_train)
         print(automl.best_config_per_estimator)
 
+    def test_FLAML_sample_size_in_starting_points(self):
+        from flaml.data import load_openml_dataset
+        from flaml import AutoML
+
+        X_train, X_test, y_train, y_test = load_openml_dataset(
+            dataset_id=1169, data_dir="./"
+        )
+        automl_settings = {
+            "time_budget": 3,
+            "task": "classification",
+        }
+        automl1 = AutoML()
+        print(len(y_train))
+        automl1.fit(X_train, y_train, **automl_settings)
+        print("automl1.best_config_per_estimator", automl1.best_config_per_estimator)
+
+        automl_settings["starting_points"] = automl1.best_config_per_estimator
+        automl2 = AutoML()
+        automl2.fit(X_train, y_train, **automl_settings)
+
+        # a starting point may carry FLAML_sample_size alongside hyperparameters
+        automl_settings["starting_points"] = {
+            "xgboost": {
+                "n_estimators": 4,
+                "max_leaves": 4,
+                "min_child_weight": 0.26208115308159446,
+                "learning_rate": 0.25912534572860507,
+                "subsample": 0.9266743941610592,
+                "colsample_bylevel": 1.0,
+                "colsample_bytree": 1.0,
+                "reg_alpha": 0.0013933617380144255,
+                "reg_lambda": 0.18096917948292954,
+                "FLAML_sample_size": 20000,
+            },
+            "xgb_limitdepth": None,
+            "lrl1": None,
+        }
+        from flaml import tune
+
+        automl_settings["custom_hp"] = {
+            "xgboost": {
+                "n_estimators": {
+                    "domain": tune.choice([10, 20]),
+                },
+            }
+        }
+        automl2 = AutoML()
+        automl2.fit(X_train, y_train, **automl_settings)
+
+        try:
+            import ray
+
+            automl_settings["n_concurrent_trials"] = 2
+        except ImportError:
+            automl_settings["n_concurrent_trials"] = 1
+        # set a different FLAML_sample_size for each estimator
+        automl_settings["starting_points"] = {
+            "catboost": {
+                "early_stopping_rounds": 10,
+                "learning_rate": 0.09999999999999996,
+                "n_estimators": 1,
+                "FLAML_sample_size": 10000,
+            },
+            "xgboost": {
+                "n_estimators": 4,
+                "max_leaves": 4,
+                "min_child_weight": 0.26208115308159446,
+                "learning_rate": 0.25912534572860507,
+                "subsample": 0.9266743941610592,
+                "colsample_bylevel": 1.0,
+                "colsample_bytree": 1.0,
+                "reg_alpha": 0.0013933617380144255,
+                "reg_lambda": 0.18096917948292954,
+                "FLAML_sample_size": 20000,
+            },
+            "xgb_limitdepth": None,
+            "lrl1": None,
+        }
+        automl3 = AutoML()
+        automl3.fit(X_train, y_train, **automl_settings)
+
+        # FLAML_sample_size in a starting point requires sampling to be enabled
+        automl_settings["sample"] = False
+        automl4 = AutoML()
+        try:
+            automl4.fit(
+                X_train,
+                y_train,
+                **automl_settings,
+            )
+            raise RuntimeError(
+                "When sample=False and a starting point contains FLAML_sample_size, "
+                "an AssertionError is expected but was not raised."
+            )
+        except AssertionError:
+            pass
 
 if __name__ == "__main__":
     unittest.main()
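For reference, the custom_hp argument exercised above overrides the search space per estimator. A minimal sketch; the init_value key follows FLAML's usual search-space convention and is an assumption here, not shown in this diff:

from flaml import AutoML, tune

# Override xgboost's n_estimators search space; "init_value" (assumed from
# FLAML's search-space convention) supplies the starting value for search.
custom_hp = {
    "xgboost": {
        "n_estimators": {
            "domain": tune.choice([10, 20]),
            "init_value": 10,
        },
    }
}
automl = AutoML()
automl.fit(X_train, y_train, task="classification", time_budget=3, custom_hp=custom_hp)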

@@ -57,15 +57,25 @@ def test_starting_point_not_in_search_space():
"learning_rate": {
"domain": tune.choice([1e-4, 1e-5]),
},
"per_device_train_batch_size": {
"domain": 2,
},
}
}
automl_settings["starting_points"] = "data:test/nlp/default/"
del automl_settings["fit_kwargs_by_estimator"][this_estimator_name]["model_path"]
automl.fit(X_train, y_train, **automl_settings)
assert (
len(automl._search_states[this_estimator_name].init_config) == 0
) # check that init config is not updated, but search space is updated
assert len(automl._search_states[this_estimator_name].init_config) == len(
automl._search_states[this_estimator_name]._search_space_domain
) - len(automl_settings["custom_hp"][this_estimator_name]), (
"The search space is updated with the custom_hp on {} hyperparameters of "
"the specified estimator without an initial value. Thus a valid init config "
"should only contain the cardinality of the search space minus {}".format(
len(automl_settings["custom_hp"][this_estimator_name]),
len(automl_settings["custom_hp"][this_estimator_name]),
)
)
assert (
automl._search_states[this_estimator_name].search_space["model_path"]
== "albert-base-v2"