automl fit with starting points (#141)

* add starting point in fit
* add estimator best config
* add test
* add docstring
* when there are multiple points_to_evaluate in CFO, use the best one to start local search; after that, use the low-cost partial config as the starting point; then remove the points whose performance is worse than that of the converged result, and start local search from the remaining ones, ordered by their performance.

Co-authored-by: Qingyun Wu <qingyunwu@Qingyuns-MacBook-Pro-2.local>
Co-authored-by: Chi Wang <wang.chi@microsoft.com>
2026-02-04 02:24:56 -05:00 · 2021-07-31 16:39:31 -04:00
parent 15fd8adac4
commit e24265ee5d
7 changed files with 230 additions and 48 deletions
--- a/test/test_automl.py
+++ b/test/test_automl.py
@@ -154,10 +154,10 @@ class TestAutoML(unittest.TestCase):
    def test_preprocess(self):
        automl = AutoML()
        X = pd.DataFrame({
-        'f1': [1, -2, 3, -4, 5, -6, -7, 8, -9, -10, -11, -12, -13, -14],
-        'f2': [3., 16., 10., 12., 3., 14., 11., 12., 5., 14., 20., 16., 15., 11.,],
-        'f3': ['a', 'b', 'a', 'c', 'c', 'b', 'b', 'b', 'b', 'a', 'b', 'e', 'e', 'a'],
-        'f4': [True, True, False, True, True, False, False, False, True, True, False, False, True, True],
+            'f1': [1, -2, 3, -4, 5, -6, -7, 8, -9, -10, -11, -12, -13, -14],
+            'f2': [3., 16., 10., 12., 3., 14., 11., 12., 5., 14., 20., 16., 15., 11.],
+            'f3': ['a', 'b', 'a', 'c', 'c', 'b', 'b', 'b', 'b', 'a', 'b', 'e', 'e', 'a'],
+            'f4': [True, True, False, True, True, False, False, False, True, True, False, False, True, True],
        })
        y = pd.Series([0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1])

@@ -476,6 +476,53 @@ class TestAutoML(unittest.TestCase):
        print(automl_experiment.best_loss)
        print(automl_experiment.best_config_train_time)

+    def test_fit_w_starting_point(self, as_frame=True):
+        automl_experiment = AutoML()
+        automl_settings = {
+            "time_budget": 3,
+            "metric": 'accuracy',
+            "task": 'classification',
+            "log_file_name": "test/iris.log",
+            "log_training_metric": True,
+            "n_jobs": 1,
+            "model_history": True,
+        }
+        X_train, y_train = load_iris(return_X_y=True, as_frame=as_frame)
+        if as_frame:
+            # test drop column
+            X_train.columns = range(X_train.shape[1])
+            X_train[X_train.shape[1]] = np.zeros(len(y_train))
+        automl_experiment.fit(X_train=X_train, y_train=y_train,
+                              **automl_settings)
+        automl_val_accuracy = 1.0 - automl_experiment.best_loss
+        print('Best ML leaner:', automl_experiment.best_estimator)
+        print('Best hyperparmeter config:', automl_experiment.best_config)
+        print('Best accuracy on validation data: {0:.4g}'.format(automl_val_accuracy))
+        print('Training duration of best run: {0:.4g} s'.format(automl_experiment.best_config_train_time))
+
+        starting_points = automl_experiment.best_config_per_estimator
+        print('starting_points', starting_points)
+        automl_settings_resume = {
+            "time_budget": 2,
+            "metric": 'accuracy',
+            "task": 'classification',
+            "log_file_name": "test/iris_resume.log",
+            "log_training_metric": True,
+            "n_jobs": 1,
+            "model_history": True,
+            "log_type": 'all',
+            "starting_points": starting_points,
+        }
+        new_automl_experiment = AutoML()
+        new_automl_experiment.fit(X_train=X_train, y_train=y_train,
+                                  **automl_settings_resume)
+
+        new_automl_val_accuracy = 1.0 - new_automl_experiment.best_loss
+        print('Best ML leaner:', new_automl_experiment.best_estimator)
+        print('Best hyperparmeter config:', new_automl_experiment.best_config)
+        print('Best accuracy on validation data: {0:.4g}'.format(new_automl_val_accuracy))
+        print('Training duration of best run: {0:.4g} s'.format(new_automl_experiment.best_config_train_time))
+

 if __name__ == "__main__":
    unittest.main()
--- a/test/tune/test_tune.py
+++ b/test/tune/test_tune.py
@@ -163,7 +163,7 @@ def _test_xgboost(method='BlendSearch'):


 def test_nested():
-    from flaml import tune
+    from flaml import tune, CFO
    search_space = {
        # test nested search space
        "cost_related": {
@@ -178,6 +178,27 @@ def test_nested():
        tune.report(obj=obj)
        tune.report(obj=obj, ab=config["cost_related"]["a"] * config["b"])

+    analysis = tune.run(
+        simple_func,
+        search_alg=CFO(
+            space=search_space, metric="obj", mode="min",
+            low_cost_partial_config={
+                "cost_related": {"a": 1}
+            },
+            points_to_evaluate=[
+                {"b": .99, "cost_related": {"a": 3}},
+                {"b": .99, "cost_related": {"a": 2}},
+                {"cost_related": {"a": 8}}
+            ],
+            metric_constraints=[("ab", "<=", 4)]),
+        local_dir='logs/',
+        num_samples=-1,
+        time_budget_s=.1)
+
+    best_trial = analysis.get_best_trial()
+    logger.info(f"CFO best config: {best_trial.config}")
+    logger.info(f"CFO best result: {best_trial.last_result}")
+
    analysis = tune.run(
        simple_func,
        config=search_space,
@@ -189,11 +210,11 @@ def test_nested():
        metric_constraints=[("ab", "<=", 4)],
        local_dir='logs/',
        num_samples=-1,
-        time_budget_s=1)
+        time_budget_s=.1)

    best_trial = analysis.get_best_trial()
-    logger.info(f"Best config: {best_trial.config}")
-    logger.info(f"Best result: {best_trial.last_result}")
+    logger.info(f"BlendSearch best config: {best_trial.config}")
+    logger.info(f"BlendSearch best result: {best_trial.last_result}")


 def test_xgboost_bs():