Merge branch 'microsoft:main' into main

This commit is contained in:
zsk
2022-06-13 17:28:38 -04:00
committed by GitHub
8 changed files with 59 additions and 30 deletions

View File

@@ -89,7 +89,12 @@ class SearchState:
renamed_type = list(
inspect.signature(domain_one_dim.is_valid).parameters.values()
)[0].annotation
type_match = renamed_type == Any or isinstance(value_one_dim, renamed_type)
type_match = (
renamed_type == Any
or isinstance(value_one_dim, renamed_type)
or isinstance(value_one_dim, int)
and renamed_type is float
)
if not (type_match and domain_one_dim.is_valid(value_one_dim)):
return False
elif value_one_dim != domain_one_dim:
@@ -380,6 +385,15 @@ class AutoMLState:
tune.report(**result)
return result
def sanitize(self, config: dict) -> dict:
    """Strip FLAML-internal keys from a config so it can be passed to an estimator.

    Args:
        config: A hyperparameter config dict, possibly nested under an "ml" key
            and possibly carrying the FLAML bookkeeping keys
            "FLAML_sample_size" and "learner".

    Returns:
        A new dict (the input is not mutated) containing only the
        estimator-facing hyperparameters.
    """
    # Unwrap the nested "ml" sub-config when present; copy so the caller's
    # dict is left untouched.
    cleaned = dict(config.get("ml", config))
    # Drop FLAML bookkeeping entries; pop with a default is a no-op when absent.
    for meta_key in ("FLAML_sample_size", "learner"):
        cleaned.pop(meta_key, None)
    return cleaned
def _train_with_config(
self,
estimator,
@@ -390,11 +404,7 @@ class AutoMLState:
sample_size = config_w_resource.get(
"FLAML_sample_size", len(self.y_train_all)
)
config = config_w_resource.get("ml", config_w_resource).copy()
if "FLAML_sample_size" in config:
del config["FLAML_sample_size"]
if "learner" in config:
del config["learner"]
config = self.sanitize(config_w_resource)
this_estimator_kwargs = self.fit_kwargs_by_estimator.get(
estimator
@@ -1498,6 +1508,10 @@ class AutoML(BaseEstimator):
):
"""Retrain from log file.
This function is intended to retrain the logged configurations.
NOTE: In some rare cases, the last config is early stopped to meet the time_budget and it is the best config.
But the logged config's ITER_HP (e.g., n_estimators) is not reduced.
Args:
log_file_name: A string of the log file name.
X_train: A numpy array or dataframe of training data in shape n*m.
@@ -3171,6 +3185,7 @@ class AutoML(BaseEstimator):
# Add a checkpoint for the current best config to the log.
if self._training_log:
self._training_log.checkpoint()
self._state.time_from_start = time.time() - self._start_time_flag
if self._best_estimator:
self._selected = self._search_states[self._best_estimator]
self.modelcount = sum(
@@ -3194,7 +3209,7 @@ class AutoML(BaseEstimator):
x[1].learner_class(
task=self._state.task,
n_jobs=self._state.n_jobs,
**x[1].best_config,
**self._state.sanitize(x[1].best_config),
),
)
for x in search_states[:2]
@@ -3205,13 +3220,15 @@ class AutoML(BaseEstimator):
x[1].learner_class(
task=self._state.task,
n_jobs=self._state.n_jobs,
**x[1].best_config,
**self._state.sanitize(x[1].best_config),
),
)
for x in search_states[2:]
if x[1].best_loss < 4 * self._selected.best_loss
]
logger.info(estimators)
logger.info(
[(estimator[0], estimator[1].params) for estimator in estimators]
)
if len(estimators) > 1:
if self._state.task in CLASSIFICATION:
from sklearn.ensemble import StackingClassifier as Stacker

View File

@@ -948,7 +948,7 @@ class LGBMEstimator(BaseEstimator):
"low_cost_init_value": 4,
},
"min_child_samples": {
"domain": tune.lograndint(lower=2, upper=2 ** 7 + 1),
"domain": tune.lograndint(lower=2, upper=2**7 + 1),
"init_value": 20,
},
"learning_rate": {
@@ -1047,7 +1047,6 @@ class LGBMEstimator(BaseEstimator):
self.params[self.ITER_HP] = 1
self._t1 = self._fit(X_train, y_train, **kwargs)
if budget is not None and self._t1 >= budget or n_iter == 1:
# self.params[self.ITER_HP] = n_iter
return self._t1
mem1 = psutil.virtual_memory().available if psutil is not None else 1
self._mem1 = mem0 - mem1
@@ -1168,7 +1167,7 @@ class XGBoostEstimator(SKLearnEstimator):
},
"min_child_weight": {
"domain": tune.loguniform(lower=0.001, upper=128),
"init_value": 1,
"init_value": 1.0,
},
"learning_rate": {
"domain": tune.loguniform(lower=1 / 1024, upper=1.0),
@@ -1797,17 +1796,17 @@ class ARIMA(Prophet):
def search_space(cls, **params):
space = {
"p": {
"domain": tune.quniform(lower=0, upper=10, q=1),
"domain": tune.qrandint(lower=0, upper=10, q=1),
"init_value": 2,
"low_cost_init_value": 0,
},
"d": {
"domain": tune.quniform(lower=0, upper=10, q=1),
"domain": tune.qrandint(lower=0, upper=10, q=1),
"init_value": 2,
"low_cost_init_value": 0,
},
"q": {
"domain": tune.quniform(lower=0, upper=10, q=1),
"domain": tune.qrandint(lower=0, upper=10, q=1),
"init_value": 1,
"low_cost_init_value": 0,
},
@@ -1884,32 +1883,32 @@ class SARIMAX(ARIMA):
def search_space(cls, **params):
space = {
"p": {
"domain": tune.quniform(lower=0, upper=10, q=1),
"domain": tune.qrandint(lower=0, upper=10, q=1),
"init_value": 2,
"low_cost_init_value": 0,
},
"d": {
"domain": tune.quniform(lower=0, upper=10, q=1),
"domain": tune.qrandint(lower=0, upper=10, q=1),
"init_value": 2,
"low_cost_init_value": 0,
},
"q": {
"domain": tune.quniform(lower=0, upper=10, q=1),
"domain": tune.qrandint(lower=0, upper=10, q=1),
"init_value": 1,
"low_cost_init_value": 0,
},
"P": {
"domain": tune.quniform(lower=0, upper=10, q=1),
"domain": tune.qrandint(lower=0, upper=10, q=1),
"init_value": 1,
"low_cost_init_value": 0,
},
"D": {
"domain": tune.quniform(lower=0, upper=10, q=1),
"domain": tune.qrandint(lower=0, upper=10, q=1),
"init_value": 1,
"low_cost_init_value": 0,
},
"Q": {
"domain": tune.quniform(lower=0, upper=10, q=1),
"domain": tune.qrandint(lower=0, upper=10, q=1),
"init_value": 1,
"low_cost_init_value": 0,
},

View File

@@ -1 +1 @@
__version__ = "1.0.5"
__version__ = "1.0.7"

View File

@@ -131,7 +131,8 @@
}
],
"source": [
"!pip install flaml[notebook,ts_forecast]"
"%pip install flaml[notebook,ts_forecast]\n",
"# avoid version 1.0.2 to 1.0.5 for this notebook due to a bug for arima and sarimax's init config"
]
},
{

View File

@@ -256,6 +256,7 @@ class TestClassification(unittest.TestCase):
time_budget=10,
task="classification",
n_concurrent_trials=2,
ensemble=True,
)
except ImportError:
return

View File

@@ -1,6 +1,6 @@
import sys
from openml.exceptions import OpenMLServerException
from requests.exceptions import ChunkedEncodingError
from requests.exceptions import ChunkedEncodingError, SSLError
def test_automl(budget=5, dataset_format="dataframe", hpo_method=None):
@@ -23,6 +23,7 @@ def test_automl(budget=5, dataset_format="dataframe", hpo_method=None):
OpenMLServerException,
ChunkedEncodingError,
urllib3.exceptions.ReadTimeoutError,
SSLError,
) as e:
print(e)
return
@@ -110,7 +111,7 @@ def test_mlflow():
X_train, X_test, y_train, y_test = load_openml_task(
task_id=7592, data_dir="test/"
)
except (OpenMLServerException, ChunkedEncodingError) as e:
except (OpenMLServerException, ChunkedEncodingError, SSLError) as e:
print(e)
return
""" import AutoML class from flaml package """

View File

@@ -56,6 +56,7 @@ class TestRegression(unittest.TestCase):
y_pred = automl.predict(X_train)
print(y_pred)
print(automl.model.estimator)
n_iter = automl.model.estimator.get_params("n_estimators")
print(automl.config_history)
print(automl.best_model_for_estimator("xgboost"))
print(automl.best_iteration)
@@ -86,7 +87,11 @@ class TestRegression(unittest.TestCase):
)
print(automl.model.estimator)
y_pred2 = automl.predict(X_train)
assert (y_pred == y_pred2).all()
# In some rare cases, the last config is early stopped and it is the best config. But the logged config's n_estimators is not reduced.
assert (
n_iter != automl.model.estimator.get_params("n_estimator")
or (y_pred == y_pred2).all()
)
def test_sparse_matrix_regression(self):
X_train = scipy.sparse.random(300, 900, density=0.0001)

View File

@@ -12,6 +12,7 @@ from flaml import AutoVW
import string
import os
import openml
from requests.exceptions import SSLError
VW_DS_DIR = "test/data/"
NS_LIST = list(string.ascii_lowercase) + list(string.ascii_uppercase)
@@ -96,10 +97,14 @@ def shuffle_data(X, y, seed):
def get_oml_to_vw(did, max_ns_num, ds_dir=VW_DS_DIR):
success = False
print("-----getting oml dataset-------", did)
ds = openml.datasets.get_dataset(did)
target_attribute = ds.default_target_attribute
# if target_attribute is None and did in OML_target_attribute_dict:
# target_attribute = OML_target_attribute_dict[did]
try:
ds = openml.datasets.get_dataset(did)
target_attribute = ds.default_target_attribute
# if target_attribute is None and did in OML_target_attribute_dict:
# target_attribute = OML_target_attribute_dict[did]
except (SSLError) as e:
print(e)
return
print("target=ds.default_target_attribute", target_attribute)
data = ds.get_data(target=target_attribute, dataset_format="array")