diff --git a/flaml/automl.py b/flaml/automl.py index 9955e99fa..cde608a94 100644 --- a/flaml/automl.py +++ b/flaml/automl.py @@ -462,7 +462,7 @@ class AutoML(BaseEstimator): def custom_metric( X_val, y_val, estimator, labels, X_train, y_train, weight_val=None, weight_train=None, - **args, + *args, ): from sklearn.metrics import log_loss import time @@ -586,6 +586,20 @@ class AutoML(BaseEstimator): in separate processes. This can be used to prevent OOM for large datasets, but will incur more overhead in time. Only use it if you run into OOM failures. + metric_constraints: list, default=[] | The list of metric constraints. + Each element in this list is a 3-tuple, which shall be expressed + in the following format: the first element of the 3-tuple is the name of the + metric, the second element is the inequality sign chosen from ">=" and "<=", + and the third element is the constraint value. E.g., `('precision', '>=', 0.9)`. + Note that all the metric names in metric_constraints need to be reported via + the metrics_to_log dictionary returned by a customized metric function. + The customized metric function shall be provided via the `metric` keyword + argument of the fit() function or the automl constructor. + Find examples in this [test](https://github.com/microsoft/FLAML/tree/main/test/automl/test_constraints.py). + If `pred_time_limit` is provided as one of keyword arguments to fit() function or + the automl constructor, flaml will automatically (and under the hood) + add it as an additional element in the metric_constraints. Essentially 'pred_time_limit' + specifies a constraint on the prediction latency in seconds. 
""" self._track_iter = 0 @@ -623,6 +637,7 @@ class AutoML(BaseEstimator): settings["append_log"] = settings.get("append_log", False) settings["min_sample_size"] = settings.get("min_sample_size", MIN_SAMPLE_TRAIN) settings["use_ray"] = settings.get("use_ray", False) + settings["metric_constraints"] = settings.get("metric_constraints", []) self._estimator_type = ( "classifier" if settings["task"] in CLASSIFICATION else "regressor" ) @@ -1723,10 +1738,7 @@ class AutoML(BaseEstimator): Returns: A list of the metric constraints. """ - constraints = [] - if np.isfinite(self._pred_time_limit): - constraints.append(("pred_time", "<=", self._pred_time_limit)) - return constraints + return self._metric_constraints def fit( self, @@ -1772,6 +1784,7 @@ class AutoML(BaseEstimator): auto_augment=None, min_sample_size=None, use_ray=None, + metric_constraints=None, **fit_kwargs, ): """Find a model for a given task. @@ -1813,7 +1826,7 @@ class AutoML(BaseEstimator): def custom_metric( X_val, y_val, estimator, labels, X_train, y_train, weight_val=None, weight_train=None, - **args, + *args, ): from sklearn.metrics import log_loss import time @@ -1951,6 +1964,20 @@ class AutoML(BaseEstimator): in separate processes. This can be used to prevent OOM for large datasets, but will incur more overhead in time. Only use it if you run into OOM failures. + metric_constraints: list, default=[] | The list of metric constraints. + Each element in this list is a 3-tuple, which shall be expressed + in the following format: the first element of the 3-tuple is the name of the + metric, the second element is the inequality sign chosen from ">=" and "<=", + and the third element is the constraint value. E.g., `('precision', '>=', 0.9)`. + Note that all the metric names in metric_constraints need to be reported via + the metrics_to_log dictionary returned by a customized metric function. 
+ The customized metric function shall be provided via the `metric` keyword argument + of the fit() function or the automl constructor. + Find examples in this [test](https://github.com/microsoft/FLAML/tree/main/test/automl/test_constraints.py). + If `pred_time_limit` is provided as one of keyword arguments to fit() function or + the automl constructor, flaml will automatically (and under the hood) + add it as an additional element in the metric_constraints. Essentially 'pred_time_limit' + specifies a constraint on the prediction latency in seconds. **fit_kwargs: Other key word arguments to pass to fit() function of the searched learners, such as sample_weight. Include: period: int | forecast horizon for ts_forecast tasks. @@ -1994,6 +2021,11 @@ class AutoML(BaseEstimator): mem_thres = mem_thres or self._settings.get("mem_thres") pred_time_limit = pred_time_limit or self._settings.get("pred_time_limit") train_time_limit = train_time_limit or self._settings.get("train_time_limit") + self._metric_constraints = metric_constraints or self._settings.get( + "metric_constraints" + ) + if np.isfinite(pred_time_limit): + self._metric_constraints.append(("pred_time", "<=", pred_time_limit)) verbose = self._settings.get("verbose") if verbose is None else verbose retrain_full = ( self._settings.get("retrain_full") if retrain_full is None else retrain_full diff --git a/test/automl/test_constraints.py b/test/automl/test_constraints.py new file mode 100644 index 000000000..13e395e76 --- /dev/null +++ b/test/automl/test_constraints.py @@ -0,0 +1,159 @@ +import unittest + +from sklearn.datasets import fetch_openml +from sklearn.model_selection import train_test_split +from flaml.automl import AutoML +from flaml import tune + + +dataset = "credit-g" + + +def test_metric_constraints(): + # impose metric constraints via "pred_time_limit" + automl = AutoML() + + automl_settings = { + "estimator_list": ["xgboost"], + "task": "classification", + "log_file_name": 
f"test/constraints_{dataset}.log", + "n_jobs": 1, + "log_type": "all", + "retrain_full": "budget", + "keep_search_state": True, + "time_budget": 1, + "pred_time_limit": 5.1e-05, + } + from sklearn.externals._arff import ArffException + + try: + X, y = fetch_openml(name=dataset, return_X_y=True) + except (ArffException, ValueError): + from sklearn.datasets import load_wine + + X, y = load_wine(return_X_y=True) + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.33, random_state=42 + ) + automl.fit(X_train=X_train, y_train=y_train, **automl_settings) + print(automl.estimator_list) + print(automl.search_space) + print(automl.points_to_evaluate) + config = automl.best_config.copy() + config["learner"] = automl.best_estimator + automl.trainable(config) + + from flaml.automl import size + from functools import partial + + print("metric constraints used in automl", automl.metric_constraints) + + analysis = tune.run( + automl.trainable, + automl.search_space, + metric="val_loss", + mode="min", + low_cost_partial_config=automl.low_cost_partial_config, + points_to_evaluate=automl.points_to_evaluate, + cat_hp_cost=automl.cat_hp_cost, + resource_attr=automl.resource_attr, + min_resource=automl.min_resource, + max_resource=automl.max_resource, + time_budget_s=automl._state.time_budget, + config_constraints=[(partial(size, automl._state), "<=", automl._mem_thres)], + metric_constraints=automl.metric_constraints, + num_samples=5, + ) + print(analysis.trials[-1]) + + +def custom_metric( + X_val, + y_val, + estimator, + labels, + X_train, + y_train, + weight_val, + weight_train, + *args, +): + from sklearn.metrics import log_loss + import time + + start = time.time() + y_pred = estimator.predict_proba(X_val) + pred_time = (time.time() - start) / len(X_val) + val_loss = log_loss(y_val, y_pred, labels=labels, sample_weight=weight_val) + y_pred = estimator.predict_proba(X_train) + train_loss = log_loss(y_train, y_pred, labels=labels, sample_weight=weight_train) 
+ alpha = 0.5 + return val_loss * (1 + alpha) - alpha * train_loss, { + "val_loss": val_loss, + "val_train_loss_gap": val_loss - train_loss, + "pred_time": pred_time, + } + + +def test_metric_constraints_custom(): + automl = AutoML() + + automl_settings = { + "estimator_list": ["xgboost"], + "task": "classification", + "log_file_name": f"test/constraints_custom_{dataset}.log", + "n_jobs": 1, + "metric": custom_metric, + "log_type": "all", + "retrain_full": "budget", + "keep_search_state": True, + "time_budget": 1, + "metric_constraints": [ + ("pred_time", "<=", 5.1e-05), + ("val_train_loss_gap", "<=", 0.05), + ], + } + from sklearn.externals._arff import ArffException + + try: + X, y = fetch_openml(name=dataset, return_X_y=True) + except (ArffException, ValueError): + from sklearn.datasets import load_wine + + X, y = load_wine(return_X_y=True) + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.33, random_state=42 + ) + automl.fit(X_train=X_train, y_train=y_train, **automl_settings) + print(automl.estimator_list) + print(automl.search_space) + print(automl.points_to_evaluate) + config = automl.best_config.copy() + config["learner"] = automl.best_estimator + automl.trainable(config) + + from flaml.automl import size + from functools import partial + + print("metric constraints in automl", automl.metric_constraints) + analysis = tune.run( + automl.trainable, + automl.search_space, + metric="val_loss", + mode="min", + low_cost_partial_config=automl.low_cost_partial_config, + points_to_evaluate=automl.points_to_evaluate, + cat_hp_cost=automl.cat_hp_cost, + resource_attr=automl.resource_attr, + min_resource=automl.min_resource, + max_resource=automl.max_resource, + time_budget_s=automl._state.time_budget, + config_constraints=[(partial(size, automl._state), "<=", automl._mem_thres)], + metric_constraints=automl.metric_constraints, + num_samples=5, + ) + print(analysis.trials[-1]) + + +if __name__ == "__main__": + unittest.main()