diff --git a/flaml/automl.py b/flaml/automl.py index 8df3543eb..1ec2cdcd2 100644 --- a/flaml/automl.py +++ b/flaml/automl.py @@ -433,10 +433,8 @@ class AutoML(BaseEstimator): ): return metric_to_minimize, metrics_to_log ``` - which returns a float number as the minimization objective, and a dictionary as the metrics to log. E.g., - ```python def custom_metric( X_val, y_val, estimator, labels, @@ -468,7 +466,6 @@ class AutoML(BaseEstimator): set it to be an empty string "". estimator_list: A list of strings for estimator names, or 'auto' e.g., ```['lgbm', 'xgboost', 'xgb_limitdepth', 'catboost', 'rf', 'extra_tree']``` - time_budget: A float number of the time budget in seconds. Use -1 if no time limit. max_iter: An integer of the maximal number of iterations. @@ -531,7 +528,6 @@ class AutoML(BaseEstimator): `automl` object and use them in the `new_automl` object. e.g., - ```python from flaml import AutoML automl = AutoML() @@ -1717,7 +1713,6 @@ class AutoML(BaseEstimator): 'mape'. Default is 'auto'. If passing a customized metric function, the function needs to have the follwing signature: - ```python def custom_metric( X_test, y_test, estimator, labels, @@ -1726,33 +1721,30 @@ class AutoML(BaseEstimator): ): return metric_to_minimize, metrics_to_log ``` - which returns a float number as the minimization objective, and a dictionary as the metrics to log. E.g., + ```python + def custom_metric( + X_val, y_val, estimator, labels, + X_train, y_train, weight_val=None, weight_train=None, + **args, + ): + from sklearn.metrics import log_loss + import time - .. code-block:: python - - def custom_metric( - X_val, y_val, estimator, labels, - X_train, y_train, weight_val=None, weight_train=None, - **args, - ): - from sklearn.metrics import log_loss - import time - - start = time.time() - y_pred = estimator.predict_proba(X_val) - pred_time = (time.time() - start) / len(X_val) - val_loss = log_loss(y_val, y_pred, labels=labels, sample_weight=weight_val) - y_pred = estimator.predict_proba(X_train) - train_loss = log_loss(y_train, y_pred, labels=labels, sample_weight=weight_train) - alpha = 0.5 - return val_loss * (1 + alpha) - alpha * train_loss, { - "val_loss": val_loss, - "train_loss": train_loss, - "pred_time": pred_time, - } - + start = time.time() + y_pred = estimator.predict_proba(X_val) + pred_time = (time.time() - start) / len(X_val) + val_loss = log_loss(y_val, y_pred, labels=labels, sample_weight=weight_val) + y_pred = estimator.predict_proba(X_train) + train_loss = log_loss(y_train, y_pred, labels=labels, sample_weight=weight_train) + alpha = 0.5 + return val_loss * (1 + alpha) - alpha * train_loss, { + "val_loss": val_loss, + "train_loss": train_loss, + "pred_time": pred_time, + } + ``` task: A string of the task type, e.g., 'classification', 'regression', 'ts_forecast', 'rank', 'seq-classification', 'seq-regression', 'summarization' diff --git a/test/automl/test_classification.py b/test/automl/test_classification.py index 29507e08f..66d8cdfd2 100644 --- a/test/automl/test_classification.py +++ b/test/automl/test_classification.py @@ -2,6 +2,7 @@ import unittest import numpy as np import scipy.sparse from sklearn.datasets import load_breast_cancer +from sklearn.model_selection import train_test_split import pandas as pd from datetime import datetime from flaml import AutoML @@ -221,14 +222,28 @@ class TestClassification(unittest.TestCase): print(automl_experiment.best_estimator) def test_ray_classification(self): - from sklearn.datasets import make_classification + X, y = load_breast_cancer(return_X_y=True) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25) - X, y = make_classification(1000, 10) automl = AutoML() try: - automl.fit(X, y, time_budget=10, task="classification", use_ray=True) automl.fit( - X, y, time_budget=10, task="classification", n_concurrent_trials=2 + X_train, + y_train, + X_val=X_test, + y_val=y_test, + time_budget=10, + task="classification", + use_ray=True, + ) + automl.fit( + X_train, + y_train, + X_val=X_test, + y_val=y_test, + time_budget=10, + task="classification", + n_concurrent_trials=2, ) except ImportError: return diff --git a/test/ray/distribute_tune.py b/test/ray/distribute_tune.py index 04120233b..ee3580d0b 100644 --- a/test/ray/distribute_tune.py +++ b/test/ray/distribute_tune.py @@ -1,30 +1,28 @@ import ray import lightgbm as lgb import numpy as np -import sklearn.datasets -import sklearn.metrics +from sklearn.datasets import load_breast_cancer +from sklearn.metrics import accuracy_score from sklearn.model_selection import train_test_split from flaml import tune from flaml.model import LGBMEstimator -data, target = sklearn.datasets.load_breast_cancer(return_X_y=True) -train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.25) +X, y = load_breast_cancer(return_X_y=True) +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25) def train_breast_cancer(config): params = LGBMEstimator(**config).params - train_set = lgb.Dataset(train_x, label=train_y) + train_set = lgb.Dataset(X_train, label=y_train) gbm = lgb.train(params, train_set) - preds = gbm.predict(test_x) + preds = gbm.predict(X_test) pred_labels = np.rint(preds) - tune.report( - mean_accuracy=sklearn.metrics.accuracy_score(test_y, pred_labels), done=True - ) + tune.report(mean_accuracy=accuracy_score(y_test, pred_labels), done=True) if __name__ == "__main__": ray.init(address="auto") - flaml_lgbm_search_space = LGBMEstimator.search_space(train_x.shape) + flaml_lgbm_search_space = LGBMEstimator.search_space(X_train.shape) config_search_space = { hp: space["domain"] for hp, space in flaml_lgbm_search_space.items() } diff --git a/test/tune.py b/test/tune_example.py similarity index 87% rename from test/tune.py rename to test/tune_example.py index 8b67b2b31..012c70945 100644 --- a/test/tune.py +++ b/test/tune_example.py @@ -36,6 +36,14 @@ low_cost_partial_config = { for hp, space in flaml_lgbm_search_space.items() if "low_cost_init_value" in space } +# initial points to evaluate +points_to_evaluate = [ + { + hp: space["init_value"] + for hp, space in flaml_lgbm_search_space.items() + if "init_value" in space + } +] # run the tuning, minimizing mse, with total time budget 3 seconds analysis = tune.run( train_lgbm, @@ -43,6 +51,7 @@ analysis = tune.run( mode="min", config=config_search_space, low_cost_partial_config=low_cost_partial_config, + points_to_evaluate=points_to_evaluate, time_budget_s=3, num_samples=-1, ) diff --git a/website/docs/Getting-Started.md b/website/docs/Getting-Started.md index f327251ca..755384abf 100644 --- a/website/docs/Getting-Started.md +++ b/website/docs/Getting-Started.md @@ -74,7 +74,7 @@ analysis = tune.run( low_cost_partial_config=low_cost_partial_config, time_budget_s=3, num_samples=-1, ) ``` -Please see this [script](https://github.com/microsoft/FLAML/blob/main/test/tune.py) for the complete version of the above example. +Please see this [script](https://github.com/microsoft/FLAML/blob/main/test/tune_example.py) for the complete version of the above example. ### Where to Go Next?