diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index e1bb38d6a..f20161d19 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -38,6 +38,10 @@ jobs: run: | python -m pip install --upgrade pip pip install -e .[test] + - name: If linux or mac, install ray + if: matrix.os == 'macOS-latest' || matrix.os == 'ubuntu-latest' + run: | + pip install -e .[ray] - name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names diff --git a/.gitignore b/.gitignore index b57bcdb36..e260173d3 100644 --- a/.gitignore +++ b/.gitignore @@ -148,3 +148,4 @@ dmypy.json cython_debug/ /catboost_info notebook/*.pkl +notebook/.azureml diff --git a/LICENSE b/LICENSE index 9e841e7a2..bfe16f3d3 100644 --- a/LICENSE +++ b/LICENSE @@ -19,3 +19,22 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE + +------------- +Code in tune/[analysis.py, sample.py, trial.py] and +searcher/[suggestion.py, variant_generator.py] is adapted from +https://github.com/ray-project/ray/blob/master/python/ray/tune/ + +# Copyright 2020 The Ray Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file diff --git a/README.md b/README.md index 97014d8f0..5d9e04fd7 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,17 @@ # FLAML - Fast and Lightweight AutoML +

+ +
+

+ FLAML is a Python library designed to automatically produce accurate machine learning models with low computational cost. It frees users from selecting learners and hyperparameters for each learner. It is fast and cheap. The simple and lightweight design makes it easy to extend, such as -adding customized learners or metrics. FLAML is powered by a new, cost-effective -hyperparameter optimization and learner selection method invented by -Microsoft Research. +adding customized learners or metrics. FLAML is powered by a new, [cost-effective +hyperparameter optimization](https://github.com/microsoft/FLAML/tree/main/flaml/tune) +and learner selection method invented by Microsoft Research. FLAML is easy to use: * With three lines of code, you can start using this economical and fast @@ -23,10 +28,10 @@ tool for XGBoost, LightGBM, Random Forest etc. or a customized learner. automl.fit(X_train, y_train, task="classification", estimator_list=["lgbm"]) ``` -* You can embed FLAML in self-tuning software for just-in-time tuning with -low latency & resource consumption. +* You can also run generic ray-tune style hyperparameter tuning for a custom function. ```python -automl.fit(X_train, y_train, task="regression", time_budget=60) +from flaml import tune +tune.run(train_with_config, config={…}, init_config={…}, time_budget_s=3600) ``` ## Installation @@ -51,9 +56,9 @@ A basic classification example. ```python from flaml import AutoML from sklearn.datasets import load_iris -# Initialize the FLAML learner. +# Initialize an AutoML instance automl = AutoML() -# Provide configurations. +# Specify automl goal and constraint automl_settings = { "time_budget": 10, # in seconds "metric": 'accuracy', @@ -61,12 +66,12 @@ automl_settings = { "log_file_name": "test/iris.log", } X_train, y_train = load_iris(return_X_y=True) -# Train with labeled input data. 
+# Train with labeled input data automl.fit(X_train=X_train, y_train=y_train, **automl_settings) # Predict print(automl.predict_proba(X_train)) -# Export the best model. +# Export the best model print(automl.model) ``` @@ -75,9 +80,9 @@ A basic regression example. ```python from flaml import AutoML from sklearn.datasets import load_boston -# Initialize the FLAML learner. +# Initialize an AutoML instance automl = AutoML() -# Provide configurations. +# Specify automl goal and constraint automl_settings = { "time_budget": 10, # in seconds "metric": 'r2', @@ -85,25 +90,39 @@ automl_settings = { "log_file_name": "test/boston.log", } X_train, y_train = load_boston(return_X_y=True) -# Train with labeled input data. +# Train with labeled input data automl.fit(X_train=X_train, y_train=y_train, **automl_settings) # Predict print(automl.predict(X_train)) -# Export the best model. +# Export the best model print(automl.model) ``` -More examples: see the [notebook](https://github.com/microsoft/FLAML/tree/main/notebook/flaml_demo.ipynb) +More examples can be found in [notebooks](https://github.com/microsoft/FLAML/tree/main/notebook/). ## Documentation The API documentation is [here](https://microsoft.github.io/FLAML/). +Read more about the +hyperparameter optimization methods +in FLAML [here](https://github.com/microsoft/FLAML/tree/main/flaml/tune). They can be used beyond the AutoML context. +And they can be used in distributed HPO frameworks such as ray tune or nni. + For more technical details, please check our papers. -* [FLAML: A Fast and Lightweight AutoML Library](https://arxiv.org/abs/1911.04706). Chi Wang, Qingyun Wu, Markus Weimer, Erkang Zhu. arXiv:1911.04706, 2020. -* [Frugal Optimization for Cost-related Hyperparameters](https://arxiv.org/abs/2005.01571). Qingyun Wu, Chi Wang, Silu Huang. To appear in AAAI 2021. +* [FLAML: A Fast and Lightweight AutoML Library](https://arxiv.org/abs/1911.04706). Chi Wang, Qingyun Wu, Markus Weimer, Erkang Zhu. 
To appear in MLSys, 2021. +``` +@inproceedings{wang2021flaml, + title={FLAML: A Fast and Lightweight AutoML Library}, + author={Chi Wang and Qingyun Wu and Markus Weimer and Erkang Zhu}, + year={2021}, + booktitle={MLSys}, +} +``` +* [Frugal Optimization for Cost-related Hyperparameters](https://arxiv.org/abs/2005.01571). Qingyun Wu, Chi Wang, Silu Huang. AAAI 2021. +* Economical Hyperparameter Optimization With Blended Search Strategy. Chi Wang, Qingyun Wu, Silu Huang, Amin Saied. To appear in ICLR 2021. ## Contributing @@ -123,9 +142,8 @@ contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additio * Chi Wang * Qingyun Wu -* Erkang Zhu -Contributors: Markus Weimer, Silu Huang, Haozhe Zhang, Alex Deng. +Contributors (alphabetical order): Alex Deng, Silu Huang, John Langford, Amin Saied, Markus Weimer, Haozhe Zhang, Erkang Zhu. ## License diff --git a/docs/images/BlendSearch.png b/docs/images/BlendSearch.png new file mode 100644 index 000000000..b789e4f72 Binary files /dev/null and b/docs/images/BlendSearch.png differ diff --git a/docs/images/CFO.png b/docs/images/CFO.png new file mode 100644 index 000000000..c377489f3 Binary files /dev/null and b/docs/images/CFO.png differ diff --git a/docs/images/FLAML.png b/docs/images/FLAML.png new file mode 100644 index 000000000..a3d593d17 Binary files /dev/null and b/docs/images/FLAML.png differ diff --git a/docs/images/heatmap_cost_cfo_12s.gif b/docs/images/heatmap_cost_cfo_12s.gif new file mode 100644 index 000000000..5093f9c80 Binary files /dev/null and b/docs/images/heatmap_cost_cfo_12s.gif differ diff --git a/docs/images/heatmap_loss_cfo_12s.gif b/docs/images/heatmap_loss_cfo_12s.gif new file mode 100644 index 000000000..9cc0968b4 Binary files /dev/null and b/docs/images/heatmap_loss_cfo_12s.gif differ diff --git a/flaml/__init__.py b/flaml/__init__.py index b9c94a146..89b9eec72 100644 --- a/flaml/__init__.py +++ b/flaml/__init__.py @@ -1,3 +1,4 @@ +from flaml.searcher import CFO, 
BlendSearch, FLOW2 from flaml.automl import AutoML from flaml.version import __version__ import logging diff --git a/flaml/automl.py b/flaml/automl.py index 82cdf6897..07c0ecd23 100644 --- a/flaml/automl.py +++ b/flaml/automl.py @@ -1,910 +1,1211 @@ -'''! - * Copyright (c) 2020 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the - * project root for license information. -''' -import time -import warnings -from functools import partial -import ast -import numpy as np -import scipy.sparse -from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, \ - RepeatedKFold -from sklearn.utils import shuffle -import pandas as pd - -from .ml import compute_estimator, train_estimator, get_classification_objective -from .config import MIN_SAMPLE_TRAIN, MEM_THRES, ETI_INI, \ - SMALL_LARGE_THRES, CV_HOLDOUT_THRESHOLD, SPLIT_RATIO, N_SPLITS -from .data import concat -from .search import ParamSearch -from .training_log import training_log_reader, training_log_writer - -import logging -logger = logging.getLogger(__name__) - - -class AutoML: - '''The AutoML class - - Example: - - .. code-block:: python - - automl = AutoML() - automl_settings = { - "time_budget": 60, - "metric": 'accuracy', - "task": 'classification', - "log_file_name": 'test/mylog.log', - } - automl.fit(X_train = X_train, y_train = y_train, - **automl_settings) - - ''' - - def __init__(self): - self._eti_ini = ETI_INI - self._custom_learners = {} - self._config_space_info = {} - self._custom_size_estimate = {} - self._track_iter = 0 - - @property - def model_history(self): - '''A dictionary of iter->model, storing the models when - the best model is updated each time. - ''' - return self._model_history - - @property - def config_history(self): - '''A dictionary of iter->(estimator, config, time), - storing the best estimator, config, and the time when the best - model is updated each time. 
- ''' - return self._config_history - - @property - def model(self): - '''An object with `predict()` and `predict_proba()` method (for - classification), storing the best trained model. - ''' - if self._trained_estimator: - return self._trained_estimator.model - else: - return None - - @property - def best_estimator(self): - '''A string indicating the best estimator found.''' - return self._best_estimator - - @property - def best_iteration(self): - '''An integer of the iteration number where the best - config is found.''' - return self._best_iteration - - @property - def best_config(self): - '''A dictionary of the best configuration.''' - return self._selected.best_config[0] - - @property - def best_loss(self): - return self._best_loss - - @property - def best_config_train_time(self): - '''A float of the seconds taken by training the - best config.''' - return self.best_train_time - - @property - def classes_(self): - '''A list of n_classes elements for class labels.''' - if self.label_transformer: - return self.label_transformer.classes_.tolist() - if self._trained_estimator: - return self._trained_estimator.model.classes_.tolist() - return None - - def predict(self, X_test): - '''Predict label from features. - - Args: - X_test: A numpy array of featurized instances, shape n * m. - - Returns: - A numpy array of shape n * 1 - - each element is a predicted class - label for an instance. - ''' - X_test = self.preprocess(X_test) - y_pred = self._trained_estimator.predict(X_test) - if y_pred.ndim > 1: - y_pred = y_pred.flatten() - if self.label_transformer: - return self.label_transformer.inverse_transform(pd.Series( - y_pred)) - else: - return y_pred - - def predict_proba(self, X_test): - '''Predict the probability of each class from features, only works for - classification problems. - - Args: - X_test: A numpy array of featurized instances, shape n * m. - - Returns: - A numpy array of shape n * c. c is the # classes. 
Each element at - (i, j) is the probability for instance i to be in class j. - ''' - X_test = self.preprocess(X_test) - proba = self._trained_estimator.predict_proba(X_test) - return proba - - def preprocess(self, X): - if scipy.sparse.issparse(X): - X = X.tocsr() - if self.transformer: - X = self.transformer.transform(X) - return X - - def _validate_data(self, X_train_all, y_train_all, dataframe, label, - X_val=None, y_val=None): - if X_train_all is not None and y_train_all is not None: - if not (isinstance(X_train_all, np.ndarray) - or scipy.sparse.issparse(X_train_all) - or isinstance(X_train_all, pd.DataFrame) - ): - raise ValueError( - "X_train_all must be a numpy array, a pandas dataframe, " - "or Scipy sparse matrix.") - if not (isinstance(y_train_all, np.ndarray) - or isinstance(y_train_all, pd.Series)): - raise ValueError( - "y_train_all must be a numpy array or a pandas series.") - if X_train_all.size == 0 or y_train_all.size == 0: - raise ValueError("Input data must not be empty.") - if isinstance(y_train_all, np.ndarray): - y_train_all = y_train_all.flatten() - if X_train_all.shape[0] != y_train_all.shape[0]: - raise ValueError( - "# rows in X_train must match length of y_train.") - self.df = isinstance(X_train_all, pd.DataFrame) - self.nrow, self.ndim = X_train_all.shape - X, y = X_train_all, y_train_all - elif dataframe is not None and label is not None: - if not isinstance(dataframe, pd.DataFrame): - raise ValueError("dataframe must be a pandas DataFrame") - if not label in dataframe.columns: - raise ValueError("label must a column name in dataframe") - self.df = True - self.dataframe, self.label = dataframe, label - X = dataframe.drop(columns=label) - self.nrow, self.ndim = X.shape - y = dataframe[label] - else: - raise ValueError( - "either X_train_all+y_train_all or dataframe+label need to be provided.") - if scipy.sparse.issparse(X_train_all): - self.transformer = self.label_transformer = False - self.X_train_all, self.y_train_all = X, y - else: 
- from .data import DataTransformer - self.transformer = DataTransformer() - self.X_train_all, self.y_train_all = self.transformer.fit_transform( - X, y, self.task) - self.label_transformer = self.transformer.label_transformer - - if X_val is not None and y_val is not None: - if not (isinstance(X_val, np.ndarray) - or scipy.sparse.issparse(X_val) - or isinstance(X_val, pd.DataFrame) - ): - raise ValueError( - "X_val must be None, a numpy array, a pandas dataframe, " - "or Scipy sparse matrix.") - if not (isinstance(y_val, np.ndarray) - or isinstance(y_val, pd.Series)): - raise ValueError( - "y_val must be None, a numpy array or a pandas series.") - if X_val.size == 0 or y_val.size == 0: - raise ValueError( - "Validation data are expected to be nonempty. " - "Use None for X_val and y_val if no validation data.") - if isinstance(y_val, np.ndarray): - y_val = y_val.flatten() - if X_val.shape[0] != y_val.shape[0]: - raise ValueError( - "# rows in X_val must match length of y_val.") - if self.transformer: - self.X_val = self.transformer.transform(X_val) - else: - self.X_val = X_val - if self.label_transformer: - self.y_val = self.label_transformer.transform(y_val) - else: - self.y_val = y_val - else: - self.X_val = self.y_val = None - - def _prepare_data(self, - eval_method, - split_ratio, - n_splits): - X_val, y_val = self.X_val, self.y_val - if scipy.sparse.issparse(X_val): - X_val = X_val.tocsr() - X_train_all, y_train_all = self.X_train_all, self.y_train_all - if scipy.sparse.issparse(X_train_all): - X_train_all = X_train_all.tocsr() - - if self.task != 'regression': - # logger.info(f"label {pd.unique(y_train_all)}") - label_set, counts = np.unique(y_train_all, return_counts=True) - # augment rare classes - rare_threshld = 20 - rare = counts < rare_threshld - rare_label, rare_counts = label_set[rare], counts[rare] - for i, label in enumerate(rare_label): - count = rare_count = rare_counts[i] - rare_index = y_train_all == label - n = len(y_train_all) - while count < 
rare_threshld: - if self.df: - X_train_all = concat(X_train_all, - X_train_all.iloc[:n].loc[rare_index]) - else: - X_train_all = concat(X_train_all, - X_train_all[:n][rare_index, :]) - if isinstance(y_train_all, pd.Series): - y_train_all = concat(y_train_all, - y_train_all.iloc[:n].loc[rare_index]) - else: - y_train_all = np.concatenate([y_train_all, - y_train_all[:n][rare_index]]) - count += rare_count - logger.debug( - f"class {label} augmented from {rare_count} to {count}") - X_train_all, y_train_all = shuffle( - X_train_all, y_train_all, random_state=202020) - if self.df: - X_train_all.reset_index(drop=True, inplace=True) - if isinstance(y_train_all, pd.Series): - y_train_all.reset_index(drop=True, inplace=True) - - X_train, y_train = X_train_all, y_train_all - if X_val is None: - if self.task != 'regression' and eval_method == 'holdout': - label_set, first = np.unique(y_train_all, return_index=True) - rest = [] - last = 0 - first.sort() - for i in range(len(first)): - rest.extend(range(last, first[i])) - last = first[i] + 1 - rest.extend(range(last, len(y_train_all))) - X_first = X_train_all.iloc[first] if self.df else X_train_all[ - first] - X_rest = X_train_all.iloc[rest] if self.df else X_train_all[rest] - y_rest = y_train_all.iloc[rest] if isinstance( - y_train_all, pd.Series) else y_train_all[rest] - stratify = y_rest if self.split_type == 'stratified' else None - X_train, X_val, y_train, y_val = train_test_split( - X_rest, - y_rest, - test_size=split_ratio, - stratify=stratify, - random_state=1) - X_train = concat(X_first, X_train) - y_train = concat(label_set, - y_train) if self.df else np.concatenate([label_set, y_train]) - X_val = concat(X_first, X_val) - y_val = concat(label_set, - y_val) if self.df else np.concatenate([label_set, y_val]) - _, y_train_counts_elements = np.unique(y_train, - return_counts=True) - _, y_val_counts_elements = np.unique(y_val, - return_counts=True) - logger.debug( - f"""{self.split_type} split for y_train \ - 
{y_train_counts_elements}, \ - y_val {y_val_counts_elements}""") - elif eval_method == 'holdout' and self.task == 'regression': - X_train, X_val, y_train, y_val = train_test_split( - X_train_all, - y_train_all, - test_size=split_ratio, - random_state=1) - self.data_size = X_train.shape[0] - self.X_train, self.y_train, self.X_val, self.y_val = ( - X_train, y_train, X_val, y_val) - if self.split_type == "stratified": - logger.info("Using StratifiedKFold") - self.kf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=1, - random_state=202020) - else: - logger.info("Using RepeatedKFold") - self.kf = RepeatedKFold(n_splits=n_splits, n_repeats=1, - random_state=202020) - - def prepare_sample_train_data(self, sample_size): - full_size = len(self.y_train) - if sample_size <= full_size: - if isinstance(self.X_train, pd.DataFrame): - sampled_X_train = self.X_train.iloc[:sample_size] - else: - sampled_X_train = self.X_train[:sample_size] - sampled_y_train = self.y_train[:sample_size] - else: - sampled_X_train = concat(self.X_train, self.X_val) - sampled_y_train = np.concatenate([self.y_train, self.y_val]) - return sampled_X_train, sampled_y_train - - def _compute_with_config_base(self, - metric, - compute_train_loss, - estimator, - config, - sample_size): - sampled_X_train, sampled_y_train = self.prepare_sample_train_data( - sample_size) - time_left = self.time_budget - self.time_from_start - budget = time_left if sample_size == self.data_size else \ - time_left / 2 * sample_size / self.data_size - return compute_estimator(sampled_X_train, - sampled_y_train, - self.X_val, - self.y_val, - budget, - self.kf, - config, - self.task, - estimator, - self.eval_method, - metric, - self._best_loss, - self.n_jobs, - self._custom_learners.get(estimator), - compute_train_loss) - - def _train_with_config(self, estimator, config, sample_size): - sampled_X_train, sampled_y_train = self.prepare_sample_train_data( - sample_size) - budget = None if self.time_budget is None else 
(self.time_budget - - self.time_from_start) - model, train_time = train_estimator( - sampled_X_train, - sampled_y_train, - config, - self.task, - estimator, - self.n_jobs, - self._custom_learners.get(estimator), - budget) - return model, train_time - - def add_learner(self, - learner_name, - learner_class): - '''Add a customized learner - - Args: - learner_name: A string of the learner's name - learner_class: A subclass of BaseEstimator - ''' - self._custom_learners[learner_name] = learner_class - cost_relative2lgbm = 1 - # cost_relative2lgbm: A float number for the training cost ratio with - # respect to lightgbm(when both use the initial config) - self._eti_ini[learner_name] = cost_relative2lgbm - self._config_space_info[learner_name] = \ - learner_class.params_configsearch_info - # size_estimate: A function from a config to its memory size in float - size_estimate = lambda config: 1.0 - self._custom_size_estimate[learner_name] = size_estimate - - def get_estimator_from_log(self, log_file_name, record_id, objective): - '''Get the estimator from log file - - Args: - log_file_name: A string of the log file name - record_id: An integer of the record ID in the file, - 0 corresponds to the first trial - objective: A string of the objective name, - 'binary', 'multi', or 'regression' - - Returns: - An estimator object for the given configuration - ''' - - with training_log_reader(log_file_name) as reader: - record = reader.get_record(record_id) - estimator = record.learner - config = record.config - - estimator, _ = train_estimator( - None, None, config, objective, estimator, - estimator_class=self._custom_learners.get(estimator) - ) - return estimator - - def retrain_from_log(self, - log_file_name, - X_train=None, - y_train=None, - dataframe=None, - label=None, - time_budget=0, - task='classification', - eval_method='auto', - split_ratio=SPLIT_RATIO, - n_splits=N_SPLITS, - split_type="stratified", - n_jobs=1, - train_best=True, - train_full=False, - record_id=-1): - 
'''Retrain from log file - - Args: - time_budget: A float number of the time budget in seconds - log_file_name: A string of the log file name - X_train: A numpy array of training data in shape n * m - y_train: A numpy array of labels in shape n * 1 - task: A string of the task type, e.g., - 'classification', 'regression' - eval_method: A string of resampling strategy, one of - ['auto', 'cv', 'holdout'] - split_ratio: A float of the validation data percentage for holdout - n_splits: An integer of the number of folds for cross - validation - n_jobs: An integer of the number of threads for training - train_best: A boolean of whether to train the best config in the - time budget; if false, train the last config in the budget - train_full: A boolean of whether to train on the full data. If true, - eval_method and sample_size in the log file will be ignored - record_id: the ID of the training log record from which the model will - be retrained. By default `record_id = -1` which means this will be - ignored. `record_id = 0` corresponds to the first trial, and - when `record_id >= 0`, `time_budget` will be ignored. 
- ''' - self.task = task - self._validate_data(X_train, y_train, dataframe, label) - - logger.info('log file name {}'.format(log_file_name)) - - best_config = None - best_val_loss = float('+inf') - best_estimator = None - sample_size = None - time_used = 0.0 - training_duration = 0 - best = None - with training_log_reader(log_file_name) as reader: - if record_id >= 0: - best = reader.get_record(record_id) - else: - for record in reader.records(): - time_used = record.total_search_time - if time_used > time_budget: - break - training_duration = time_used - val_loss = record.validation_loss - if val_loss <= best_val_loss or not train_best: - if val_loss == best_val_loss and train_best: - size = record.sample_size - if size > sample_size: - best = record - best_val_loss = val_loss - sample_size = size - else: - best = record - size = record.sample_size - best_val_loss = val_loss - sample_size = size - if not training_duration: - from .model import BaseEstimator - self._trained_estimator = BaseEstimator() - self._trained_estimator.model = None - return training_duration - if not best: - return - best_estimator = best.learner - best_config = best.config - sample_size = len(self.y_train_all) if train_full \ - else best.sample_size - - logger.info( - 'estimator = {}, config = {}, #training instances = {}'.format( - best_estimator, best_config, sample_size)) - # Partially copied from fit() function - # Initilize some attributes required for retrain_from_log - np.random.seed(0) - self.task = task - if self.task == 'classification': - self.task = get_classification_objective( - len(np.unique(self.y_train_all))) - assert split_type in ["stratified", "uniform"] - self.split_type = split_type - else: - self.split_type = "uniform" - if record_id >= 0: - eval_method = 'cv' - elif eval_method == 'auto': - eval_method = self._decide_eval_method(time_budget) - self.modelcount = 0 - self._prepare_data(eval_method, split_ratio, n_splits) - self.time_budget = None - self.n_jobs = 
n_jobs - self._trained_estimator = self._train_with_config( - best_estimator, best_config, sample_size)[0] - return training_duration - - def _decide_eval_method(self, time_budget): - if self.X_val is not None: - return 'holdout' - nrow, dim = self.nrow, self.ndim - if nrow * dim / 0.9 < SMALL_LARGE_THRES * ( - time_budget / 3600) and nrow < CV_HOLDOUT_THRESHOLD: - # time allows or sampling can be used and cv is necessary - return 'cv' - else: - return 'holdout' - - def fit(self, - X_train=None, - y_train=None, - dataframe=None, - label=None, - metric='auto', - task='classification', - n_jobs=-1, - log_file_name='default.log', - estimator_list='auto', - time_budget=60, - max_iter=1000000, - sample=True, - ensemble=False, - eval_method='auto', - log_type='better', - model_history=False, - split_ratio=SPLIT_RATIO, - n_splits=N_SPLITS, - log_training_metric=False, - mem_thres=MEM_THRES, - X_val=None, - y_val=None, - retrain_full=True, - split_type="stratified", - learner_selector='sample', - ): - '''Find a model for a given task - - Args: - X_train: A numpy array or a pandas dataframe of training data in - shape n * m - y_train: A numpy array or a pandas series of labels in shape n * 1 - dataframe: A dataframe of training data including label column - label: A str of the label column name - Note: If X_train and y_train are provided, - dataframe and label are ignored; - If not, dataframe and label must be provided. - metric: A string of the metric name or a function, - e.g., 'accuracy', 'roc_auc', 'f1', 'log_loss', 'mae', 'mse', 'r2' - if passing a customized metric function, the function needs to - have the follwing signature: - - .. 
code-block:: python - - def metric(X_test, y_test, estimator, labels, X_train, y_train): - return metric_to_minimize, metrics_to_log - - which returns a float number as the minimization objective, - and a tuple of floats as the metrics to log - task: A string of the task type, e.g., - 'classification', 'regression' - n_jobs: An integer of the number of threads for training - log_file_name: A string of the log file name - estimator_list: A list of strings for estimator names, or 'auto' - e.g., - - .. code-block:: python - - ['lgbm', 'xgboost', 'catboost', 'rf', 'extra_tree'] - - time_budget: A float number of the time budget in seconds - max_iter: An integer of the maximal number of iterations - sample: A boolean of whether to sample the training data during - search - eval_method: A string of resampling strategy, one of - ['auto', 'cv', 'holdout'] - split_ratio: A float of the valiation data percentage for holdout - n_splits: An integer of the number of folds for cross - validation - log_type: A string of the log type, one of - ['better', 'all', 'new'] - 'better' only logs configs with better loss than previos iters - 'all' logs all the tried configs - 'new' only logs non - redundant configs - model_history: A boolean of whether to keep the history of best - models in the history property. Make sure memory is large - enough if setting to True. - log_training_metric: A boolean of whether to log the training - metric for each model. 
- mem_thres: A float of the memory size constraint in bytes - X_val: None | a numpy array or a pandas dataframe of validation data - y_val: None | a numpy array or a pandas series of validation labels - ''' - self.task = task - self._validate_data(X_train, y_train, dataframe, label, X_val, y_val) - self.start_time_flag = time.time() - np.random.seed(0) - self.learner_selector = learner_selector - - if self.task == 'classification': - self.task = get_classification_objective( - len(np.unique(self.y_train_all))) - assert split_type in ["stratified", "uniform"] - self.split_type = split_type - else: - self.split_type = "uniform" - - if 'auto' == estimator_list: - estimator_list = ['lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree'] - if 'regression' != self.task: - estimator_list += ['lrl1', ] - logger.info( - "List of ML learners in AutoML Run: {}".format(estimator_list)) - - if eval_method == 'auto' or self.X_val is not None: - eval_method = self._decide_eval_method(time_budget) - self.eval_method = eval_method - logger.info("Evaluation method: {}".format(eval_method)) - - self.retrain_full = retrain_full and (eval_method == 'holdout' - and self.X_val is None) - self.sample = sample and (eval_method != 'cv') - if 'auto' == metric: - if 'binary' in self.task: - metric = 'roc_auc' - elif 'multi' in self.task: - metric = 'log_loss' - else: - metric = 'r2' - if metric in ['r2', 'accuracy', 'roc_auc', 'f1', 'ap']: - error_metric = f"1-{metric}" - elif isinstance(metric, str): - error_metric = metric - else: - error_metric = 'customized metric' - logger.info(f'Minimizing error metric: {error_metric}') - - with training_log_writer(log_file_name) as save_helper: - self.save_helper = save_helper - self._prepare_data(eval_method, split_ratio, n_splits) - self._compute_with_config = partial(AutoML._compute_with_config_base, - self, - metric, - log_training_metric) - self.time_budget = time_budget - self.estimator_list = estimator_list - self.ensemble = ensemble - self.max_iter 
= max_iter - self.mem_thres = mem_thres - self.log_type = log_type - self.split_ratio = split_ratio - self.save_model_history = model_history - self.n_jobs = n_jobs - self.search() - logger.info("fit succeeded") - - def search(self): - self.searchers = {} - # initialize the searchers - self.eti = [] - self._best_loss = float('+inf') - self.best_train_time = 0 - self.time_from_start = 0 - self.estimator_index = -1 - self._best_iteration = 0 - self._model_history = {} - self._config_history = {} - self.max_iter_per_learner = 10000 # TODO - self.iter_per_learner = dict([(e, 0) for e in self.estimator_list]) - self.fullsize = False - self._trained_estimator = None - if self.ensemble: - self.best_model = {} - for self._track_iter in range(self.max_iter): - if self.estimator_index == -1: - estimator = self.estimator_list[0] - else: - estimator = self._select_estimator(self.estimator_list) - if not estimator: - break - logger.info(f"iteration {self._track_iter}" - f" current learner {estimator}") - if estimator in self.searchers: - model = self.searchers[estimator].trained_estimator - improved = self.searchers[estimator].search1step( - global_best_loss=self._best_loss, - retrain_full=self.retrain_full, - mem_thres=self.mem_thres) - else: - model = improved = None - self.searchers[estimator] = ParamSearch( - estimator, - self.data_size, - self._compute_with_config, - self._train_with_config, - self.save_helper, - MIN_SAMPLE_TRAIN if self.sample else self.data_size, - self.task, - self.log_type, - self._config_space_info.get(estimator), - self._custom_size_estimate.get(estimator), - self.split_ratio) - self.searchers[estimator].search_begin(self.time_budget, - self.start_time_flag) - if self.estimator_index == -1: - eti_base = self._eti_ini[estimator] - self.eti.append( - self.searchers[estimator] - .expected_time_improvement_search()) - for e in self.estimator_list[1:]: - self.eti.append( - self._eti_ini[e] / eti_base * self.eti[0]) - self.estimator_index = 0 - 
self.time_from_start = time.time() - self.start_time_flag - # logger.info(f"{self.searchers[estimator].sample_size}, {data_size}") - if self.searchers[estimator].sample_size == self.data_size: - self.iter_per_learner[estimator] += 1 - if not self.fullsize: - self.fullsize = True - if self.searchers[estimator].best_loss < self._best_loss: - self._best_loss = self.searchers[estimator].best_loss - self._best_estimator = estimator - self.best_train_time = self.searchers[estimator].train_time - self._config_history[self._track_iter] = ( - estimator, - self.searchers[estimator].best_config[0], - self.time_from_start) - if self.save_model_history: - self._model_history[self._track_iter] = self.searchers[ - estimator].trained_estimator.model - elif self._trained_estimator: - del self._trained_estimator - self._trained_estimator = None - self._trained_estimator = self.searchers[ - estimator].trained_estimator - self._best_iteration = self._track_iter - if model and improved and not self.save_model_history: - model.cleanup() - - logger.info( - " at {:.1f}s,\tbest {}'s error={:.4f},\tbest {}'s error={:.4f}".format( - self.time_from_start, - estimator, - self.searchers[estimator].best_loss, - self._best_estimator, - self._best_loss)) - - if self.time_from_start >= self.time_budget: - break - if self.ensemble: - time_left = self.time_from_start - self.time_budget - time_ensemble = self.searchers[self._best_estimator].train_time - if time_left < time_ensemble < 2 * time_left: - break - if self.searchers[ - estimator].train_time > self.time_budget - self.time_from_start: - self.iter_per_learner[estimator] = self.max_iter_per_learner - - # Add a checkpoint for the current best config to the log. 
- self.save_helper.checkpoint() - - if self.searchers: - self._selected = self.searchers[self._best_estimator] - self._trained_estimator = self._selected.trained_estimator - self.modelcount = sum(self.searchers[estimator].model_count - for estimator in self.searchers) - logger.info(self._trained_estimator.model) - if self.ensemble: - searchers = list(self.searchers.items()) - searchers.sort(key=lambda x: x[1].best_loss) - estimators = [(x[0], x[1].trained_estimator) for x in searchers[ - :2]] - estimators += [(x[0], x[1].trained_estimator) for x in searchers[ - 2:] if x[1].best_loss < 4 * self._selected.best_loss] - logger.info(estimators) - if self.task != "regression": - from sklearn.ensemble import StackingClassifier as Stacker - for e in estimators: - e[1]._estimator_type = 'classifier' - else: - from sklearn.ensemble import StackingRegressor as Stacker - best_m = self._trained_estimator - stacker = Stacker(estimators, best_m, n_jobs=self.n_jobs, - passthrough=True) - stacker.fit(self.X_train_all, self.y_train_all) - self._trained_estimator = stacker - self._trained_estimator.model = stacker - else: - self._selected = self._trained_estimator = None - self.modelcount = 0 - - def __del__(self): - if hasattr(self, '_trained_estimator') and self._trained_estimator \ - and hasattr(self._trained_estimator, 'cleanup'): - self._trained_estimator.cleanup() - del self._trained_estimator - - def _select_estimator(self, estimator_list): - time_left = self.time_budget - self.time_from_start - if self.best_train_time < time_left < 2 * self.best_train_time: - best_searcher = self.searchers[self._best_estimator] - config_sig = best_searcher.get_hist_config_sig( - best_searcher.sample_size_full, - best_searcher.best_config[0]) - if config_sig not in best_searcher.config_tried: - # trainAll - return self._best_estimator - if self.learner_selector == 'roundrobin': - self.estimator_index += 1 - if self.estimator_index == len(estimator_list): - self.estimator_index = 0 - return 
estimator_list[self.estimator_index] - min_expected_time, selected = np.Inf, None - inv = [] - for i, estimator in enumerate(estimator_list): - if estimator in self.searchers: - searcher = self.searchers[estimator] - if self.iter_per_learner[estimator] >= self.max_iter_per_learner: - inv.append(0) - continue - eti_searcher = min(2 * searcher.train_time, - searcher.expected_time_improvement_search()) - gap = searcher.best_loss - self._best_loss - if gap > 0 and not self.ensemble: - delta_loss = searcher.old_loss - searcher.new_loss - delta_time = searcher.old_loss_time + \ - searcher.new_loss_time - searcher.old_train_time - speed = delta_loss / float(delta_time) - try: - expected_time = max(gap / speed, searcher.train_time) - except ZeroDivisionError: - warnings.warn("ZeroDivisionError: need to debug ", - "speed: {0}, " - "old_loss: {1}, " - "new_loss: {2}" - .format(speed, - searcher.old_loss, - searcher.new_loss)) - expected_time = 0.0 - expected_time = 2 * max(expected_time, eti_searcher) - else: - expected_time = eti_searcher - if expected_time == 0: - expected_time = 1e-10 - inv.append(1 / expected_time) - else: - expected_time = self.eti[i] - inv.append(0) - if expected_time < min_expected_time: - min_expected_time = expected_time - selected = estimator - if len(self.searchers) < len(estimator_list) or not selected: - if selected not in self.searchers: - # print('select',selected,'eti',min_expected_time) - return selected - s = sum(inv) - p = np.random.random() - q = 0 - for i in range(len(inv)): - if inv[i]: - q += inv[i] / s - if p < q: - return estimator_list[i] +'''! + * Copyright (c) 2020-2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the + * project root for license information. 
+''' +import time +import warnings +from functools import partial +import numpy as np +from scipy.sparse import issparse +from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, \ + RepeatedKFold +from sklearn.utils import shuffle +import pandas as pd +import os, contextlib + +from .ml import compute_estimator, train_estimator, get_estimator_class, \ + get_classification_objective +from .config import (MIN_SAMPLE_TRAIN, MEM_THRES, RANDOM_SEED, + SMALL_LARGE_THRES, CV_HOLDOUT_THRESHOLD, SPLIT_RATIO, N_SPLITS, + SAMPLE_MULTIPLY_FACTOR) +from .data import concat +from . import tune +from .training_log import training_log_reader, training_log_writer + +import logging +logger = logging.getLogger(__name__) +try: + import mlflow +except: + mlflow = None + + +class SearchState: + + + @property + def search_space(self): + return self._search_space_domain + + @property + def estimated_cost4improvement(self): + return max(self.time_best_found-self.time_best_found_old, + self.total_time_used-self.time_best_found) + + def __init__(self, learner_class, data_size, task): + self.init_eci = learner_class.cost_relative2lgbm() + self._search_space_domain = {} + self.init_config = {} + self.cat_hp_cost = {} + self.data_size = data_size + search_space = learner_class.search_space( + data_size=data_size, task=task) + for name, space in search_space.items(): + assert 'domain' in space + self._search_space_domain[name] = space['domain'] + if 'init_value' in space: + self.init_config[name] = space['init_value'] + if 'cat_hp_cost' in space: + self.cat_hp_cost[name] = space['cat_hp_cost'] + self._hp_names = list(self._search_space_domain.keys()) + self.search_alg = None + self.best_loss = self.best_loss_old = np.inf + self.total_time_used = 0 + self.total_iter = 0 + self.base_eci = None + self.time_best_found = 0 + self.time2eval_best = 0 + self.time2eval_best_old = 0 + self.trained_estimator = None + self.update_count = 0 + self.sample_size = None + self.trial_time = 
0 + + def update(self, analysis, time_used, save_model_history=False): + if not analysis.trials: return + self.update_count += 1 + result = analysis.trials[-1].last_result + if result: + config = result['config'] + # logger.info(config) + if config and 'FLAML_sample_size' in config: + self.sample_size = config['FLAML_sample_size'] + else: self.sample_size = self.data_size + obj = result['val_loss'] + train_loss = result['train_loss'] + time2eval = result['time2eval'] + trained_estimator = result[ + 'trained_estimator'] + else: + obj, time2eval, trained_estimator = np.inf, 0.0, None + train_loss = config = None + self.trial_time = time2eval + self.total_time_used += time_used + self.total_iter += 1 + + if self.base_eci is None: + self.base_eci = time_used + if (obj is not None) and (self.best_loss is None or objmodel, storing the models when + the best model is updated each time. + ''' + return self._model_history + + @property + def config_history(self): + '''A dictionary of iter->(estimator, config, time), + storing the best estimator, config, and the time when the best + model is updated each time. + ''' + return self._config_history + + @property + def model(self): + '''An object with `predict()` and `predict_proba()` method (for + classification), storing the best trained model. 
+ ''' + if self._trained_estimator: + return self._trained_estimator.model + else: + return None + + @property + def best_estimator(self): + '''A string indicating the best estimator found.''' + return self._best_estimator + + @property + def best_iteration(self): + '''An integer of the iteration number where the best + config is found.''' + return self._best_iteration + + @property + def best_config(self): + '''A dictionary of the best configuration.''' + return self._search_states[self._best_estimator].best_config + + @property + def best_loss(self): + '''A float of the best loss found + ''' + return self._state.best_loss + + @property + def best_config_train_time(self): + '''A float of the seconds taken by training the + best config.''' + return self._search_states[self._best_estimator].best_config_train_time + + @property + def classes_(self): + '''A list of n_classes elements for class labels.''' + if self._label_transformer: + return self._label_transformer.classes_.tolist() + if self._trained_estimator: + return self._trained_estimator.model.classes_.tolist() + return None + + def predict(self, X_test): + '''Predict label from features. + + Args: + X_test: A numpy array of featurized instances, shape n * m. + + Returns: + A numpy array of shape n * 1 - - each element is a predicted class + label for an instance. + ''' + X_test = self._preprocess(X_test) + y_pred = self._trained_estimator.predict(X_test) + if y_pred.ndim > 1: y_pred = y_pred.flatten() + if self._label_transformer: + return self._label_transformer.inverse_transform(pd.Series( + y_pred)) + else: + return y_pred + + def predict_proba(self, X_test): + '''Predict the probability of each class from features, only works for + classification problems. + + Args: + X_test: A numpy array of featurized instances, shape n * m. + + Returns: + A numpy array of shape n * c. c is the # classes. Each element at + (i, j) is the probability for instance i to be in class j. 
+ ''' + X_test = self._preprocess(X_test) + proba = self._trained_estimator.predict_proba(X_test) + return proba + + def _preprocess(self, X): + if issparse(X): + X = X.tocsr() + if self._transformer: + X = self._transformer.transform(X) + return X + + def _validate_data(self, X_train_all, y_train_all, dataframe, label, + X_val=None, y_val=None): + if X_train_all is not None and y_train_all is not None: + if not (isinstance(X_train_all, np.ndarray) or + issparse(X_train_all) or + isinstance(X_train_all, pd.DataFrame) + ): + raise ValueError( + "X_train_all must be a numpy array, a pandas dataframe, " + "or Scipy sparse matrix.") + if not (isinstance(y_train_all, np.ndarray) + or isinstance(y_train_all, pd.Series)): + raise ValueError( + "y_train_all must be a numpy array or a pandas series.") + if X_train_all.size == 0 or y_train_all.size == 0: + raise ValueError("Input data must not be empty.") + if isinstance(y_train_all, np.ndarray): + y_train_all = y_train_all.flatten() + if X_train_all.shape[0] != y_train_all.shape[0]: + raise ValueError( + "# rows in X_train must match length of y_train.") + self._df = isinstance(X_train_all, pd.DataFrame) + self._nrow, self._ndim = X_train_all.shape + X, y = X_train_all, y_train_all + elif dataframe is not None and label is not None: + if not isinstance(dataframe, pd.DataFrame): + raise ValueError("dataframe must be a pandas DataFrame") + if not label in dataframe.columns: + raise ValueError("label must a column name in dataframe") + self._df = True + X = dataframe.drop(columns=label) + self._nrow, self._ndim = X.shape + y = dataframe[label] + else: + raise ValueError( + "either X_train_all+y_train_all or dataframe+label need to be provided") + if issparse(X_train_all): + self._transformer = self._label_transformer = False + self._X_train_all, self._y_train_all = X, y + else: + from .data import DataTransformer + self._transformer = DataTransformer() + self._X_train_all, self._y_train_all = \ + 
self._transformer.fit_transform(X, y, self._state.task) + self._label_transformer = self._transformer.label_transformer + + if X_val is not None and y_val is not None: + if not (isinstance(X_val, np.ndarray) or + issparse(X_val) or + isinstance(X_val, pd.DataFrame) + ): + raise ValueError( + "X_val must be None, a numpy array, a pandas dataframe, " + "or Scipy sparse matrix.") + if not (isinstance(y_val, np.ndarray) + or isinstance(y_val, pd.Series)): + raise ValueError( + "y_val must be None, a numpy array or a pandas series.") + if X_val.size == 0 or y_val.size == 0: + raise ValueError( + "Validation data are expected to be nonempty. " + "Use None for X_val and y_val if no validation data.") + if isinstance(y_val, np.ndarray): + y_val = y_val.flatten() + if X_val.shape[0] != y_val.shape[0]: + raise ValueError( + "# rows in X_val must match length of y_val.") + if self._transformer: + self._state.X_val = self._transformer.transform(X_val) + else: + self._state.X_val = X_val + if self._label_transformer: + self._state.y_val = self._label_transformer.transform(y_val) + else: + self._state.y_val = y_val + else: + self._state.X_val = self._state.y_val = None + + def _prepare_data(self, + eval_method, + split_ratio, + n_splits): + X_val, y_val = self._state.X_val, self._state.y_val + if issparse(X_val): + X_val = X_val.tocsr() + X_train_all, y_train_all = \ + self._X_train_all, self._y_train_all + if issparse(X_train_all): + X_train_all = X_train_all.tocsr() + if self._state.task != 'regression': + # logger.info(f"label {pd.unique(y_train_all)}") + label_set, counts = np.unique(y_train_all, return_counts=True) + # augment rare classes + rare_threshld = 20 + rare = counts < rare_threshld + rare_label, rare_counts = label_set[rare], counts[rare] + for i, label in enumerate(rare_label): + count = rare_count = rare_counts[i] + rare_index = y_train_all == label + n = len(y_train_all) + while count < rare_threshld: + if self._df: + X_train_all = concat(X_train_all, + 
X_train_all.iloc[:n].loc[rare_index]) + else: + X_train_all = concat(X_train_all, + X_train_all[:n][rare_index, :]) + if isinstance(y_train_all, pd.Series): + y_train_all = concat(y_train_all, + y_train_all.iloc[:n].loc[rare_index]) + else: + y_train_all = np.concatenate([y_train_all, + y_train_all[:n][rare_index]]) + count += rare_count + logger.debug( + f"class {label} augmented from {rare_count} to {count}") + if 'sample_weight' in self._state.fit_kwargs: + X_train_all, y_train_all, self._state.fit_kwargs[ + 'sample_weight'] = shuffle( + X_train_all, y_train_all, + self._state.fit_kwargs['sample_weight'], + random_state=RANDOM_SEED) + else: + X_train_all, y_train_all = shuffle( + X_train_all, y_train_all, random_state=RANDOM_SEED) + if self._df: + X_train_all.reset_index(drop=True, inplace=True) + if isinstance(y_train_all, pd.Series): + y_train_all.reset_index(drop=True, inplace=True) + + X_train, y_train = X_train_all, y_train_all + if X_val is None: + if self._state.task != 'regression' and eval_method == 'holdout': + label_set, first = np.unique(y_train_all, return_index=True) + rest = [] + last = 0 + first.sort() + for i in range(len(first)): + rest.extend(range(last, first[i])) + last = first[i] + 1 + rest.extend(range(last, len(y_train_all))) + X_first = X_train_all.iloc[first] if self._df else X_train_all[ + first] + X_rest = X_train_all.iloc[rest] if self._df else X_train_all[rest] + y_rest = y_train_all.iloc[rest] if isinstance( + y_train_all, pd.Series) else y_train_all[rest] + stratify = y_rest if self._split_type=='stratified' else \ + None + if 'sample_weight' in self._state.fit_kwargs: + X_train, X_val, y_train, y_val, weight_train, weight_val = \ + train_test_split( + X_rest, + y_rest, + self._state.fit_kwargs['sample_weight'][rest], + test_size=split_ratio, + random_state=RANDOM_SEED) + weight1 = self._state.fit_kwargs['sample_weight'][first] + self._state.weight_val = concat(weight1, weight_val) + self._state.fit_kwargs['sample_weight'] = 
concat( + weight1, weight_train) + else: + X_train, X_val, y_train, y_val = train_test_split( + X_rest, + y_rest, + test_size=split_ratio, + stratify=stratify, + random_state=RANDOM_SEED) + X_train = concat(X_first, X_train) + y_train = concat(label_set, + y_train) if self._df else np.concatenate( + [label_set, y_train]) + X_val = concat(X_first, X_val) + y_val = concat(label_set, + y_val) if self._df else np.concatenate([label_set, y_val]) + _, y_train_counts_elements = np.unique(y_train, + return_counts=True) + _, y_val_counts_elements = np.unique(y_val, + return_counts=True) + logger.debug( + f"""{self._split_type} split for y_train \ + {y_train_counts_elements}, \ + y_val {y_val_counts_elements}""") + elif eval_method == 'holdout' and self._state.task == 'regression': + if 'sample_weight' in self._state.fit_kwargs: + X_train, X_val, y_train, y_val, self._state.fit_kwargs[ + 'sample_weight'], self._state.weight_val = \ + train_test_split( + X_train_all, + y_train_all, + self._state.fit_kwargs['sample_weight'], + test_size=split_ratio, + random_state=RANDOM_SEED) + else: + X_train, X_val, y_train, y_val = train_test_split( + X_train_all, + y_train_all, + test_size=split_ratio, + random_state=RANDOM_SEED) + self._state.data_size = X_train.shape[0] + if X_val is None: self.data_size_full = self._state.data_size + else: self.data_size_full = self._state.data_size + X_val.shape[0] + self._state.X_train, self._state.y_train, self._state.X_val, \ + self._state.y_val = (X_train, y_train, X_val, y_val) + if self._split_type == "stratified": + logger.info("Using StratifiedKFold") + self._state.kf = RepeatedStratifiedKFold(n_splits=n_splits, + n_repeats=1, random_state=RANDOM_SEED) + else: + logger.info("Using RepeatedKFold") + self._state.kf = RepeatedKFold(n_splits=n_splits, n_repeats=1, + random_state=RANDOM_SEED) + + def add_learner(self, + learner_name, + learner_class): + '''Add a customized learner + + Args: + learner_name: A string of the learner's name + 
learner_class: A subclass of flaml.model.BaseEstimator + ''' + self._state.learner_classes[learner_name] = learner_class + + def get_estimator_from_log(self, log_file_name, record_id, task): + '''Get the estimator from log file + + Args: + log_file_name: A string of the log file name + record_id: An integer of the record ID in the file, + 0 corresponds to the first trial + task: A string of the task type, + 'binary', 'multi', or 'regression' + + Returns: + An estimator object for the given configuration + ''' + + with training_log_reader(log_file_name) as reader: + record = reader.get_record(record_id) + estimator = record.learner + config = record.config + + estimator, _ = train_estimator( + None, None, config, task, estimator, + estimator_class=self._state.learner_classes.get(estimator) + ) + return estimator + + def retrain_from_log(self, + log_file_name, + X_train=None, + y_train=None, + dataframe=None, + label=None, + time_budget=0, + task='classification', + eval_method='auto', + split_ratio=SPLIT_RATIO, + n_splits=N_SPLITS, + split_type="stratified", + n_jobs=1, + train_best=True, + train_full=False, + record_id=-1, + **fit_kwargs): + '''Retrain from log file + + Args: + time_budget: A float number of the time budget in seconds + log_file_name: A string of the log file name + X_train: A numpy array of training data in shape n*m + y_train: A numpy array of labels in shape n*1 + task: A string of the task type, e.g., + 'classification', 'regression' + eval_method: A string of resampling strategy, one of + ['auto', 'cv', 'holdout'] + split_ratio: A float of the validation data percentage for holdout + n_splits: An integer of the number of folds for cross-validation + n_jobs: An integer of the number of threads for training + train_best: A boolean of whether to train the best config in the + time budget; if false, train the last config in the budget + train_full: A boolean of whether to train on the full data. 
If true, + eval_method and sample_size in the log file will be ignored + record_id: the ID of the training log record from which the model will + be retrained. By default `record_id = -1` which means this will be + ignored. `record_id = 0` corresponds to the first trial, and + when `record_id >= 0`, `time_budget` will be ignored. + **fit_kwargs: Other key word arguments to pass to fit() function of + the searched learners, such as sample_weight + ''' + self._state.task = task + self._state.fit_kwargs = fit_kwargs + self._validate_data(X_train, y_train, dataframe, label) + + logger.info('log file name {}'.format(log_file_name)) + + best_config = None + best_val_loss = float('+inf') + best_estimator = None + sample_size = None + time_used = 0.0 + training_duration = 0 + best = None + with training_log_reader(log_file_name) as reader: + if record_id >= 0: + best = reader.get_record(record_id) + else: + for record in reader.records(): + time_used = record.total_search_time + if time_used > time_budget: + break + training_duration = time_used + val_loss = record.validation_loss + if val_loss <= best_val_loss or not train_best: + if val_loss == best_val_loss and train_best: + size = record.sample_size + if size > sample_size: + best = record + best_val_loss = val_loss + sample_size = size + else: + best = record + size = record.sample_size + best_val_loss = val_loss + sample_size = size + if not training_duration: + from .model import BaseEstimator as Estimator + self._trained_estimator = Estimator() + self._trained_estimator.model = None + return training_duration + if not best: return + best_estimator = best.learner + best_config = best.config + sample_size = len(self._y_train_all) if train_full \ + else best.sample_size + + logger.info( + 'estimator = {}, config = {}, #training instances = {}'.format( + best_estimator, best_config, sample_size)) + # Partially copied from fit() function + # Initilize some attributes required for retrain_from_log + self._state.task = 
task + if self._state.task == 'classification': + self._state.task = get_classification_objective( + len(np.unique(self._y_train_all))) + assert split_type in ["stratified", "uniform"] + self._split_type = split_type + else: + self._split_type = "uniform" + if record_id >= 0: + eval_method = 'cv' + elif eval_method == 'auto': + eval_method = self._decide_eval_method(time_budget) + self.modelcount = 0 + self._prepare_data(eval_method, split_ratio, n_splits) + self._state.time_budget = None + self._state.n_jobs = n_jobs + self._trained_estimator = self._state._train_with_config( + best_estimator, best_config, sample_size)[0] + return training_duration + + def _decide_eval_method(self, time_budget): + if self._state.X_val is not None: return 'holdout' + nrow, dim = self._nrow, self._ndim + if nrow * dim / 0.9 < SMALL_LARGE_THRES * ( + time_budget / 3600) and nrow < CV_HOLDOUT_THRESHOLD: + # time allows or sampling can be used and cv is necessary + return 'cv' + else: + return 'holdout' + + def fit(self, + X_train=None, + y_train=None, + dataframe=None, + label=None, + metric='auto', + task='classification', + n_jobs=-1, + log_file_name='default.log', + estimator_list='auto', + time_budget=60, + max_iter=1000000, + sample=True, + ensemble=False, + eval_method='auto', + log_type='better', + model_history=False, + split_ratio=SPLIT_RATIO, + n_splits=N_SPLITS, + log_training_metric=False, + mem_thres=MEM_THRES, + X_val=None, + y_val=None, + sample_weight_val=None, + retrain_full=True, + split_type="stratified", + learner_selector='sample', + hpo_method=None, + **fit_kwargs): + '''Find a model for a given task + + Args: + X_train: A numpy array or a pandas dataframe of training data in + shape (n, m) + y_train: A numpy array or a pandas series of labels in shape (n,) + dataframe: A dataframe of training data including label column + label: A str of the label column name + Note: If X_train and y_train are provided, + dataframe and label are ignored; + If not, dataframe and 
label must be provided. + metric: A string of the metric name or a function, + e.g., 'accuracy', 'roc_auc', 'f1', 'log_loss', 'mae', 'mse', 'r2' + if passing a customized metric function, the function needs to + have the follwing signature: + + .. code-block:: python + + def custom_metric(X_test, y_test, estimator, labels, + X_train, y_train, weight_test=None, weight_train=None): + return metric_to_minimize, metrics_to_log + + which returns a float number as the minimization objective, + and a tuple of floats as the metrics to log + task: A string of the task type, e.g., + 'classification', 'regression' + n_jobs: An integer of the number of threads for training + log_file_name: A string of the log file name + estimator_list: A list of strings for estimator names, or 'auto' + e.g., + + .. code-block:: python + + ['lgbm', 'xgboost', 'catboost', 'rf', 'extra_tree'] + + time_budget: A float number of the time budget in seconds + max_iter: An integer of the maximal number of iterations + sample: A boolean of whether to sample the training data during + search + eval_method: A string of resampling strategy, one of + ['auto', 'cv', 'holdout'] + split_ratio: A float of the valiation data percentage for holdout + n_splits: An integer of the number of folds for cross - validation + log_type: A string of the log type, one of + ['better', 'all'] + 'better' only logs configs with better loss than previos iters + 'all' logs all the tried configs + model_history: A boolean of whether to keep the history of best + models in the history property. Make sure memory is large + enough if setting to True. + log_training_metric: A boolean of whether to log the training + metric for each model. 
+ mem_thres: A float of the memory size constraint in bytes + X_val: None | a numpy array or a pandas dataframe of validation data + y_val: None | a numpy array or a pandas series of validation labels + sample_weight_val: None | a numpy array of the sample weight of + validation data + **fit_kwargs: Other key word arguments to pass to fit() function of + the searched learners, such sample_weight + ''' + self._start_time_flag = time.time() + self._state.task = task + self._state.log_training_metric = log_training_metric + self._state.fit_kwargs = fit_kwargs + self._state.weight_val = sample_weight_val + self._validate_data(X_train, y_train, dataframe, label, X_val, y_val) + self._search_states = {} #key: estimator name; value: SearchState + self._random = np.random.RandomState(RANDOM_SEED) + self._learner_selector = learner_selector + if self._state.task == 'classification': + self._state.task = get_classification_objective( + len(np.unique(self._y_train_all))) + assert split_type in ["stratified", "uniform"] + self._split_type = split_type + else: + self._split_type = "uniform" + if eval_method == 'auto' or self._state.X_val is not None: + eval_method = self._decide_eval_method(time_budget) + self._state.eval_method = eval_method + logger.info("Evaluation method: {}".format(eval_method)) + + self._retrain_full = retrain_full and (eval_method == 'holdout' and + self._state.X_val is None) + self._prepare_data(eval_method, split_ratio, n_splits) + self._sample = sample and eval_method != 'cv' and ( + MIN_SAMPLE_TRAIN * SAMPLE_MULTIPLY_FACTOR < self._state.data_size) + if 'auto' == metric: + if 'binary' in self._state.task: + metric = 'roc_auc' + elif 'multi' in self._state.task: + metric = 'log_loss' + else: + metric = 'r2' + self._state.metric = metric + if metric in ['r2', 'accuracy', 'roc_auc', 'f1', 'ap']: + error_metric = f"1-{metric}" + elif isinstance(metric, str): + error_metric = metric + else: + error_metric = 'customized metric' + logger.info(f'Minimizing 
error metric: {error_metric}') + + if 'auto' == estimator_list: + estimator_list = ['lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree'] + if 'regression' != self._state.task: + estimator_list += ['lrl1',] + + # add learner using add_learner() api + for estimator_name in estimator_list: + if estimator_name not in self._state.learner_classes: + self.add_learner(estimator_name, + get_estimator_class(self._state.task, estimator_name)) + # set up learner search space + for estimator_name in estimator_list: + estimator_class = self._state.learner_classes[estimator_name] + self._search_states[estimator_name] = SearchState( + learner_class=estimator_class, + data_size=self._state.data_size, task=self._state.task, + ) + logger.info("List of ML learners in AutoML Run: {}".format( + estimator_list)) + self._hpo_method = hpo_method or 'cfo' + with training_log_writer(log_file_name) as save_helper: + self._training_log = save_helper + self._state.time_budget = time_budget + self.estimator_list = estimator_list + self._ensemble = ensemble + self._max_iter = max_iter + self._mem_thres = mem_thres + self._log_type = log_type + self.split_ratio = split_ratio + self._save_model_history = model_history + self._state.n_jobs = n_jobs + self._search() + logger.info("fit succeeded") + + def _search(self): + # initialize the search_states + self._eci = [] + self._state.best_loss = float('+inf') + self._state.time_from_start = 0 + self._estimator_index = None + self._best_iteration = 0 + self._model_history = {} + self._config_history = {} + self._max_iter_per_learner = 1000000 # TODO + self._iter_per_learner = dict([(e,0) for e in self.estimator_list]) + self._fullsize_reached = False + self._trained_estimator = None + self._best_estimator = None + self._retrained_config = {} + est_retrain_time = next_trial_time = 0 + best_config_sig = None + # use ConcurrencyLimiter to limit the amount of concurrency when + # using a search algorithm + better = True # whether we find a better model in one 
trial + if self._ensemble: self.best_model = {} + try: + from ray.tune.suggest import ConcurrencyLimiter + except ImportError: + from .searcher.suggestion import ConcurrencyLimiter + if self._hpo_method in ('cfo', 'grid'): + from flaml import CFO as SearchAlgo + elif 'optuna' == self._hpo_method: + try: + from ray.tune.suggest.optuna import OptunaSearch as SearchAlgo + except ImportError: + from .searcher.suggestion import OptunaSearch as SearchAlgo + elif 'bs' == self._hpo_method: + from flaml import BlendSearch as SearchAlgo + else: raise NotImplementedError + + for self._track_iter in range(self._max_iter): + if self._estimator_index is None: + estimator = self.estimator_list[0] + else: + estimator = self._select_estimator(self.estimator_list) + if not estimator: + break + logger.info(f"iteration {self._track_iter}" + f" current learner {estimator}") + search_state = self._search_states[estimator] + self._state.time_from_start = time.time()-self._start_time_flag + time_left = self._state.time_budget-self._state.time_from_start + budget_left = time_left if not self._retrain_full or better or ( + not self.best_estimator) or self._search_states[ + self.best_estimator].sample_size= self._state.time_budget or + not self.estimator_list): + break + if self._ensemble and self._best_estimator: + time_left = self._state.time_budget -self._state.time_from_start + time_ensemble = self._search_states[ + self._best_estimator].time2eval_best + if time_left < time_ensemble < 2*time_left: + break + if self._search_states[estimator].time2eval_best > \ + self._state.time_budget-self._state.time_from_start: + self._iter_per_learner[estimator] = self._max_iter_per_learner + # Add a checkpoint for the current best config to the log. 
+ self._training_log.checkpoint() + if self._best_estimator: + self._selected = self._search_states[self._best_estimator] + self._trained_estimator = self._selected.trained_estimator + self.modelcount = sum(search_state.total_iter + for search_state in self._search_states.values()) + if self._trained_estimator: + logger.info(f'selected model: {self._trained_estimator.model}') + if self._ensemble: + search_states = list(x for x in self._search_states.items() + if x[1].trained_estimator) + search_states.sort(key=lambda x:x[1].best_loss) + estimators = [(x[0],x[1].trained_estimator) for x in search_states[ + :2]] + estimators += [(x[0],x[1].trained_estimator) for x in search_states[ + 2:] if x[1].best_loss<4*self._selected.best_loss] + logger.info(estimators) + if len(estimators)<=1: return + if self._state.task != "regression": + from sklearn.ensemble import StackingClassifier as Stacker + for e in estimators: + e[1]._estimator_type = 'classifier' + else: + from sklearn.ensemble import StackingRegressor as Stacker + best_m = self._trained_estimator + stacker = Stacker(estimators, best_m, + n_jobs=self._state.n_jobs, + passthrough=True) + stacker.fit(self._X_train_all, self._y_train_all) + logger.info(f'ensemble: {stacker}') + self._trained_estimator = stacker + self._trained_estimator.model = stacker + else: + self._selected = self._trained_estimator = None + self.modelcount = 0 + + def __del__(self): + if hasattr(self, '_trained_estimator') and self._trained_estimator \ + and hasattr(self._trained_estimator, 'cleanup'): + self._trained_estimator.cleanup() + del self._trained_estimator + + def _select_estimator(self, estimator_list): + if self._learner_selector == 'roundrobin': + self._estimator_index += 1 + if self._estimator_index == len(estimator_list): + self._estimator_index = 0 + return estimator_list[self._estimator_index] + min_estimated_cost, selected = np.Inf, None + inv = [] + untried_exists = False + for i, estimator in enumerate(estimator_list): + if 
estimator in self._search_states and self._search_states[ + estimator].sample_size: # sample_size=none meaning no result + search_state = self._search_states[estimator] + if self._iter_per_learner[estimator]>=self._max_iter_per_learner: + inv.append(0) + continue + eci_search_state = search_state.estimated_cost4improvement + if search_state.sample_size < self._state.data_size: + eci_search_state = min(eci_search_state, + search_state.time2eval_best * min(SAMPLE_MULTIPLY_FACTOR, + self._state.data_size/search_state.sample_size)) + gap = search_state.best_loss - self._state.best_loss + if gap > 0 and not self._ensemble: + delta_loss = (search_state.best_loss_old - + search_state.best_loss) + delta_time = (search_state.total_time_used - + search_state.time_best_found_old) + speed = delta_loss / delta_time + try: + estimated_cost = 2*gap/speed + except ZeroDivisionError: + warnings.warn("ZeroDivisionError " + "speed: {0}, " + "old_best_loss: {1}, " + "new_best_loss: {2}" + .format(speed, + search_state.best_loss_old, + search_state.best_loss)) + estimated_cost = 0.0 + estimated_cost = max(estimated_cost, eci_search_state) + else: + estimated_cost = eci_search_state + if estimated_cost == 0: + estimated_cost = 1e-10 + inv.append(1/estimated_cost) + else: + estimated_cost = self._eci[i] + inv.append(0) + untried_exists = True + if estimated_cost < min_estimated_cost: + min_estimated_cost = estimated_cost + selected = estimator + if untried_exists or not selected: + state = self._search_states.get(selected) + if not (state and state.sample_size): + return selected + s = sum(inv) + p = self._random.rand() + q = 0 + for i in range(len(inv)): + if inv[i]: + q += inv[i] / s + if p < q: + return estimator_list[i] diff --git a/flaml/config.py b/flaml/config.py index 4785f7dd3..306a8d777 100644 --- a/flaml/config.py +++ b/flaml/config.py @@ -1,31 +1,13 @@ -'''! - * Copyright (c) 2020 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. 
-''' - -N_SPLITS = 5 -RANDOM_SEED = 1 -SPLIT_RATIO = 0.1 -HISTORY_SIZE = 10000000 -MEM_THRES = 4*(1024**3) -SMALL_LARGE_THRES = 10000000 -MIN_SAMPLE_TRAIN = 10000 -MIN_SAMPLE_VAL = 10000 -CV_HOLDOUT_THRESHOLD = 100000 - -BASE_Const = 2 -BASE_LOWER_BOUND = 2**(0.01) - -ETI_INI = { - 'lgbm':1, - 'xgboost':1.6, - 'xgboost_nb':1.6, - 'rf':2, - 'lrl1':160, - 'lrl2':25, - 'linear_svc':16, - 'kneighbor':30, - 'catboost':15, - 'extra_tree':1.9, - 'nn':50, -} \ No newline at end of file +'''! + * Copyright (c) 2020-2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. +''' + +N_SPLITS = 5 +RANDOM_SEED = 1 +SPLIT_RATIO = 0.1 +MEM_THRES = 4*(1024**3) +SMALL_LARGE_THRES = 10000000 +MIN_SAMPLE_TRAIN = 10000 +CV_HOLDOUT_THRESHOLD = 100000 +SAMPLE_MULTIPLY_FACTOR = 4 diff --git a/flaml/data.py b/flaml/data.py index 44bc0f69d..18c29e096 100644 --- a/flaml/data.py +++ b/flaml/data.py @@ -1,5 +1,5 @@ '''! - * Copyright (c) 2020 Microsoft Corporation. All rights reserved. + * Copyright (c) 2020-2021 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. 
''' @@ -122,7 +122,6 @@ def get_output_from_log(filename, time_budget): A list of the estimator, sample size and config of each logged iter logged_metric_list: A list of the logged metric of each logged iter ''' - import ast best_config = None best_learner = None @@ -169,13 +168,13 @@ def concat(X1, X2): '''concatenate two matrices vertically ''' if isinstance(X1, pd.DataFrame) or isinstance(X1, pd.Series): + df = pd.concat([X1, X2], sort=False) + df.reset_index(drop=True, inplace=True) if isinstance(X1, pd.DataFrame): cat_columns = X1.select_dtypes( include='category').columns - df = pd.concat([X1, X2], sort=False) - df.reset_index(drop=True, inplace=True) - if isinstance(X1, pd.DataFrame) and len(cat_columns): - df[cat_columns] = df[cat_columns].astype('category') + if len(cat_columns): + df[cat_columns] = df[cat_columns].astype('category') return df if issparse(X1): return vstack((X1, X2)) @@ -187,7 +186,8 @@ class DataTransformer: '''transform X, y ''' - def fit_transform(self, X, y, objective): + + def fit_transform(self, X, y, task): if isinstance(X, pd.DataFrame): X = X.copy() n = X.shape[0] @@ -224,9 +224,9 @@ class DataTransformer: SimpleImputer(missing_values=np.nan, strategy='median'), num_columns)]) X[num_columns] = self.transformer.fit_transform(X) - self.cat_columns, self.num_columns = cat_columns, num_columns - - if objective == 'regression': + self._cat_columns, self._num_columns = cat_columns, num_columns + + if task == 'regression': self.label_transformer = None else: from sklearn.preprocessing import LabelEncoder @@ -236,7 +236,7 @@ class DataTransformer: def transform(self, X): if isinstance(X, pd.DataFrame): - cat_columns, num_columns = self.cat_columns, self.num_columns + cat_columns, num_columns = self._cat_columns, self._num_columns X = X[cat_columns + num_columns].copy() for column in cat_columns: # print(column, X[column].dtype.name) diff --git a/flaml/ml.py b/flaml/ml.py index e198b0d35..90093cf2b 100644 --- a/flaml/ml.py +++ 
b/flaml/ml.py @@ -1,244 +1,273 @@ -'''! - * Copyright (c) 2020 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. -''' - -from .model import * -import time -from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score, \ - accuracy_score, mean_absolute_error, log_loss, average_precision_score, \ - f1_score -import numpy as np -from sklearn.model_selection import RepeatedStratifiedKFold - - -def get_estimator_class(objective_name, estimator_name): - ''' when adding a new learner, need to add an elif branch ''' - - - if 'xgboost' in estimator_name: - if 'regression' in objective_name: - estimator_class = XGBoostEstimator - else: - estimator_class = XGBoostSklearnEstimator - elif 'rf' in estimator_name: - estimator_class = RandomForestEstimator - elif 'lgbm' in estimator_name: - estimator_class = LGBMEstimator - elif 'lrl1' in estimator_name: - estimator_class = LRL1Classifier - elif 'lrl2' in estimator_name: - estimator_class = LRL2Classifier - elif 'catboost' in estimator_name: - estimator_class = CatBoostEstimator - elif 'extra_tree' in estimator_name: - estimator_class = ExtraTreeEstimator - elif 'kneighbor' in estimator_name: - estimator_class = KNeighborsEstimator - else: - raise ValueError(estimator_name + ' is not a built-in learner. ' - 'Please use AutoML.add_learner() to add a customized learner.') - return estimator_class - - -def sklearn_metric_loss_score(metric_name, y_predict, y_true, labels=None): - '''Loss using the specified metric - - Args: - metric_name: A string of the mtric name, one of - 'r2', 'rmse', 'mae', 'mse', 'accuracy', 'roc_auc', 'log_loss', - 'f1', 'ap' - y_predict: A 1d or 2d numpy array of the predictions which can be - used to calculate the metric. E.g., 2d for log_loss and 1d - for others. 
- y_true: A 1d numpy array of the true labels - labels: A 1d numpy array of the unique labels - - Returns: - score: A float number of the loss, the lower the better - ''' - metric_name = metric_name.lower() - if 'r2' in metric_name: - score = 1.0 - r2_score(y_true, y_predict) - elif metric_name == 'rmse': - score = np.sqrt(mean_squared_error(y_true, y_predict)) - elif metric_name == 'mae': - score = mean_absolute_error(y_true, y_predict) - elif metric_name == 'mse': - score = mean_squared_error(y_true, y_predict) - elif metric_name == 'accuracy': - score = 1.0 - accuracy_score(y_true, y_predict) - elif 'roc_auc' in metric_name: - score = 1.0 - roc_auc_score(y_true, y_predict) - elif 'log_loss' in metric_name: - score = log_loss(y_true, y_predict, labels=labels) - elif 'f1' in metric_name: - score = 1 - f1_score(y_true, y_predict) - elif 'ap' in metric_name: - score = 1 - average_precision_score(y_true, y_predict) - else: - raise ValueError(metric_name+' is not a built-in metric, ' - 'currently built-in metrics are: ' - 'r2, rmse, mae, mse, accuracy, roc_auc, log_loss, f1, ap. 
' - 'please pass a customized metric function to AutoML.fit(metric=func)') - return score - - -def get_y_pred(estimator, X, eval_metric, obj): - if eval_metric in ['roc_auc', 'ap'] and 'binary' in obj: - y_pred_classes = estimator.predict_proba(X) - y_pred = y_pred_classes[:, - 1] if y_pred_classes.ndim>1 else y_pred_classes - elif eval_metric in ['log_loss', 'roc_auc']: - y_pred = estimator.predict_proba(X) - else: - try: - y_pred = estimator.predict(X) - except: - y_pred = np.ones(X.shape[0]) - return y_pred - - -def get_test_loss(estimator, X_train, y_train, X_test, y_test, eval_metric, obj, - labels=None, budget=None, train_loss=False): - start = time.time() - train_time = estimator.fit(X_train, y_train, budget) - if isinstance(eval_metric, str): - test_pred_y = get_y_pred(estimator, X_test, eval_metric, obj) - test_loss = sklearn_metric_loss_score(eval_metric, test_pred_y, y_test, - labels) - if train_loss != False: - test_pred_y = get_y_pred(estimator, X_train, eval_metric, obj) - train_loss = sklearn_metric_loss_score(eval_metric, test_pred_y, - y_train, labels) - else: # customized metric function - test_loss, train_loss = eval_metric( - X_test, y_test, estimator, labels, X_train, y_train) - train_time = time.time()-start - return test_loss, train_time, train_loss - - -def train_model(estimator, X_train, y_train, budget): - train_time = estimator.fit(X_train, y_train, budget) - return train_time - - -def evaluate_model(estimator, X_train, y_train, X_val, y_val, budget, kf, - objective_name, eval_method, eval_metric, best_val_loss, train_loss=False): - if 'holdout' in eval_method: - val_loss, train_loss, train_time = evaluate_model_holdout( - estimator, X_train, y_train, X_val, y_val, budget, - objective_name, eval_metric, best_val_loss, train_loss=train_loss) - else: - val_loss, train_loss, train_time = evaluate_model_CV( - estimator, X_train, y_train, budget, kf, objective_name, - eval_metric, best_val_loss, train_loss=train_loss) - return val_loss, 
train_loss, train_time - - -def evaluate_model_holdout(estimator, X_train, y_train, X_val, y_val, budget, - objective_name, eval_metric, best_val_loss, train_loss=False): - val_loss, train_time, train_loss = get_test_loss( - estimator, X_train, y_train, X_val, y_val, eval_metric, objective_name, - budget = budget, train_loss=train_loss) - return val_loss, train_loss, train_time - - -def evaluate_model_CV(estimator, X_train_all, y_train_all, budget, kf, - objective_name, eval_metric, best_val_loss, train_loss=False): - start_time = time.time() - total_val_loss = total_train_loss = 0 - train_time = 0 - valid_fold_num = 0 - n = kf.get_n_splits() - X_train_split, y_train_split = X_train_all, y_train_all - if objective_name=='regression': - labels = None - else: - labels = np.unique(y_train_all) - - if isinstance(kf, RepeatedStratifiedKFold): - kf = kf.split(X_train_split, y_train_split) - else: - kf = kf.split(X_train_split) - rng = np.random.RandomState(2020) - val_loss_list = [] - budget_per_train = budget / (n+1) - for train_index, val_index in kf: - train_index = rng.permutation(train_index) - if isinstance(X_train_all, pd.DataFrame): - X_train, X_val = X_train_split.iloc[ - train_index], X_train_split.iloc[val_index] - else: - X_train, X_val = X_train_split[ - train_index], X_train_split[val_index] - if isinstance(y_train_all, pd.Series): - y_train, y_val = y_train_split.iloc[ - train_index], y_train_split.iloc[val_index] - else: - y_train, y_val = y_train_split[ - train_index], y_train_split[val_index] - estimator.cleanup() - val_loss_i, train_time_i, train_loss_i = get_test_loss( - estimator, X_train, y_train, X_val, y_val, eval_metric, - objective_name, labels, budget_per_train, train_loss=train_loss) - valid_fold_num += 1 - total_val_loss += val_loss_i - if train_loss != False: - if total_train_loss != 0: total_train_loss += train_loss_i - else: total_train_loss = train_loss_i - train_time += train_time_i - if valid_fold_num == n: - 
val_loss_list.append(total_val_loss/valid_fold_num) - total_val_loss = valid_fold_num = 0 - elif time.time() - start_time >= budget: - val_loss_list.append(total_val_loss/valid_fold_num) - break - val_loss = np.max(val_loss_list) - if train_loss != False: train_loss = total_train_loss/n - budget -= time.time() - start_time - if val_loss < best_val_loss and budget > budget_per_train: - estimator.cleanup() - train_time_full = estimator.fit(X_train_all, y_train_all, budget) - train_time += train_time_full - return val_loss, train_loss, train_time - - -def compute_estimator(X_train, y_train, X_val, y_val, budget, kf, - config_dic, objective_name, estimator_name, eval_method, eval_metric, - best_val_loss = np.Inf, n_jobs=1, estimator_class=None, train_loss=False): - start_time = time.time() - estimator_class = estimator_class or get_estimator_class( - objective_name, estimator_name) - estimator = estimator_class( - **config_dic, objective_name = objective_name, n_jobs=n_jobs) - val_loss, train_loss, train_time = evaluate_model( - estimator, X_train, y_train, X_val, y_val, budget, kf, objective_name, - eval_method, eval_metric, best_val_loss, train_loss=train_loss) - all_time = time.time() - start_time - return estimator, val_loss, train_loss, train_time, all_time - - -def train_estimator(X_train, y_train, config_dic, objective_name, - estimator_name, n_jobs=1, estimator_class=None, budget=None): - start_time = time.time() - estimator_class = estimator_class or get_estimator_class(objective_name, - estimator_name) - estimator = estimator_class(**config_dic, objective_name = objective_name, - n_jobs=n_jobs) - if X_train is not None: - train_time = train_model(estimator, X_train, y_train, budget) - else: - estimator = estimator.estimator_class(**estimator.params) - train_time = time.time() - start_time - return estimator, train_time - - -def get_classification_objective(num_labels: int) -> str: - if num_labels == 2: - objective_name = 'binary:logistic' - else: - 
objective_name = 'multi:softmax' - return objective_name - - +'''! + * Copyright (c) 2020-2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. +''' + +from .model import * +import time +from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score, \ + accuracy_score, mean_absolute_error, log_loss, average_precision_score, \ + f1_score +import numpy as np +from sklearn.model_selection import RepeatedStratifiedKFold + +import logging +logger = logging.getLogger(__name__) + + +def get_estimator_class(task, estimator_name): + ''' when adding a new learner, need to add an elif branch ''' + + + if 'xgboost' in estimator_name: + if 'regression' in task: + estimator_class = XGBoostEstimator + else: + estimator_class = XGBoostSklearnEstimator + elif 'rf' in estimator_name: + estimator_class = RandomForestEstimator + elif 'lgbm' in estimator_name: + estimator_class = LGBMEstimator + elif 'lrl1' in estimator_name: + estimator_class = LRL1Classifier + elif 'lrl2' in estimator_name: + estimator_class = LRL2Classifier + elif 'catboost' in estimator_name: + estimator_class = CatBoostEstimator + elif 'extra_tree' in estimator_name: + estimator_class = ExtraTreeEstimator + elif 'kneighbor' in estimator_name: + estimator_class = KNeighborsEstimator + else: + raise ValueError(estimator_name + ' is not a built-in learner. ' + 'Please use AutoML.add_learner() to add a customized learner.') + return estimator_class + + +def sklearn_metric_loss_score(metric_name, y_predict, y_true, labels=None, + sample_weight=None): + '''Loss using the specified metric + + Args: + metric_name: A string of the mtric name, one of + 'r2', 'rmse', 'mae', 'mse', 'accuracy', 'roc_auc', 'log_loss', + 'f1', 'ap' + y_predict: A 1d or 2d numpy array of the predictions which can be + used to calculate the metric. E.g., 2d for log_loss and 1d + for others. 
+ y_true: A 1d numpy array of the true labels + labels: A 1d numpy array of the unique labels + sample_weight: A 1d numpy array of the sample weight + + Returns: + score: A float number of the loss, the lower the better + ''' + metric_name = metric_name.lower() + if 'r2' in metric_name: + score = 1.0 - r2_score(y_true, y_predict, sample_weight=sample_weight) + elif metric_name == 'rmse': + score = np.sqrt(mean_squared_error(y_true, y_predict, + sample_weight=sample_weight)) + elif metric_name == 'mae': + score = mean_absolute_error(y_true, y_predict, + sample_weight=sample_weight) + elif metric_name == 'mse': + score = mean_squared_error(y_true, y_predict, + sample_weight=sample_weight) + elif metric_name == 'accuracy': + score = 1.0 - accuracy_score(y_true, y_predict, + sample_weight=sample_weight) + elif 'roc_auc' in metric_name: + score = 1.0 - roc_auc_score(y_true, y_predict, + sample_weight=sample_weight) + elif 'log_loss' in metric_name: + score = log_loss(y_true, y_predict, labels=labels, + sample_weight=sample_weight) + elif 'f1' in metric_name: + score = 1 - f1_score(y_true, y_predict, sample_weight=sample_weight) + elif 'ap' in metric_name: + score = 1 - average_precision_score(y_true, y_predict, + sample_weight=sample_weight) + else: + raise ValueError(metric_name+' is not a built-in metric, ' + 'currently built-in metrics are: ' + 'r2, rmse, mae, mse, accuracy, roc_auc, log_loss, f1, ap. ' + 'please pass a customized metric function to AutoML.fit(metric=func)') + return score + + +def get_y_pred(estimator, X, eval_metric, obj): + if eval_metric in ['roc_auc', 'ap'] and 'binary' in obj: + y_pred_classes = estimator.predict_proba(X) + y_pred = y_pred_classes[:, + 1] if y_pred_classes.ndim>1 else y_pred_classes + elif eval_metric in ['log_loss', 'roc_auc']: + y_pred = estimator.predict_proba(X) + else: + try: + y_pred = estimator.predict(X) + except: + logger.debug("prediction failed. 
Using a constant predictor.") + y_pred = np.ones(X.shape[0]) + return y_pred + + +def get_test_loss(estimator, X_train, y_train, X_test, y_test, weight_test, + eval_metric, obj, labels=None, budget=None, train_loss=False, fit_kwargs={}): + start = time.time() + train_time = estimator.fit(X_train, y_train, budget, **fit_kwargs) + if isinstance(eval_metric, str): + test_pred_y = get_y_pred(estimator, X_test, eval_metric, obj) + test_loss = sklearn_metric_loss_score(eval_metric, test_pred_y, y_test, + labels, weight_test) + if train_loss != False: + test_pred_y = get_y_pred(estimator, X_train, eval_metric, obj) + train_loss = sklearn_metric_loss_score(eval_metric, test_pred_y, + y_train, labels, fit_kwargs.get('sample_weight')) + else: # customized metric function + test_loss, train_loss = eval_metric( + X_test, y_test, estimator, labels, X_train, y_train, + weight_test, fit_kwargs.get('sample_weight')) + train_time = time.time()-start + return test_loss, train_time, train_loss + + +def train_model(estimator, X_train, y_train, budget, fit_kwargs={}): + train_time = estimator.fit(X_train, y_train, budget, **fit_kwargs) + return train_time + + +def evaluate_model(estimator, X_train, y_train, X_val, y_val, weight_val, + budget, kf, task, eval_method, eval_metric, best_val_loss, train_loss=False, + fit_kwargs={}): + if 'holdout' in eval_method: + val_loss, train_loss, train_time = evaluate_model_holdout( + estimator, X_train, y_train, X_val, y_val, weight_val, budget, + task, eval_metric, best_val_loss, train_loss=train_loss, + fit_kwargs=fit_kwargs) + else: + val_loss, train_loss, train_time = evaluate_model_CV( + estimator, X_train, y_train, budget, kf, task, + eval_metric, best_val_loss, train_loss=train_loss, + fit_kwargs=fit_kwargs) + return val_loss, train_loss, train_time + + +def evaluate_model_holdout(estimator, X_train, y_train, X_val, y_val, + weight_val, budget, task, eval_metric, best_val_loss, train_loss=False, + fit_kwargs={}): + val_loss, train_time, 
train_loss = get_test_loss( + estimator, X_train, y_train, X_val, y_val, weight_val, eval_metric, + task, budget = budget, train_loss=train_loss, fit_kwargs=fit_kwargs) + return val_loss, train_loss, train_time + + +def evaluate_model_CV(estimator, X_train_all, y_train_all, budget, kf, + task, eval_metric, best_val_loss, train_loss=False, fit_kwargs={}): + start_time = time.time() + total_val_loss = total_train_loss = 0 + train_time = 0 + valid_fold_num = 0 + n = kf.get_n_splits() + X_train_split, y_train_split = X_train_all, y_train_all + if task=='regression': + labels = None + else: + labels = np.unique(y_train_all) + + if isinstance(kf, RepeatedStratifiedKFold): + kf = kf.split(X_train_split, y_train_split) + else: + kf = kf.split(X_train_split) + rng = np.random.RandomState(2020) + val_loss_list = [] + budget_per_train = budget / (n+1) + if 'sample_weight' in fit_kwargs: + weight = fit_kwargs['sample_weight'] + weight_val = None + else: + weight = weight_val = None + for train_index, val_index in kf: + train_index = rng.permutation(train_index) + if isinstance(X_train_all, pd.DataFrame): + X_train, X_val = X_train_split.iloc[ + train_index], X_train_split.iloc[val_index] + else: + X_train, X_val = X_train_split[ + train_index], X_train_split[val_index] + if isinstance(y_train_all, pd.Series): + y_train, y_val = y_train_split.iloc[ + train_index], y_train_split.iloc[val_index] + else: + y_train, y_val = y_train_split[ + train_index], y_train_split[val_index] + estimator.cleanup() + if weight is not None: + fit_kwargs['sample_weight'], weight_val = weight[ + train_index], weight[val_index] + val_loss_i, train_time_i, train_loss_i = get_test_loss( + estimator, X_train, y_train, X_val, y_val, weight_val, + eval_metric, task, labels, budget_per_train, + train_loss=train_loss, fit_kwargs=fit_kwargs) + if weight is not None: + fit_kwargs['sample_weight'] = weight + valid_fold_num += 1 + total_val_loss += val_loss_i + if train_loss != False: + if total_train_loss != 
0: total_train_loss += train_loss_i + else: total_train_loss = train_loss_i + train_time += train_time_i + if valid_fold_num == n: + val_loss_list.append(total_val_loss/valid_fold_num) + total_val_loss = valid_fold_num = 0 + elif time.time() - start_time >= budget: + val_loss_list.append(total_val_loss/valid_fold_num) + break + val_loss = np.max(val_loss_list) + if train_loss != False: train_loss = total_train_loss/n + budget -= time.time() - start_time + if val_loss < best_val_loss and budget > budget_per_train: + estimator.cleanup() + estimator.fit(X_train_all, y_train_all, budget, **fit_kwargs) + return val_loss, train_loss, train_time + + +def compute_estimator(X_train, y_train, X_val, y_val, weight_val, budget, kf, + config_dic, task, estimator_name, eval_method, eval_metric, + best_val_loss = np.Inf, n_jobs=1, estimator_class=None, train_loss=False, + fit_kwargs = {}): + start_time = time.time() + estimator_class = estimator_class or get_estimator_class( + task, estimator_name) + estimator = estimator_class( + **config_dic, task = task, n_jobs=n_jobs) + val_loss, train_loss, train_time = evaluate_model( + estimator, X_train, y_train, X_val, y_val, weight_val, budget, kf, task, + eval_method, eval_metric, best_val_loss, train_loss=train_loss, + fit_kwargs=fit_kwargs) + all_time = time.time() - start_time + return estimator, val_loss, train_loss, train_time, all_time + + +def train_estimator(X_train, y_train, config_dic, task, + estimator_name, n_jobs=1, estimator_class=None, budget=None, fit_kwargs={}): + start_time = time.time() + estimator_class = estimator_class or get_estimator_class(task, + estimator_name) + estimator = estimator_class(**config_dic, task = task, + n_jobs=n_jobs) + if X_train is not None: + train_time = train_model(estimator, X_train, y_train, budget, + **fit_kwargs) + else: + estimator = estimator.estimator_class(**estimator.params) + train_time = time.time() - start_time + return estimator, train_time + + +def 
get_classification_objective(num_labels: int) -> str: + if num_labels == 2: + objective_name = 'binary:logistic' + else: + objective_name = 'multi:softmax' + return objective_name diff --git a/flaml/model.py b/flaml/model.py index 327214585..1c2c664cd 100644 --- a/flaml/model.py +++ b/flaml/model.py @@ -1,515 +1,747 @@ -'''! - * Copyright (c) 2020 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. -''' - -import numpy as np -import xgboost as xgb -from xgboost import XGBClassifier, XGBRegressor -import time -from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier -from sklearn.linear_model import LogisticRegression -from lightgbm import LGBMClassifier, LGBMRegressor -import scipy.sparse -import pandas as pd - - -class BaseEstimator: - '''The abstract class for all learners - - Typical example: - XGBoostEstimator: for regression - XGBoostSklearnEstimator: for classification - LGBMEstimator, RandomForestEstimator, LRL1Classifier, LRL2Classifier: - for both regression and classification - ''' - - def __init__(self, objective_name = 'binary:logistic', - **params): - '''Constructor - - Args: - objective_name: A string of the objective name, one of - 'binary:logistic', 'multi:softmax', 'regression' - n_jobs: An integer of the number of parallel threads - params: A dictionary of the hyperparameter names and values - ''' - self.params = params - self.estimator_class = None - self.objective_name = objective_name - if '_estimator_type' in params: - self._estimator_type = params['_estimator_type'] - else: - self._estimator_type = "regressor" if objective_name=='regression' \ - else "classifier" - - def get_params(self, deep=False): - params = self.params.copy() - params["objective_name"] = self.objective_name - if hasattr(self, '_estimator_type'): - params['_estimator_type'] = self._estimator_type - return params - - @property - def classes_(self): - return self.model.classes_ - - def preprocess(self, X): - return X - - def 
_fit(self, X_train, y_train): - - curent_time = time.time() - X_train = self.preprocess(X_train) - model = self.estimator_class(**self.params) - model.fit(X_train, y_train) - train_time = time.time() - curent_time - self.model = model - return train_time - - def fit(self, X_train, y_train, budget=None): - '''Train the model from given training data - - Args: - X_train: A numpy array of training data in shape n*m - y_train: A numpy array of labels in shape n*1 - budget: A float of the time budget in seconds - - Returns: - train_time: A float of the training time in seconds - ''' - return self._fit(X_train, y_train) - - def predict(self, X_test): - '''Predict label from features - - Args: - X_test: A numpy array of featurized instances, shape n*m - - Returns: - A numpy array of shape n*1. - Each element is the label for a instance - ''' - X_test = self.preprocess(X_test) - return self.model.predict(X_test) - - def predict_proba(self, X_test): - '''Predict the probability of each class from features - - Only works for classification problems - - Args: - model: An object of trained model with method predict_proba() - X_test: A numpy array of featurized instances, shape n*m - - Returns: - A numpy array of shape n*c. 
c is the # classes - Each element at (i,j) is the probability for instance i to be in - class j - ''' - if 'regression' in self.objective_name: - print('Regression tasks do not support predict_prob') - raise ValueError - else: - X_test = self.preprocess(X_test) - return self.model.predict_proba(X_test) - - def cleanup(self): pass - - -class SKLearnEstimator(BaseEstimator): - - - def preprocess(self, X): - if isinstance(X, pd.DataFrame): - X = X.copy() - cat_columns = X.select_dtypes(include=['category']).columns - X[cat_columns] = X[cat_columns].apply(lambda x: x.cat.codes) - return X - - -class LGBMEstimator(BaseEstimator): - - - def __init__(self, objective_name='binary:logistic', n_jobs=1, - n_estimators=2, max_leaves=2, min_child_weight=1e-3, learning_rate=0.1, - subsample=1.0, reg_lambda=1.0, reg_alpha=0.0, colsample_bylevel=1.0, - colsample_bytree=1.0, log_max_bin=8, **params): - super().__init__(objective_name, **params) - # Default: ‘regression’ for LGBMRegressor, - # ‘binary’ or ‘multiclass’ for LGBMClassifier - if 'regression' in objective_name: - final_objective_name = 'regression' - elif 'binary' in objective_name: - final_objective_name = 'binary' - elif 'multi' in objective_name: - final_objective_name = 'multiclass' - else: - final_objective_name = 'regression' - self.params = { - "n_estimators": int(round(n_estimators)), - "num_leaves": params[ - 'num_leaves'] if 'num_leaves' in params else int( - round(max_leaves)), - 'objective': params[ - "objective"] if "objective" in params else final_objective_name, - 'n_jobs': n_jobs, - 'learning_rate': float(learning_rate), - 'reg_alpha': float(reg_alpha), - 'reg_lambda': float(reg_lambda), - 'min_child_weight': float(min_child_weight), - 'colsample_bytree':float(colsample_bytree), - 'subsample': float(subsample), - } - self.params['max_bin'] = params['max_bin'] if 'max_bin' in params else ( - 1<4) and budget is not None: - self.params["n_estimators"] = 1 - self.t1 = self._fit(X_train, y_train) - if self.t1 
>= budget: - self.params["n_estimators"] = n_iter - return self.t1 - self.params["n_estimators"] = 4 - self.t2 = self._fit(X_train, y_train) - self.time_per_iter = (self.t2 - self.t1)/( - self.params["n_estimators"]-1) if self.t2 > self.t1 \ - else self.t1 if self.t1 else 0.001 - self.train_size = X_train.shape[0] - if self.t1+self.t2>=budget or n_iter==self.params["n_estimators"]: - self.params["n_estimators"] = n_iter - return time.time() - start_time - if budget is not None: - self.params["n_estimators"] = min(n_iter, int((budget-time.time()+ - start_time-self.t1)/self.time_per_iter+1)) - if self.params["n_estimators"] > 0: - self._fit(X_train, y_train) - self.params["n_estimators"] = n_iter - train_time = time.time() - start_time - return train_time - - -class XGBoostEstimator(SKLearnEstimator): - ''' not using sklearn API, used for regression ''' - - - def __init__(self, objective_name='regression', all_thread=False, n_jobs=1, - n_estimators=4, max_leaves=4, subsample=1.0, min_child_weight=1, - learning_rate=0.1, reg_lambda=1.0, reg_alpha=0.0, colsample_bylevel=1.0, - colsample_bytree=1.0, tree_method='auto', **params): - super().__init__(objective_name, **params) - self.n_estimators = int(round(n_estimators)) - self.max_leaves = int(round(max_leaves)) - self.grids = [] - self.params = { - 'max_leaves': int(round(max_leaves)), - 'max_depth': 0, - 'grow_policy': params[ - "grow_policy"] if "grow_policy" in params else 'lossguide', - 'tree_method':tree_method, - 'verbosity': 0, - 'nthread':n_jobs, - 'learning_rate': float(learning_rate), - 'subsample': float(subsample), - 'reg_alpha': float(reg_alpha), - 'reg_lambda': float(reg_lambda), - 'min_child_weight': float(min_child_weight), - 'booster': params['booster'] if 'booster' in params else 'gbtree', - 'colsample_bylevel': float(colsample_bylevel), - 'colsample_bytree':float(colsample_bytree), - } - if all_thread: - del self.params['nthread'] - - def get_params(self, deep=False): - params = super().get_params() 
- params["n_jobs"] = params['nthread'] - return params - - def fit(self, X_train, y_train, budget=None): - curent_time = time.time() - if not scipy.sparse.issparse(X_train): - self.params['tree_method'] = 'hist' - X_train = self.preprocess(X_train) - dtrain = xgb.DMatrix(X_train, label=y_train) - if self.max_leaves>0: - xgb_model = xgb.train(self.params, dtrain, self.n_estimators) - del dtrain - train_time = time.time() - curent_time - self.model = xgb_model - return train_time - else: - return None - - def predict(self, X_test): - if not scipy.sparse.issparse(X_test): - X_test = self.preprocess(X_test) - dtest = xgb.DMatrix(X_test) - return super().predict(dtest) - - -class XGBoostSklearnEstimator(SKLearnEstimator, LGBMEstimator): - ''' using sklearn API, used for classification ''' - - - def __init__(self, objective_name='binary:logistic', n_jobs=1, - n_estimators=4, max_leaves=4, subsample=1.0, - min_child_weight=1, learning_rate=0.1, reg_lambda=1.0, reg_alpha=0.0, - colsample_bylevel=1.0, colsample_bytree=1.0, tree_method='hist', - **params): - super().__init__(objective_name, **params) - self.params = { - "n_estimators": int(round(n_estimators)), - 'max_leaves': int(round(max_leaves)), - 'max_depth': 0, - 'grow_policy': params[ - "grow_policy"] if "grow_policy" in params else 'lossguide', - 'tree_method':tree_method, - 'verbosity': 0, - 'n_jobs': n_jobs, - 'learning_rate': float(learning_rate), - 'subsample': float(subsample), - 'reg_alpha': float(reg_alpha), - 'reg_lambda': float(reg_lambda), - 'min_child_weight': float(min_child_weight), - 'booster': params['booster'] if 'booster' in params else 'gbtree', - 'colsample_bylevel': float(colsample_bylevel), - 'colsample_bytree': float(colsample_bytree), - } - - if 'regression' in objective_name: - self.estimator_class = XGBRegressor - else: - self.estimator_class = XGBClassifier - self.time_per_iter = None - self.train_size = 0 - - def fit(self, X_train, y_train, budget=None): - if 
scipy.sparse.issparse(X_train): - self.params['tree_method'] = 'auto' - return super().fit(X_train, y_train, budget) - - -class RandomForestEstimator(SKLearnEstimator, LGBMEstimator): - - - def __init__(self, objective_name = 'binary:logistic', n_jobs = 1, - n_estimators = 4, max_leaves = 4, max_features = 1.0, - min_samples_split = 2, min_samples_leaf = 1, criterion = 1, **params): - super().__init__(objective_name, **params) - self.params = { - "n_estimators": int(round(n_estimators)), - "n_jobs": n_jobs, - 'max_features': float(max_features), - } - if 'regression' in objective_name: - self.estimator_class = RandomForestRegressor - else: - self.estimator_class = RandomForestClassifier - self.params['criterion'] = 'entropy' if criterion>1.5 else 'gini' - self.time_per_iter = None - self.train_size = 0 - - def get_params(self, deep=False): - params = super().get_params() - params["criterion"] = 1 if params["criterion"]=='gini' else 2 - return params - - -class ExtraTreeEstimator(RandomForestEstimator): - - - def __init__(self, objective_name = 'binary:logistic', n_jobs = 1, - n_estimators = 4, max_leaves = 4, max_features = 1.0, - min_samples_split = 2, min_samples_leaf = 1, criterion = 1, **params): - super().__init__(objective_name, **params) - self.params = { - "n_estimators": int(round(n_estimators)), - "n_jobs": n_jobs, - 'max_features': float(max_features), - } - if 'regression' in objective_name: - from sklearn.ensemble import ExtraTreesRegressor - self.estimator_class = ExtraTreesRegressor - else: - from sklearn.ensemble import ExtraTreesClassifier - self.estimator_class = ExtraTreesClassifier - self.params['criterion'] = 'entropy' if criterion>1.5 else 'gini' - self.time_per_iter = None - self.train_size = 0 - - -class LRL1Classifier(SKLearnEstimator): - - - def __init__(self, tol=0.0001, C=1.0, - objective_name='binary:logistic', n_jobs=1, **params): - super().__init__(objective_name, **params) - self.params = { - 'penalty': 'l1', - 'tol': float(tol), - 
'C': float(C), - 'solver': 'saga', - 'n_jobs': n_jobs, - } - if 'regression' in objective_name: - self.estimator_class = None - print('Does not support regression task') - raise NotImplementedError - else: - self.estimator_class = LogisticRegression - - -class LRL2Classifier(SKLearnEstimator): - - - def __init__(self, tol=0.0001, C=1.0, - objective_name='binary:logistic', n_jobs=1, **params): - super().__init__(objective_name, **params) - self.params = { - 'penalty': 'l2', - 'tol': float(tol), - 'C': float(C), - 'solver': 'lbfgs', - 'n_jobs': n_jobs, - } - if 'regression' in objective_name: - self.estimator_class = None - print('Does not support regression task') - raise NotImplementedError - else: - self.estimator_class = LogisticRegression - - -class CatBoostEstimator(BaseEstimator): - - - time_per_iter = None - train_size = 0 - - def __init__(self, objective_name = 'binary:logistic', n_jobs=1, - n_estimators=8192, exp_max_depth=64, learning_rate=0.1, rounds=4, - l2_leaf_reg=3, **params): - super().__init__(objective_name, **params) - self.params = { - "early_stopping_rounds": int(round(rounds)), - "n_estimators": n_estimators, - 'learning_rate': learning_rate, - 'thread_count': n_jobs, - 'verbose': False, - 'random_seed': params[ - "random_seed"] if "random_seed" in params else 10242048, - } - # print(n_estimators) - if 'regression' in objective_name: - from catboost import CatBoostRegressor - self.estimator_class = CatBoostRegressor - else: - from catboost import CatBoostClassifier - self.estimator_class = CatBoostClassifier - - def get_params(self, deep=False): - params = super().get_params() - params['n_jobs'] = params['thread_count'] - params['rounds'] = params['early_stopping_rounds'] - return params - - def fit(self, X_train, y_train, budget=None): - start_time = time.time() - n_iter = self.params["n_estimators"] - if isinstance(X_train, pd.DataFrame): - cat_features = list(X_train.select_dtypes( - include='category').columns) - else: - cat_features = [] - 
if (not CatBoostEstimator.time_per_iter or - abs(CatBoostEstimator.train_size-len(y_train))>4) and budget: - # measure the time per iteration - self.params["n_estimators"] = 1 - CatBoostEstimator.model = self.estimator_class(**self.params) - CatBoostEstimator.model.fit(X_train, y_train, - cat_features=cat_features) - CatBoostEstimator.t1 = time.time() - start_time - if CatBoostEstimator.t1 >= budget: - self.params["n_estimators"] = n_iter - self.model = CatBoostEstimator.model - return CatBoostEstimator.t1 - self.params["n_estimators"] = 4 - CatBoostEstimator.model = self.estimator_class(**self.params) - CatBoostEstimator.model.fit(X_train, y_train, - cat_features=cat_features) - CatBoostEstimator.time_per_iter = (time.time() - start_time - - CatBoostEstimator.t1)/(self.params["n_estimators"]-1) - if CatBoostEstimator.time_per_iter <= 0: - CatBoostEstimator.time_per_iter = CatBoostEstimator.t1 - CatBoostEstimator.train_size = len(y_train) - if time.time()-start_time>=budget or n_iter==self.params[ - "n_estimators"]: - self.params["n_estimators"] = n_iter - self.model = CatBoostEstimator.model - return time.time()-start_time - if budget: - train_times = 1 - self.params["n_estimators"] = min(n_iter, int((budget-time.time()+ - start_time-CatBoostEstimator.t1)/train_times/ - CatBoostEstimator.time_per_iter+1)) - self.model = CatBoostEstimator.model - if self.params["n_estimators"] > 0: - l = max(int(len(y_train)*0.9), len(y_train)-1000) - X_tr, y_tr = X_train[:l], y_train[:l] - from catboost import Pool - model = self.estimator_class(**self.params) - model.fit(X_tr, y_tr, cat_features=cat_features, eval_set=Pool( - data=X_train[l:], label=y_train[l:], cat_features=cat_features)) - # print(self.params["n_estimators"], model.get_best_iteration()) - self.model = model - self.params["n_estimators"] = n_iter - train_time = time.time() - start_time - # print(budget, train_time) - return train_time - - -class KNeighborsEstimator(BaseEstimator): - - - def __init__(self, 
objective_name='binary:logistic', n_jobs=1, - n_neighbors=5, **params): - super().__init__(objective_name, **params) - self.params= { - 'n_neighbors': int(round(n_neighbors)), - 'weights': 'distance', - 'n_jobs': n_jobs, - } - if 'regression' in objective_name: - from sklearn.neighbors import KNeighborsRegressor - self.estimator_class = KNeighborsRegressor - else: - from sklearn.neighbors import KNeighborsClassifier - self.estimator_class = KNeighborsClassifier - - def preprocess(self, X): - if isinstance(X, pd.DataFrame): - cat_columns = X.select_dtypes(['category']).columns - # print(X.dtypes) - # print(cat_columns) - if X.shape[1] == len(cat_columns): - raise ValueError( - "kneighbor requires at least one numeric feature") - X = X.drop(cat_columns, axis=1) - return X +'''! + * Copyright (c) 2020-2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. +''' + +import numpy as np +import xgboost as xgb +import time +from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier +from sklearn.ensemble import ExtraTreesRegressor, ExtraTreesClassifier +from sklearn.linear_model import LogisticRegression +from lightgbm import LGBMClassifier, LGBMRegressor +from scipy.sparse import issparse +import pandas as pd +from . 
import tune + +import logging +logger = logging.getLogger(__name__) + + +class BaseEstimator: + '''The abstract class for all learners + + Typical example: + XGBoostEstimator: for regression + XGBoostSklearnEstimator: for classification + LGBMEstimator, RandomForestEstimator, LRL1Classifier, LRL2Classifier: + for both regression and classification + ''' + + def __init__(self, task = 'binary:logistic', **params): + '''Constructor + + Args: + task: A string of the task type, one of + 'binary:logistic', 'multi:softmax', 'regression' + n_jobs: An integer of the number of parallel threads + params: A dictionary of the hyperparameter names and values + ''' + self.params = params + self.estimator_class = self._model = None + self._task = task + if '_estimator_type' in params: + self._estimator_type = params['_estimator_type'] + else: + self._estimator_type = "regressor" if task=='regression' \ + else "classifier" + + def get_params(self, deep=False): + params = self.params.copy() + params["task"] = self._task + if hasattr(self, '_estimator_type'): + params['_estimator_type'] = self._estimator_type + return params + + @property + def classes_(self): + return self._model.classes_ + + @property + def n_features_in_(self): + return self.model.n_features_in_ + + @property + def model(self): + '''Trained model after fit() is called, or None before fit() is called + ''' + return self._model + + def _preprocess(self, X): + return X + + def _fit(self, X_train, y_train, **kwargs): + + curent_time = time.time() + X_train = self._preprocess(X_train) + model = self.estimator_class(**self.params) + model.fit(X_train, y_train, **kwargs) + train_time = time.time() - curent_time + self._model = model + return train_time + + def fit(self, X_train, y_train, budget=None, **kwargs): + '''Train the model from given training data + + Args: + X_train: A numpy array of training data in shape n*m + y_train: A numpy array of labels in shape n*1 + budget: A float of the time budget in seconds + + 
Returns: + train_time: A float of the training time in seconds + ''' + return self._fit(X_train, y_train, **kwargs) + + def predict(self, X_test): + '''Predict label from features + + Args: + X_test: A numpy array of featurized instances, shape n*m + + Returns: + A numpy array of shape n*1. + Each element is the label for a instance + ''' + X_test = self._preprocess(X_test) + return self._model.predict(X_test) + + def predict_proba(self, X_test): + '''Predict the probability of each class from features + + Only works for classification problems + + Args: + model: An object of trained model with method predict_proba() + X_test: A numpy array of featurized instances, shape n*m + + Returns: + A numpy array of shape n*c. c is the # classes + Each element at (i,j) is the probability for instance i to be in + class j + ''' + if 'regression' in self._task: + print('Regression tasks do not support predict_prob') + raise ValueError + else: + X_test = self._preprocess(X_test) + return self._model.predict_proba(X_test) + + def cleanup(self): pass + + @classmethod + def search_space(cls, **params): + '''[required method] search space + + Returns: + A dictionary of the search space. 
+ Each key is the name of a hyperparameter, and value is a dict with + its domain and init_value (optional), cat_hp_cost (optional) + e.g., + {'domain': tune.randint(lower=1, upper=10), 'init_value': 1} + ''' + return {} + + @classmethod + def size(cls, config): + '''[optional method] memory size of the estimator in bytes + + Args: + config - the dict of the hyperparameter config + + Returns: + A float of the memory size required by the estimator to train the + given config + ''' + return 1.0 + + @classmethod + def cost_relative2lgbm(cls): + '''[optional method] relative cost compared to lightgbm''' + return 1.0 + + +class SKLearnEstimator(BaseEstimator): + + + def _preprocess(self, X): + if isinstance(X, pd.DataFrame): + X = X.copy() + cat_columns = X.select_dtypes(include=['category']).columns + X[cat_columns] = X[cat_columns].apply(lambda x: x.cat.codes) + return X + + +class LGBMEstimator(BaseEstimator): + + + @classmethod + def search_space(cls, data_size, **params): + upper = min(32768,int(data_size)) + return { + 'n_estimators': { + 'domain': tune.qloguniform(lower=4, upper=upper, q=1), + 'init_value': 4, + }, + 'max_leaves': { + 'domain': tune.qloguniform(lower=4, upper=upper, q=1), + 'init_value': 4, + }, + 'min_child_weight': { + 'domain': tune.loguniform(lower=0.001, upper=20.0), + 'init_value': 20.0, + }, + 'learning_rate': { + 'domain': tune.loguniform(lower=0.01, upper=1.0), + 'init_value': 0.1, + }, + 'subsample': { + 'domain': tune.uniform(lower=0.6, upper=1.0), + 'init_value': 1.0, + }, + 'log_max_bin': { + 'domain': tune.qloguniform(lower=3, upper=10, q=1), + 'init_value': 8, + }, + 'colsample_bytree': { + 'domain': tune.uniform(lower=0.7, upper=1.0), + 'init_value': 1.0, + }, + 'reg_alpha': { + 'domain': tune.loguniform(lower=1e-10, upper=1.0), + 'init_value': 1e-10, + }, + 'reg_lambda': { + 'domain': tune.loguniform(lower=1e-10, upper=1.0), + 'init_value': 1.0, + }, + } + + @classmethod + def size(cls, config): + max_leaves = 
int(round(config['max_leaves'])) + n_estimators = int(round(config['n_estimators'])) + return (max_leaves*3 + (max_leaves-1)*4 + 1.0)*n_estimators*8 + + def __init__(self, task='binary:logistic', n_jobs=1, + n_estimators=2, max_leaves=2, min_child_weight=1e-3, learning_rate=0.1, + subsample=1.0, reg_lambda=1.0, reg_alpha=0.0, colsample_bylevel=1.0, + colsample_bytree=1.0, log_max_bin=8, **params): + super().__init__(task, **params) + # Default: ‘regression’ for LGBMRegressor, + # ‘binary’ or ‘multiclass’ for LGBMClassifier + if 'regression' in task: + objective = 'regression' + elif 'binary' in task: + objective = 'binary' + elif 'multi' in task: + objective = 'multiclass' + else: objective = 'regression' + self.params = { + "n_estimators": int(round(n_estimators)), + "num_leaves": params[ + 'num_leaves'] if 'num_leaves' in params else int( + round(max_leaves)), + 'objective': params[ + "objective"] if "objective" in params else objective, + 'n_jobs': n_jobs, + 'learning_rate': float(learning_rate), + 'reg_alpha': float(reg_alpha), + 'reg_lambda': float(reg_lambda), + 'min_child_weight': float(min_child_weight), + 'colsample_bytree':float(colsample_bytree), + 'subsample': float(subsample), + } + self.params['max_bin'] = params['max_bin'] if 'max_bin' in params else ( + 1<4) and budget is not None: + self.params["n_estimators"] = 1 + self._t1 = self._fit(X_train, y_train, **kwargs) + if self._t1 >= budget: + self.params["n_estimators"] = n_iter + return self._t1 + self.params["n_estimators"] = 4 + self._t2 = self._fit(X_train, y_train, **kwargs) + self._time_per_iter = (self._t2 - self._t1)/( + self.params["n_estimators"]-1) if self._t2 > self._t1 \ + else self._t1 if self._t1 else 0.001 + self._train_size = X_train.shape[0] + if self._t1+self._t2>=budget or n_iter==self.params["n_estimators"]: + self.params["n_estimators"] = n_iter + return time.time() - start_time + if budget is not None: + self.params["n_estimators"] = min(n_iter, int((budget-time.time()+ + 
start_time-self._t1)/self._time_per_iter+1)) + if self.params["n_estimators"] > 0: + self._fit(X_train, y_train, **kwargs) + self.params["n_estimators"] = n_iter + train_time = time.time() - start_time + return train_time + + +class XGBoostEstimator(SKLearnEstimator): + ''' not using sklearn API, used for regression ''' + + + @classmethod + def search_space(cls, data_size, **params): + upper = min(32768,int(data_size)) + return { + 'n_estimators': { + 'domain': tune.qloguniform(lower=4, upper=upper, q=1), + 'init_value': 4, + }, + 'max_leaves': { + 'domain': tune.qloguniform(lower=4, upper=upper, q=1), + 'init_value': 4, + }, + 'min_child_weight': { + 'domain': tune.loguniform(lower=0.001, upper=20.0), + 'init_value': 20.0, + }, + 'learning_rate': { + 'domain': tune.loguniform(lower=0.01, upper=1.0), + 'init_value': 0.1, + }, + 'subsample': { + 'domain': tune.uniform(lower=0.6, upper=1.0), + 'init_value': 1.0, + }, + 'colsample_bylevel': { + 'domain': tune.uniform(lower=0.6, upper=1.0), + 'init_value': 1.0, + }, + 'colsample_bytree': { + 'domain': tune.uniform(lower=0.7, upper=1.0), + 'init_value': 1.0, + }, + 'reg_alpha': { + 'domain': tune.loguniform(lower=1e-10, upper=1.0), + 'init_value': 1e-10, + }, + 'reg_lambda': { + 'domain': tune.loguniform(lower=1e-10, upper=1.0), + 'init_value': 1.0, + }, + } + + @classmethod + def size(cls, config): + return LGBMEstimator.size(config) + + @classmethod + def cost_relative2lgbm(cls): + return 1.6 + + def __init__(self, task='regression', all_thread=False, n_jobs=1, + n_estimators=4, max_leaves=4, subsample=1.0, min_child_weight=1, + learning_rate=0.1, reg_lambda=1.0, reg_alpha=0.0, colsample_bylevel=1.0, + colsample_bytree=1.0, tree_method='auto', **params): + super().__init__(task, **params) + self._n_estimators = int(round(n_estimators)) + self._max_leaves = int(round(max_leaves)) + self.params = { + 'max_leaves': int(round(max_leaves)), + 'max_depth': 0, + 'grow_policy': params[ + "grow_policy"] if "grow_policy" in 
params else 'lossguide', + 'tree_method':tree_method, + 'verbosity': 0, + 'nthread':n_jobs, + 'learning_rate': float(learning_rate), + 'subsample': float(subsample), + 'reg_alpha': float(reg_alpha), + 'reg_lambda': float(reg_lambda), + 'min_child_weight': float(min_child_weight), + 'booster': params['booster'] if 'booster' in params else 'gbtree', + 'colsample_bylevel': float(colsample_bylevel), + 'colsample_bytree':float(colsample_bytree), + } + if all_thread: + del self.params['nthread'] + + def get_params(self, deep=False): + params = super().get_params() + params["n_jobs"] = params['nthread'] + return params + + def fit(self, X_train, y_train, budget=None, **kwargs): + start_time = time.time() + if not issparse(X_train): + self.params['tree_method'] = 'hist' + X_train = self._preprocess(X_train) + dtrain = xgb.DMatrix(X_train, label=y_train) + if self._max_leaves>0: + if 'sample_weight' in kwargs: + self._model = xgb.train(self.params, dtrain, + self._n_estimators, weight=kwargs['sample_weight']) + else: + self._model = xgb.train(self.params, dtrain, self._n_estimators) + del dtrain + train_time = time.time() - start_time + return train_time + else: + return None + + def predict(self, X_test): + if not issparse(X_test): + X_test = self._preprocess(X_test) + dtest = xgb.DMatrix(X_test) + return super().predict(dtest) + + +class XGBoostSklearnEstimator(SKLearnEstimator, LGBMEstimator): + ''' using sklearn API, used for classification ''' + + + @classmethod + def search_space(cls, data_size, **params): + return XGBoostEstimator.search_space(data_size) + + @classmethod + def cost_relative2lgbm(cls): + return XGBoostEstimator.cost_relative2lgbm() + + def __init__(self, task='binary:logistic', n_jobs=1, + n_estimators=4, max_leaves=4, subsample=1.0, + min_child_weight=1, learning_rate=0.1, reg_lambda=1.0, reg_alpha=0.0, + colsample_bylevel=1.0, colsample_bytree=1.0, tree_method='hist', + **params): + super().__init__(task, **params) + self.params = { + 
"n_estimators": int(round(n_estimators)), + 'max_leaves': int(round(max_leaves)), + 'max_depth': 0, + 'grow_policy': params[ + "grow_policy"] if "grow_policy" in params else 'lossguide', + 'tree_method':tree_method, + 'verbosity': 0, + 'n_jobs': n_jobs, + 'learning_rate': float(learning_rate), + 'subsample': float(subsample), + 'reg_alpha': float(reg_alpha), + 'reg_lambda': float(reg_lambda), + 'min_child_weight': float(min_child_weight), + 'booster': params['booster'] if 'booster' in params else 'gbtree', + 'colsample_bylevel': float(colsample_bylevel), + 'colsample_bytree': float(colsample_bytree), + } + + if 'regression' in task: + self.estimator_class = xgb.XGBRegressor + else: + self.estimator_class = xgb.XGBClassifier + self._time_per_iter = None + self._train_size = 0 + + def fit(self, X_train, y_train, budget=None, **kwargs): + if issparse(X_train): + self.params['tree_method'] = 'auto' + return super().fit(X_train, y_train, budget, **kwargs) + + +class RandomForestEstimator(SKLearnEstimator, LGBMEstimator): + + + @classmethod + def search_space(cls, data_size, task, **params): + upper = min(2048, int(data_size)) + space = { + 'n_estimators': { + 'domain': tune.qloguniform(lower=4, upper=upper, q=1), + 'init_value': 4, + }, + 'max_features': { + 'domain': tune.loguniform(lower=0.1, upper=1.0), + 'init_value': 1.0, + }, + } + if task != 'regression': + space['criterion'] = { + 'domain': tune.choice(['gini', 'entropy']), + # 'init_value': 'gini', + } + return space + + @classmethod + def size(cls, config): + return 1.0 + + @classmethod + def cost_relative2lgbm(cls): + return 2.0 + + def __init__(self, task = 'binary:logistic', n_jobs = 1, + n_estimators = 4, max_features = 1.0, criterion = 'gini', **params): + super().__init__(task, **params) + self.params = { + "n_estimators": int(round(n_estimators)), + "n_jobs": n_jobs, + 'max_features': float(max_features), + } + if 'regression' in task: + self.estimator_class = RandomForestRegressor + else: + 
self.estimator_class = RandomForestClassifier + self.params['criterion'] = criterion + self._time_per_iter = None + self._train_size = 0 + + def get_params(self, deep=False): + params = super().get_params() + params["criterion"] = 1 if params["criterion"]=='gini' else 2 + return params + + +class ExtraTreeEstimator(RandomForestEstimator): + + + @classmethod + def cost_relative2lgbm(cls): + return 1.9 + + def __init__(self, task = 'binary:logistic', **params): + super().__init__(task, **params) + if 'regression' in task: + self.estimator_class = ExtraTreesRegressor + else: + self.estimator_class = ExtraTreesClassifier + + +class LRL1Classifier(SKLearnEstimator): + + + @classmethod + def search_space(cls, **params): + return { + 'C': { + 'domain': tune.loguniform(lower=0.03125, upper=32768.0), + 'init_value': 1.0, + }, + } + + @classmethod + def cost_relative2lgbm(cls): + return 160 + + def __init__(self, task='binary:logistic', n_jobs=1, tol=0.0001, C=1.0, + **params): + super().__init__(task, **params) + self.params = { + 'penalty': 'l1', + 'tol': float(tol), + 'C': float(C), + 'solver': 'saga', + 'n_jobs': n_jobs, + } + if 'regression' in task: + self.estimator_class = None + print('LR does not support regression task') + raise NotImplementedError + else: + self.estimator_class = LogisticRegression + + +class LRL2Classifier(SKLearnEstimator): + + + @classmethod + def search_space(cls, **params): + return LRL1Classifier.search_space(**params) + + @classmethod + def cost_relative2lgbm(cls): + return 25 + + def __init__(self, task='binary:logistic', n_jobs=1, tol=0.0001, C=1.0, + **params): + super().__init__(task, **params) + self.params = { + 'penalty': 'l2', + 'tol': float(tol), + 'C': float(C), + 'solver': 'lbfgs', + 'n_jobs': n_jobs, + } + if 'regression' in task: + self.estimator_class = None + print('LR does not support regression task') + raise NotImplementedError + else: + self.estimator_class = LogisticRegression + + +class CatBoostEstimator(BaseEstimator): 
+ + + _time_per_iter = None + _train_size = 0 + + @classmethod + def search_space(cls, data_size, **params): + upper = max(min(round(1500000/data_size),150), 11) + return { + 'early_stopping_rounds': { + 'domain': tune.qloguniform(lower=10, upper=upper, q=1), + 'init_value': 10, + }, + 'learning_rate': { + 'domain': tune.loguniform(lower=.005, upper=.2), + 'init_value': 0.1, + }, + } + + @classmethod + def size(cls, config): + n_estimators = 8192 + max_leaves = 64 + return (max_leaves*3 + (max_leaves-1)*4 + 1.0)*n_estimators*8 + + @classmethod + def cost_relative2lgbm(cls): + return 15 + + def __init__(self, task = 'binary:logistic', n_jobs=1, + n_estimators=8192, learning_rate=0.1, early_stopping_rounds=4, **params): + super().__init__(task, **params) + self.params = { + "early_stopping_rounds": int(round(early_stopping_rounds)), + "n_estimators": n_estimators, + 'learning_rate': learning_rate, + 'thread_count': n_jobs, + 'verbose': False, + 'random_seed': params[ + "random_seed"] if "random_seed" in params else 10242048, + } + if 'regression' in task: + from catboost import CatBoostRegressor + self.estimator_class = CatBoostRegressor + else: + from catboost import CatBoostClassifier + self.estimator_class = CatBoostClassifier + + def get_params(self, deep=False): + params = super().get_params() + params['n_jobs'] = params['thread_count'] + return params + + def fit(self, X_train, y_train, budget=None, **kwargs): + start_time = time.time() + n_iter = self.params["n_estimators"] + if isinstance(X_train, pd.DataFrame): + cat_features = list(X_train.select_dtypes( + include='category').columns) + else: + cat_features = [] + if (not CatBoostEstimator._time_per_iter or + abs(CatBoostEstimator._train_size-len(y_train))>4) and budget: + # measure the time per iteration + self.params["n_estimators"] = 1 + CatBoostEstimator._smallmodel = self.estimator_class(**self.params) + CatBoostEstimator._smallmodel.fit(X_train, y_train, + cat_features=cat_features, **kwargs) + 
CatBoostEstimator._t1 = time.time() - start_time + if CatBoostEstimator._t1 >= budget: + self.params["n_estimators"] = n_iter + self._model = CatBoostEstimator._smallmodel + return CatBoostEstimator._t1 + self.params["n_estimators"] = 4 + CatBoostEstimator._smallmodel = self.estimator_class(**self.params) + CatBoostEstimator._smallmodel.fit(X_train, y_train, + cat_features=cat_features, **kwargs) + CatBoostEstimator._time_per_iter = (time.time() - start_time - + CatBoostEstimator._t1)/(self.params["n_estimators"]-1) + if CatBoostEstimator._time_per_iter <= 0: + CatBoostEstimator._time_per_iter = CatBoostEstimator._t1 + CatBoostEstimator._train_size = len(y_train) + if time.time()-start_time>=budget or n_iter==self.params[ + "n_estimators"]: + self.params["n_estimators"] = n_iter + self._model = CatBoostEstimator._smallmodel + return time.time()-start_time + if budget: + train_times = 1 + self.params["n_estimators"] = min(n_iter, int((budget-time.time()+ + start_time-CatBoostEstimator._t1)/train_times/ + CatBoostEstimator._time_per_iter+1)) + self._model = CatBoostEstimator._smallmodel + if self.params["n_estimators"] > 0: + l = max(int(len(y_train)*0.9), len(y_train)-1000) + X_tr, y_tr = X_train[:l], y_train[:l] + if 'sample_weight' in kwargs: + weight = kwargs['sample_weight'] + if weight is not None: kwargs['sample_weight'] = weight[:l] + else: weight = None + from catboost import Pool + model = self.estimator_class(**self.params) + model.fit(X_tr, y_tr, cat_features=cat_features, eval_set=Pool( + data=X_train[l:], label=y_train[l:], cat_features=cat_features), + **kwargs) + if weight is not None: kwargs['sample_weight'] = weight + # print(self.params["n_estimators"], model.get_best_iteration()) + self._model = model + self.params["n_estimators"] = n_iter + train_time = time.time() - start_time + # print(budget, train_time) + return train_time + + +class KNeighborsEstimator(BaseEstimator): + + + @classmethod + def search_space(cls, data_size, **params): + upper = 
min(512, int(data_size/2)) + return { + 'n_neighbors': { + 'domain': tune.qloguniform(lower=1, upper=upper, q=1), + 'init_value': 5, + }, + } + + @classmethod + def cost_relative2lgbm(cls): + return 30 + + def __init__(self, task='binary:logistic', n_jobs=1, + n_neighbors=5, **params): + super().__init__(task, **params) + self.params= { + 'n_neighbors': int(round(n_neighbors)), + 'weights': 'distance', + 'n_jobs': n_jobs, + } + if 'regression' in task: + from sklearn.neighbors import KNeighborsRegressor + self.estimator_class = KNeighborsRegressor + else: + from sklearn.neighbors import KNeighborsClassifier + self.estimator_class = KNeighborsClassifier + + def _preprocess(self, X): + if isinstance(X, pd.DataFrame): + cat_columns = X.select_dtypes(['category']).columns + # print(X.dtypes) + # print(cat_columns) + if X.shape[1] == len(cat_columns): + raise ValueError( + "kneighbor requires at least one numeric feature") + X = X.drop(cat_columns, axis=1) + return X diff --git a/flaml/search.py b/flaml/search.py deleted file mode 100644 index 5c90ad7af..000000000 --- a/flaml/search.py +++ /dev/null @@ -1,675 +0,0 @@ -'''! - * Copyright (c) 2020 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. 
-''' - -from functools import partial -from .ml import train_estimator -import time -import math -import numpy as np -from .space import config_space, estimator_size, get_config_values, \ - generate_config_ini, generate_config_max, generate_config_min -from .config import SPLIT_RATIO, MIN_SAMPLE_TRAIN, \ - HISTORY_SIZE, MEM_THRES, BASE_Const, BASE_LOWER_BOUND -from random import gauss - - -def rand_vector_unit_sphere(dims): - vec = [gauss(0, 1) for i in range(dims)] - mag = sum(x**2 for x in vec) ** .5 - return [x / mag for x in vec] - - -def rand_vector_gaussian(dims): - vec = [gauss(0, 1) for i in range(dims)] - return vec - - -class ParamSearch: - ''' - the class for searching params for 1 learner - ''' - - def __init__(self, estimator, data_size, - compute_with_config, train_with_config, save_info_helper=None, - init_sample_size=MIN_SAMPLE_TRAIN, objective_name='regression', - log_type='better', config_space_info=None, size_estimator=None, - split_ratio=SPLIT_RATIO, base_change='sqrtK', use_dual_dir=True, - move_type='geo'): - self.log_type = log_type - self.base_change = base_change - if init_sample_size > data_size: - init_sample_size = data_size - self.next_sample_size = {} - self.prev_sample_size = {} - s = init_sample_size - self.prev_sample_size[s] = s - self.estimator_configspace = config_space_info or config_space( - estimator, data_size, objective_name) - self.get_size_for_config = size_estimator or ( - lambda x: estimator_size(x, estimator)) - config_min_dic_primary, config_min_dic_more, config_min_dic = \ - generate_config_min(estimator, self.estimator_configspace, None) - self.min_config_primary = np.array( - list(config_min_dic_primary.values())) - self.min_config_more = np.array(list(config_min_dic_more.values())) - self.min_config = np.array(list(config_min_dic.values())) - # init configurations for different sample size - config_init_dic_primary, config_init_dic_more, _, config_type_dic = \ - generate_config_ini(estimator, 
self.estimator_configspace) - self.init_config_dic_primary = {s: config_init_dic_primary} - self.init_config_dic_more = {s: config_init_dic_more} - self.init_config_dic_type_dic = {'primary': { - s: config_init_dic_primary}, 'more': {s: config_init_dic_more}} - self.init_config_dic = { - **self.init_config_dic_type_dic['primary'], - **self.init_config_dic_type_dic['more'] - } - self.config_type_dic = config_type_dic - # max configurations for different sample size - config_max_dic_primary, config_max_dic_more, config_max_dic = \ - generate_config_max( - estimator, self.estimator_configspace, int(s)) - self.max_config_dic_primary = {s: np.array( - list(config_max_dic_primary.values()))} - self.max_config_dic_more = {s: np.array( - list(config_max_dic_more.values()))} - self.max_config_dic = {s: np.array(list(config_max_dic.values()))} - self.dims = (len(self.min_config_primary), len(self.min_config_more)) - # print(self.dims) - if self.dims[1] > 0 and self.dims[0] > 0: - self.base_upper_bound = { - s: - max( - max( - (self.max_config_dic_primary[s][i] / self.min_config_primary[i]) - ** math.sqrt(self.dims[0]) for i in range(self.dims[0]) - ), - max( - (self.max_config_dic_more[s][i] / self.min_config_more[i]) - ** math.sqrt(self.dims[1]) for i in range(self.dims[1])) - ) - } - elif self.dims[0] > 0: - self.base_upper_bound = { - s: - max( - (self.max_config_dic_primary[s][i] / self.min_config_primary[i]) - ** (math.sqrt(self.dims[0])) for i in range(self.dims[0]) - ) - } - else: - self.base_upper_bound = { - s: - max( - (self.max_config_dic_more[s][i] / self.min_config_more[i]) - ** (math.sqrt(self.dims[1])) for i in range(self.dims[1]) - ) - } - - # create sample size sequence - while s < data_size: - s2 = self.next_sample_size[s] = s * 2 if s * 2 <= data_size else data_size - self.prev_sample_size[s2] = s - s = s2 - - config_max_dic_primary, config_max_dic_more, config_max_dic = \ - generate_config_max( - estimator, self.estimator_configspace, int(s)) - 
self.max_config_dic_primary[s] = np.array( - list(config_max_dic_primary.values())) - self.max_config_dic_more[s] = np.array( - list(config_max_dic_more.values())) - self.max_config_dic[s] = np.array(list(config_max_dic.values())) - if self.dims[1] > 0 and self.dims[0] > 0: - self.base_upper_bound[s] = max( - max( - (self.max_config_dic_primary[s][i] - / self.min_config_primary[i]) - ** math.sqrt(self.dims[0]) for i in range(self.dims[0]) - ), - max( - (self.max_config_dic_more[s][i] - / self.min_config_more[i]) - ** math.sqrt(self.dims[1]) for i in range(self.dims[1]) - ) - ) - elif self.dims[0] > 0: - self.base_upper_bound[s] = max( - (self.max_config_dic_primary[s][i] - / self.min_config_primary[i]) - ** math.sqrt(self.dims[0]) for i in range(self.dims[0]) - ) - else: - self.base_upper_bound[s] = max( - (self.max_config_dic_more[s][i] / self.min_config_more[i]) - ** math.sqrt(self.dims[1]) for i in range(self.dims[1]) - ) - - self.init_sample_size = init_sample_size - self.data_size = data_size - self.sample_size_full = int(self.data_size / (1.0 - split_ratio)) - - self.compute_with_config = compute_with_config - self.estimator = estimator - - # for logging - self.save_helper = save_info_helper - self.estimator_type_list = ['primary', 'more'] - self.dim = self.dims[0] if self.dims[0] > 0 else self.dims[1] - self.b = BASE_Const**(math.sqrt(self.dim)) - self.base_ini = self.b - self.total_dim = sum(self.dims) - - self.epo = 2**(self.dim - 1) - # keys are [sample size, config], values are (loss, train_time) - self.config_tried = {} - self.train_with_config = train_with_config - - self.current_config_loss = None - self.use_dual_dir = use_dual_dir - self.move_type = move_type - - def evaluate_config(self, config, sample_size, move='_pos'): - ''' - evaluate a configuration, update search state, - and return whether the state is changed - ''' - if self.time_from_start >= self.time_budget or move != '_ini' and \ - self.train_time > self.time_budget - 
self.time_from_start: - return False - - model, val_loss, new_train_time, from_history, train_loss = \ - self.evaluate_proposed_config(config, sample_size, move) - # update current config - self.update_current_config(config, val_loss, sample_size) - # update best model statistics, including statistics about loss and time - improved = self.update_search_state_best( - config, sample_size, model, val_loss, new_train_time, from_history) - self.time_from_start = time.time() - self.start_time - if self.save_helper is not None: - if from_history: - move = move + '_from_hist' - self.save_helper.append(self.model_count, - train_loss, - new_train_time, - self.time_from_start, - val_loss, - config, - self.best_loss, - self.best_config[0], - self.estimator, - sample_size) - return improved - - def get_hist_config_sig(self, sample_size, config): - config_values = get_config_values(config, self.config_type_dic) - config_sig = str(sample_size) + '_' + str(config_values) - return config_sig - - def evaluate_proposed_config(self, config, sample_size, move): - self.model_count += 1 - config_sig = self.get_hist_config_sig(sample_size, config) - d = self.total_dim - history_size_per_d = len(self.config_tried) / float(d) - if config_sig in self.config_tried: - val_loss, new_train_time = self.config_tried[config_sig] - # print(config_sig,'found in history') - model = train_loss = None - from_history = True - else: - model, val_loss, train_loss, new_train_time, _ = \ - self.compute_with_config(self.estimator, config, sample_size) - from_history = False - if history_size_per_d < HISTORY_SIZE: - self.config_tried[config_sig] = (val_loss, new_train_time) - - if self.first_move: - self.init_config_dic[sample_size] = config - move = '_ini' - self.base = self.base_ini - self.num_noimprovement = 0 - move = str(self.estimator) + move - return model, val_loss, new_train_time, from_history, train_loss - - def update_current_config(self, config, val_loss, sample_size): - if self.first_move or 
val_loss < self.current_config_loss: - self.first_move = False - # update current config and coressponding sample_size - self.sample_size = sample_size - self.config = config - self.config_primary = {x: config[x] - for x in self.config_primary.keys()} - try: - self.config_more = {x: config[x] - for x in self.config_more.keys()} - except: - self.config_more = {} - self.current_config_loss = val_loss - - def update_reset_best_config_loss(self, sample_size, config, val_loss): - if sample_size == self.data_size: - if self.best_config_loss_dic_full_reset[1] is None: - self.best_config_loss_dic_full_reset = [ - config, val_loss, self.model_count] - else: - full_reset_best_loss = self.best_config_loss_dic_full_reset[1] - if val_loss < full_reset_best_loss: - self.best_config_loss_dic_full_reset = [ - config, full_reset_best_loss, self.model_count] - - def update_search_state_best(self, config, sample_size, model, val_loss, - new_train_time, from_history): - # upate the loss statistics for a particular sample size - if sample_size not in self.best_config_loss_samplesize_dic: - self.best_config_loss_samplesize_dic[sample_size] = [ - config, val_loss, self.model_count] - else: - s_best_loss = self.best_config_loss_samplesize_dic[sample_size][1] - if val_loss < s_best_loss: - self.best_config_loss_samplesize_dic[sample_size] = [ - config, val_loss, self.model_count] - - self.update_reset_best_config_loss(sample_size, config, val_loss) - - # update best model statistics, including statistics about loss and time - if val_loss < self.new_loss: - self.old_loss = self.new_loss if self.new_loss < float( - 'inf') else 2 * val_loss - self.new_loss = val_loss - self.old_loss_time = self.new_loss_time - self.old_train_time = self.train_time - self.new_loss_time = self.train_time = new_train_time - if val_loss < self.best_loss: - self.best_config = [self.config, self.model_count] - if not from_history: - self.trained_estimator = model - # print(model) - else: - print(val_loss, 
self.best_loss) - self.best_loss = val_loss - self.time_best_found = self.time_from_start - return True - else: - if not from_history: - self.new_loss_time += new_train_time - return False - - def get_proposal(self, current_config, rand_vector_func, base, move_type): - rand_vector = rand_vector_func(len(current_config)) - rand_vector = [i for i in rand_vector] - rand_vector_neg = [-i for i in rand_vector] - - move_vector = {} - move_vector_neg = {} - - index_ = 0 - for k, v in current_config.items(): - if 'geo' in move_type: - # get the move vector using the proposed random vector - move_vector[k] = v * (base**(rand_vector[index_])) - move_vector_neg[k] = v * (base**(rand_vector_neg[index_])) - else: - move_vector[k] = v + (base * (rand_vector[index_])) - move_vector_neg[k] = v + (base * (rand_vector_neg[index_])) - index_ += 1 - - # as long as one of the proposed model (+ or -) is within the mem_limit - # we will proceed - if not self.use_dual_dir: - move_vector_neg = None - return move_vector, move_vector_neg - - def get_config_from_move_vector(self, v, estimator_type): - if v != None: - if 'all' in estimator_type: - v = v - elif 'primary' in estimator_type: - v = {**v, **self.config_more} - else: - v = {**self.config_primary, **v} - - bounded_v = self.get_v_within_min_max(v) - else: - bounded_v = None - return bounded_v - - def dual_direction_sample(self, base, current_search_config, - estimator_type='primary', rand_vector_func=rand_vector_unit_sphere, - mem_thres=MEM_THRES, move_type='geo'): - current_config = current_search_config - if len(current_config) == 0: - return None, None - bounded_v_list = [None, None] - while not bounded_v_list[0] and not bounded_v_list[ - 1] and self.time_from_start < self.time_budget: - move_vector, move_vector_neg = self.get_proposal( - current_config, rand_vector_func, - base, move_type) - bounded_v_list = [move_vector, move_vector_neg] - for i, v in enumerate(bounded_v_list): - bounded_v = self.get_config_from_move_vector(v, 
estimator_type) - proposed_model_size = self.get_size_for_config(bounded_v) - proposed_model_size = 0 if not isinstance( - proposed_model_size, float) else proposed_model_size - if proposed_model_size > mem_thres: - # print(bounded_v, proposed_model_size, mem_thres) - bounded_v = None - bounded_v_list[i] = bounded_v - self.time_from_start = time.time() - self.start_time - return bounded_v_list - - def get_v_within_min_max(self, v): - index_ = 0 - bounded_v = {} - for key, value in v.items(): - new_value = min(max( - value, self.min_config[index_]), self.max_config_dic[ - self.sample_size][index_]) - bounded_v[key] = new_value - index_ += 1 - return bounded_v - - def expected_time_improvement_search(self): - return max(self.old_loss_time - self.old_train_time + self.train_time, - self.new_loss_time) - - def increase_sample_size(self): - ''' - whether it's time to increase sample size - ''' - expected_time_improvement_sample = 2 * self.train_time - self.increase = self.sample_size < self.data_size and ( - self.estimator_type == 0 or self.dims[0] == 0) and ( - not self.improved - or expected_time_improvement_sample - < self.expected_time_improvement_search() - ) - return self.increase - - def search_begin(self, time_budget, start_time=None): - self.time_budget = time_budget - if not start_time: - self.start_time = time.time() - else: - self.start_time = start_time - # the time to train the last selected config - self.old_train_time = self.train_time = 0 - self.time_from_start = 0 - # search states - self.first_move = True - self.improved = True - self.estimator_type = 0 if self.dims[0] > 0 else 1 - - self.old_loss = self.new_loss = self.best_loss = float('+inf') - # new_loss_time is the time from the beginning of training self.config to - # now, - # old_loss_time is the time from the beginning of training the old - # self.config to the beginning of training self.config - self.old_loss_time = self.new_loss_time = 0 - - self.trained_estimator = None - self.model_count = 
0 - self.K = 0 - self.old_modelcount = 0 - - # self.config has two parts: config_primary contain the configs - # that are related with model complexity, config_more contains the - # configs that is not related with model complexity - self.config_primary = self.init_config_dic_primary[self.init_sample_size] - self.config_more = self.init_config_dic_more[self.init_sample_size] - self.config = {**self.config_primary, **self.config_more} - self.best_config = [None, None] - # key: sample size, value: [best_config, best_loss, model_count] under - # sample size in the key - self.best_config_loss_samplesize_dic = { - self.init_sample_size: [self.config, self.old_loss, self.model_count]} - # key: sample size, value: [best_config, best_loss, model_count] under - # sample size in the key - self.best_config_loss_dic_full_reset = [None, None, None] - self.sample_size = self.init_sample_size - self.base_change_bound = 1 - self.base_change_count = 0 - self.evaluate_config(self.config, self.sample_size, '_ini') - self.increase = False - - def train_config(self, config, sample_size): - ''' - train a configuration - ''' - # print('Evalute Config') - if self.time_from_start >= self.time_budget: - return False - config_sig = self.get_hist_config_sig(sample_size, config) - if not config_sig in self.config_tried: - _, new_train_time = self.train_with_config( - self.estimator, config, sample_size) - train_loss, val_loss, move = None, self.new_loss, str( - self.estimator) + '_trainAll' - self.time_from_start = time.time() - self.start_time - if self.save_helper is not None: - self.save_helper.append(self.model_count, - train_loss, - new_train_time, - self.time_from_start, - val_loss, - config, - self.best_loss, - self.best_config, - move, - sample_size) - self.config_tried[config_sig] = (val_loss, new_train_time) - - def try_increase_sample_size(self): - # print( self.estimator, self.sample_size) - if self.sample_size in self.next_sample_size: - if self.increase_sample_size(): - 
self.first_move = True - self.improved = True - self.estimator_type = 0 if self.dims[0] > 0 else 1 - self.evaluate_config( - self.config, self.next_sample_size[self.sample_size]) - if not self.old_modelcount and self.sample_size == self.data_size: - self.old_modelcount = self.model_count - - def setup_current_search_config(self): - estimator_type = self.estimator_type_list[self.estimator_type] - if 'all' in estimator_type: - current_search_config = self.config - elif 'primary' in estimator_type: - current_search_config = self.config_primary - else: - current_search_config = self.config_more - # print(self.config_more) - return estimator_type, current_search_config - - def search1step(self, global_best_loss=float('+inf'), - retrain_full=True, mem_thres=MEM_THRES, reset_type='init_gaussian'): - # try to increase sample size - self.try_increase_sample_size() - # decide current_search_config according to estimator_type - estimator_type, current_search_config = \ - self.setup_current_search_config() - time_left = self.time_budget - self.time_from_start - if time_left < self.train_time: - return False - if retrain_full and self.train_time < time_left < 2 * self.train_time \ - and self.best_loss <= global_best_loss: - self.train_config(self.best_config[0], self.sample_size_full) - - move_vector, move_vector_neg = self.dual_direction_sample( - self.base, current_search_config, estimator_type, - rand_vector_unit_sphere, mem_thres, self.move_type) - if move_vector is None: - if move_vector_neg is None: - self.improved = False - else: - self.improved = self.evaluate_config( - move_vector_neg, self.sample_size, '_neg' + str( - estimator_type)) - else: - self.improved = self.evaluate_config( - move_vector, self.sample_size, '_pos' + str(estimator_type)) - if not self.improved: - if move_vector_neg is None: - pass - else: - self.improved = self.evaluate_config( - move_vector_neg, self.sample_size, '_neg' + str( - estimator_type)) - self.update_noimprovement_stat( - 
global_best_loss, retrain_full, reset_type) - return self.improved - - def update_noimprovement_stat(self, global_best_loss, retrain_full, - reset_type): - if self.improved: - self.num_noimprovement = 0 - else: - self.estimator_type = 1 - self.estimator_type - if self.dims[self.estimator_type] == 0: - self.estimator_type = 1 - self.estimator_type - if self.estimator_type == 1 or self.dims[1] == 0: - self.noimprovement(global_best_loss, retrain_full, reset_type) - - def noimprovement(self, global_best_loss, retrain_full, reset_type='org'): - if self.sample_size == self.data_size: - # Do not wait until full sample size to update num_noimprovement? - self.num_noimprovement += 1 - if self.num_noimprovement >= self.epo: - self.num_noimprovement = 0 - # print(self.num_noimprovement, self.epo) - if self.base_change == 'squareroot': - self.base = math.sqrt(self.base) - else: - if self.K == 0: # first time - oldK = self.best_config_loss_dic_full_reset[2] - \ - self.old_modelcount - else: - oldK = self.K - self.K = self.model_count + 1 - self.old_modelcount - if self.base_change == 'K': - self.base **= oldK / self.K - else: - self.base **= math.sqrt(oldK / self.K) - if self.dims[1] > 0 and self.dims[0] > 0: - base_lower_bound = min( - min( - (1.0 + self.estimator_configspace[i].min_change - / self.config_primary[i]) - ** math.sqrt(self.dims[0]) - for i in self.config_primary.keys() - ), - min( - (1.0 + self.estimator_configspace[i].min_change - / self.config_more[i]) - ** math.sqrt(self.dims[1]) - for i in self.config_more.keys() - ) - ) - elif self.dims[0] > 0: - base_lower_bound = min( - (1.0 + self.estimator_configspace[i].min_change - / self.config_primary[i]) - ** math.sqrt(self.dims[0]) - for i in self.config_primary.keys() - ) - else: - base_lower_bound = min( - (1.0 + self.estimator_configspace[i].min_change - / self.config_more[i]) - ** math.sqrt(self.dims[1]) - for i in self.config_more.keys() - ) - if np.isinf(base_lower_bound): - base_lower_bound = 
BASE_LOWER_BOUND - self.base_change_count += 1 - if self.base <= base_lower_bound or \ - self.base_change_count == self.base_change_bound: - if retrain_full and self.sample_size == self.data_size: - if self.best_loss <= global_best_loss: - # Only train on full data when the curent estimator - # is the best estimator - # print('best estimator and train on full data') - self.train_config( - self.best_config[0], self.sample_size_full) - # remaining time is more than enough for another trial - if self.time_budget - self.time_from_start > self.train_time: - self.base_change_bound <<= 1 - self.base_change_count = 0 - self.K = 0 - self.old_modelcount = self.model_count - self.best_config_loss_dic_full_reset = [None, None, - None] - self.first_move = True - self.improved = True - self.base_ini = min( - self.base_ini * 2, self.base_upper_bound[ - self.sample_size]) - self.estimator_type = 0 if self.dims[0] > 0 else 1 - reset_config, reset_sample_size = self.get_reset_config( - self.init_sample_size, reset_type) - self.sample_size = reset_sample_size - # print('reset sample size', reset_sample_size) - self.evaluate_config(reset_config, self.sample_size, - '_ini') - - def get_reset_config(self, sample_size, reset_type): - init_config = self.init_config_dic[self.sample_size] - reset_sample_size = sample_size - if 'org' in reset_type: - reset_config = init_config - else: - if 'init_gaussian' in reset_type: - reset_config = init_config - reset_sample_size = self.get_reset_sample_size(reset_config) - config_values = get_config_values( - reset_config, self.config_type_dic) - config_sig = str(reset_sample_size) + '_' + str(config_values) - count = 0 - while config_sig in self.config_tried and \ - self.time_from_start < self.time_budget and count < 1000: - # TODO: check exhaustiveness? use time as condition? 
- count += 1 - move, move_neg = self.dual_direction_sample( - base=self.b, current_search_config=init_config, - estimator_type='all', - rand_vector_func=rand_vector_gaussian, - move_type=self.move_type) - if move: - reset_config = move_neg - elif move_neg: - reset_config = move_neg - else: - continue - reset_sample_size = self.get_reset_sample_size( - reset_config) - config_values = get_config_values( - reset_config, self.config_type_dic) - config_sig = str(reset_sample_size) + \ - '_' + str(config_values) - self.time_from_start = time.time() - self.start_time - else: - raise NotImplementedError - return reset_config, reset_sample_size - - def get_reset_sample_size(self, reset_config): - if not reset_config: - print('reset_config is none') - reset_config_size = self.get_size_for_config(reset_config) - - candidate_sample_size_list = [] - for sample_size, config_and_bestloss in \ - self.best_config_loss_samplesize_dic.items(): - s_best_config = config_and_bestloss[0] - if not s_best_config: - print('best config is none', sample_size) - s_best_config_model_size = self.get_size_for_config(s_best_config) - if s_best_config_model_size >= reset_config_size: - candidate_sample_size_list.append(sample_size) - - if len(candidate_sample_size_list) != 0: - return min(candidate_sample_size_list) - else: - return self.data_size diff --git a/flaml/searcher/__init__.py b/flaml/searcher/__init__.py new file mode 100644 index 000000000..10a518a92 --- /dev/null +++ b/flaml/searcher/__init__.py @@ -0,0 +1,2 @@ +from .blendsearch import CFO, BlendSearch +from .flow2 import FLOW2 \ No newline at end of file diff --git a/flaml/searcher/blendsearch.py b/flaml/searcher/blendsearch.py new file mode 100644 index 000000000..97d12bd7b --- /dev/null +++ b/flaml/searcher/blendsearch.py @@ -0,0 +1,419 @@ +'''! + * Copyright (c) 2020-2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the + * project root for license information. 
+''' +from typing import Dict, Optional, List, Tuple +import numpy as np +import time +import pickle +try: + from ray.tune.suggest import Searcher + from ray.tune.suggest.optuna import OptunaSearch as GlobalSearch + from ray.tune.suggest.variant_generator import generate_variants +except ImportError: + from .suggestion import Searcher, OptunaSearch as GlobalSearch + from .variant_generator import generate_variants +from .search_thread import SearchThread +from .flow2 import FLOW2 as LocalSearch + +import logging +logger = logging.getLogger(__name__) + + +class BlendSearch(Searcher): + '''class for BlendSearch algorithm + ''' + + def __init__(self, + metric: Optional[str] = None, + mode: Optional[str] = None, + space: Optional[dict] = None, + points_to_evaluate: Optional[List[Dict]] = None, + cat_hp_cost: Optional[dict] = None, + prune_attr: Optional[str] = None, + min_resource: Optional[float] = None, + max_resource: Optional[float] = None, + reduction_factor: Optional[float] = None, + resources_per_trial: Optional[dict] = None, + global_search_alg: Optional[Searcher] = None, + mem_size = None): + '''Constructor + + Args: + metric: A string of the metric name to optimize for. + minimization or maximization. + mode: A string in ['min', 'max'] to specify the objective as + space: A dictionary to specify the search space. + points_to_evaluate: Initial parameter suggestions to be run first. + The first element needs to be a dictionary from a subset of + controlled dimensions to the initial low-cost values. + e.g., + + .. code-block:: python + + [{'epochs': 1}] + + cat_hp_cost: A dictionary from a subset of categorical dimensions + to the relative cost of each choice. + e.g., + + .. code-block:: python + + {'tree_method': [1, 1, 2]} + + i.e., the relative cost of the + three choices of 'tree_method' is 1, 1 and 2 respectively. + prune_attr: A string of the attribute used for pruning. + Not necessarily in space. 
+ When prune_attr is in space, it is a hyperparameter, e.g., + 'n_iters', and the best value is unknown. + When prune_attr is not in space, it is a resource dimension, + e.g., 'sample_size', and the peak performance is assumed + to be at the max_resource. + min_resource: A float of the minimal resource to use for the + prune_attr; only valid if prune_attr is not in space. + max_resource: A float of the maximal resource to use for the + prune_attr; only valid if prune_attr is not in space. + reduction_factor: A float of the reduction factor used for + incremental pruning. + resources_per_trial: A dictionary of the resources permitted per + trial, such as 'mem'. + global_search_alg: A Searcher instance as the global search + instance. If omitted, Optuna is used. The following algos have + known issues when used as global_search_alg: + - HyperOptSearch raises exception sometimes + - TuneBOHB has its own scheduler + mem_size: A function to estimate the memory size for a given config. + ''' + self._metric, self._mode = metric, mode + if points_to_evaluate: init_config = points_to_evaluate[0] + else: init_config = {} + self._points_to_evaluate = points_to_evaluate + if global_search_alg is not None: + self._gs = global_search_alg + elif getattr(self, '__name__', None) != 'CFO': + self._gs = GlobalSearch(space=space, metric=metric, mode=mode) + else: + self._gs = None + self._ls = LocalSearch(init_config, metric, mode, cat_hp_cost, space, + prune_attr, min_resource, max_resource, reduction_factor) + self._resources_per_trial = resources_per_trial + self._mem_size = mem_size + self._mem_threshold = resources_per_trial.get( + 'mem') if resources_per_trial else None + self._init_search() + + def set_search_properties(self, + metric: Optional[str] = None, + mode: Optional[str] = None, + config: Optional[Dict] = None) -> bool: + if self._ls.space: + if 'time_budget_s' in config: + self._deadline = config.get('time_budget_s') + time.time() + if 'metric_target' in config: + 
self._metric_target = config.get('metric_target') + else: + self._metric, self._mode = metric, mode + self._ls.set_search_properties(metric, mode, config) + self._gs.set_search_properties(metric, mode, config) + self._init_search() + return True + + def _init_search(self): + '''initialize the search + ''' + self._metric_target = np.inf * self._ls.metric_op + self._search_thread_pool = { + # id: int -> thread: SearchThread + 0: SearchThread(self._ls.mode, self._gs) + } + self._thread_count = 1 # total # threads created + self._init_used = self._ls.init_config is None + self._trial_proposed_by = {} # trial_id: str -> thread_id: int + self._admissible_min = self._ls.normalize(self._ls.init_config) + self._admissible_max = self._admissible_min.copy() + self._result = {} # config_signature: tuple -> result: Dict + self._deadline = np.inf + + def save(self, checkpoint_path: str): + save_object = (self._metric_target, self._search_thread_pool, + self._thread_count, self._init_used, self._trial_proposed_by, + self._admissible_min, self._admissible_max, self._result, + self._deadline) + with open(checkpoint_path, "wb") as outputFile: + pickle.dump(save_object, outputFile) + + def restore(self, checkpoint_path: str): + with open(checkpoint_path, "rb") as inputFile: + save_object = pickle.load(inputFile) + self._metric_target, self._search_thread_pool, \ + self._thread_count, self._init_used, self._trial_proposed_by, \ + self._admissible_min, self._admissible_max, self._result, \ + self._deadline = save_object + + def restore_from_dir(self, checkpoint_dir: str): + super.restore_from_dir(checkpoint_dir) + + def on_trial_complete(self, trial_id: str, result: Optional[Dict] = None, + error: bool = False): + ''' search thread updater and cleaner + ''' + thread_id = self._trial_proposed_by.get(trial_id) + if thread_id in self._search_thread_pool: + self._search_thread_pool[thread_id].on_trial_complete( + trial_id, result, error) + del self._trial_proposed_by[trial_id] + # if not 
thread_id: logger.info(f"result {result}") + if result: + config = {} + for key, value in result.items(): + if key.startswith('config/'): + config[key[7:]] = value + if error: # remove from result cache + del self._result[self._ls.config_signature(config)] + else: # add to result cache + self._result[self._ls.config_signature(config)] = result + # update target metric if improved + if (result[self._metric]-self._metric_target)*self._ls.metric_op<0: + self._metric_target = result[self._metric] + if thread_id: # from local search + # update admissible region + normalized_config = self._ls.normalize(config) + for key in self._admissible_min: + value = normalized_config[key] + if value > self._admissible_max[key]: + self._admissible_max[key] = value + elif value < self._admissible_min[key]: + self._admissible_min[key] = value + elif self._create_condition(result): + # thread creator + self._search_thread_pool[self._thread_count] = SearchThread( + self._ls.mode, + self._ls.create(config, result[self._metric], cost=result[ + "time_total_s"]) + ) + thread_id = self._thread_count + self._thread_count += 1 + + # cleaner + # logger.info(f"thread {thread_id} in search thread pool=" + # f"{thread_id in self._search_thread_pool}") + if thread_id and thread_id in self._search_thread_pool: + # local search thread + self._clean(thread_id) + + def _create_condition(self, result: Dict) -> bool: + ''' create thread condition + ''' + if len(self._search_thread_pool) < 2: return True + obj_median = np.median([thread.obj_best1 for id, thread in + self._search_thread_pool.items() if id]) + return result[self._metric] * self._ls.metric_op < obj_median + + def _clean(self, thread_id: int): + ''' delete thread and increase admissible region if converged, + merge local threads if they are close + ''' + assert thread_id + todelete = set() + for id in self._search_thread_pool: + if id and id!=thread_id: + if self._inferior(id, thread_id): + todelete.add(id) + for id in 
self._search_thread_pool: + if id and id!=thread_id: + if self._inferior(thread_id, id): + todelete.add(thread_id) + break + # logger.info(f"thead {thread_id}.converged=" + # f"{self._search_thread_pool[thread_id].converged}") + if self._search_thread_pool[thread_id].converged: + todelete.add(thread_id) + for key in self._admissible_min: + self._admissible_max[key] += self._ls.STEPSIZE + self._admissible_min[key] -= self._ls.STEPSIZE + for id in todelete: + del self._search_thread_pool[id] + + def _inferior(self, id1: int, id2: int) -> bool: + ''' whether thread id1 is inferior to id2 + ''' + t1 = self._search_thread_pool[id1] + t2 = self._search_thread_pool[id2] + if t1.obj_best1 < t2.obj_best2: return False + elif t1.resource and t1.resource < t2.resource: return False + elif t2.reach(t1): return True + else: return False + + def on_trial_result(self, trial_id: str, result: Dict): + if trial_id not in self._trial_proposed_by: return + thread_id = self._trial_proposed_by[trial_id] + if not thread_id in self._search_thread_pool: return + self._search_thread_pool[thread_id].on_trial_result(trial_id, result) + + def suggest(self, trial_id: str) -> Optional[Dict]: + ''' choose thread, suggest a valid config + ''' + if self._init_used and not self._points_to_evaluate: + choice, backup = self._select_thread() + # logger.debug(f"choice={choice}, backup={backup}") + if choice < 0: return None # timeout + self._use_rs = False + config = self._search_thread_pool[choice].suggest(trial_id) + skip = self._should_skip(choice, trial_id, config) + if skip: + if choice: + # logger.info(f"skipping choice={choice}, config={config}") + return None + # use rs + self._use_rs = True + for _, generated in generate_variants( + {'config': self._ls.space}): + config = generated['config'] + break + # logger.debug(f"random config {config}") + skip = self._should_skip(choice, trial_id, config) + if skip: return None + # if not choice: logger.info(config) + if choice or backup == choice or 
self._valid(config): + # LS or valid or no backup choice + self._trial_proposed_by[trial_id] = choice + else: # invalid config proposed by GS + if not self._use_rs: + self._search_thread_pool[choice].on_trial_complete( + trial_id, {}, error=True) # tell GS there is an error + self._use_rs = False + config = self._search_thread_pool[backup].suggest(trial_id) + skip = self._should_skip(backup, trial_id, config) + if skip: + return None + self._trial_proposed_by[trial_id] = backup + choice = backup + # if choice: self._pending.add(choice) # local search thread pending + if not choice: + if self._ls._resource: + # TODO: add resource to config proposed by GS, min or median? + config[self._ls.prune_attr] = self._ls.min_resource + self._result[self._ls.config_signature(config)] = {} + else: # use init config + init_config = self._points_to_evaluate.pop( + 0) if self._points_to_evaluate else self._ls.init_config + if init_config==self._ls.init_config: + config = self._ls.complete_config(init_config, + self._admissible_min, self._admissible_max) + # logger.info(f"reset config to {config}") + else: config = init_config + config_signature = self._ls.config_signature(config) + result = self._result.get(config_signature) + if result: # tried before + # self.on_trial_complete(trial_id, result) + return None + elif result is None: # not tried before + self._result[config_signature] = {} + else: return None # running but no result yet + self._init_used = True + self._trial_proposed_by[trial_id] = 0 + # logger.info(f"config={config}") + return config + + def _should_skip(self, choice, trial_id, config) -> bool: + ''' if config is None or config's result is known or above mem threshold + return True; o.w. 
return False + ''' + if config is None: return True + config_signature = self._ls.config_signature(config) + exists = config_signature in self._result + # check mem constraint + if not exists and self._mem_threshold and self._mem_size( + config)>self._mem_threshold: + self._result[config_signature] = { + self._metric:np.inf*self._ls.metric_op, 'time_total_s':1} + exists = True + if exists: + if not self._use_rs: + result = self._result.get(config_signature) + if result: + self._search_thread_pool[choice].on_trial_complete( + trial_id, result, error=False) + if choice: + # local search thread + self._clean(choice) + else: + # tell the thread there is an error + self._search_thread_pool[choice].on_trial_complete( + trial_id, {}, error=True) + return True + return False + + def _select_thread(self) -> Tuple: + ''' thread selector; use can_suggest to check LS availability + ''' + # update priority + min_eci = self._deadline - time.time() + if min_eci <= 0: return -1, -1 + max_speed = 0 + for thread in self._search_thread_pool.values(): + if thread.speed > max_speed: max_speed = thread.speed + for thread in self._search_thread_pool.values(): + thread.update_eci(self._metric_target, max_speed) + if thread.eci < min_eci: min_eci = thread.eci + for thread in self._search_thread_pool.values(): + thread.update_priority(min_eci) + + top_thread_id = backup_thread_id = 0 + priority1 = priority2 = self._search_thread_pool[0].priority + # logger.debug(f"priority of thread 0={priority1}") + for thread_id, thread in self._search_thread_pool.items(): + # if thread_id: + # logger.debug( + # f"priority of thread {thread_id}={thread.priority}") + # logger.debug( + # f"thread {thread_id}.can_suggest={thread.can_suggest}") + if thread_id and thread.can_suggest: + priority = thread.priority + if priority > priority1: + priority1 = priority + top_thread_id = thread_id + if priority > priority2 or backup_thread_id == 0: + priority2 = priority + backup_thread_id = thread_id + return 
top_thread_id, backup_thread_id + + def _valid(self, config: Dict) -> bool: + ''' config validator + ''' + for key in self._admissible_min: + if key in config: + value = config[key] + # logger.info( + # f"{key},{value},{self._admissible_min[key]},{self._admissible_max[key]}") + if valueself._admissible_max[key]: + return False + return True + + +class CFO(BlendSearch): + ''' class for CFO algorithm + Number of threads is 1 or 2. Thread 0 is a vacuous thread. + ''' + + __name__ = 'CFO' + + def suggest(self, trial_id: str) -> Optional[Dict]: + assert len(self._search_thread_pool)<3, len(self._search_thread_pool) + if len(self._search_thread_pool) < 2: + # When a local converges, the number of threads is 1. + # Need to restart + self._init_used = False + return super().suggest(trial_id) + + def _select_thread(self) -> Tuple: + for key in self._search_thread_pool: + if key: return key, key + + def _create_condition(self, result: Dict) -> bool: + ''' create thread condition + ''' + return len(self._search_thread_pool) < 2 diff --git a/flaml/searcher/flow2.py b/flaml/searcher/flow2.py new file mode 100644 index 000000000..a4f244b96 --- /dev/null +++ b/flaml/searcher/flow2.py @@ -0,0 +1,588 @@ +'''! + * Copyright (c) 2020-2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the + * project root for license information. 
+''' +from typing import Dict, Optional +import numpy as np +try: + from ray.tune.suggest import Searcher + from ray.tune.suggest.variant_generator import generate_variants + from ray.tune import sample +except ImportError: + from .suggestion import Searcher + from .variant_generator import generate_variants + from ..tune import sample + + +import logging +logger = logging.getLogger(__name__) + + +class FLOW2(Searcher): + '''Local search algorithm FLOW2, with adaptive step size + ''' + + STEPSIZE = 0.1 + STEP_LOWER_BOUND = 0.0001 + cost_attr = 'time_total_s' + + def __init__(self, + init_config: dict, + metric: Optional[str] = None, + mode: Optional[str] = None, + cat_hp_cost: Optional[dict] = None, + space: Optional[dict] = None, + prune_attr: Optional[str] = None, + min_resource: Optional[float] = None, + max_resource: Optional[float] = None, + resource_multiple_factor: Optional[float] = 4, + seed: Optional[int] = 20): + '''Constructor + + Args: + init_config: a dictionary from a subset of controlled dimensions + to the initial low-cost values. e.g. {'epochs':1} + metric: A string of the metric name to optimize for. + minimization or maximization. + mode: A string in ['min', 'max'] to specify the objective as + cat_hp_cost: A dictionary from a subset of categorical dimensions + to the relative cost of each choice. + e.g., + + .. code-block:: python + + {'tree_method': [1, 1, 2]} + + i.e., the relative cost of the + three choices of 'tree_method' is 1, 1 and 2 respectively. + space: A dictionary to specify the search space. + prune_attr: A string of the attribute used for pruning. + Not necessarily in space. + When prune_attr is in space, it is a hyperparameter, e.g., + 'n_iters', and the best value is unknown. + When prune_attr is not in space, it is a resource dimension, + e.g., 'sample_size', and the peak performance is assumed + to be at the max_resource. 
+ min_resource: A float of the minimal resource to use for the + prune_attr; only valid if prune_attr is not in space. + max_resource: A float of the maximal resource to use for the + prune_attr; only valid if prune_attr is not in space. + resource_multiple_factor: A float of the multiplicative factor + used for increasing resource. + seed: An integer of the random seed. + ''' + if mode: + assert mode in ["min", "max"], "`mode` must be 'min' or 'max'." + else: + mode = "min" + + super(FLOW2, self).__init__( + metric=metric, + mode=mode) + # internally minimizes, so "max" => -1 + if mode == "max": + self.metric_op = -1. + elif mode == "min": + self.metric_op = 1. + self.space = space or {} + self._random = np.random.RandomState(seed) + self._seed = seed + if not init_config: + logger.warning( + "No init config given to FLOW2. Using random initial config." + "For cost-frugal search, " + "consider providing init values for cost-related hps via " + "'init_config'." + ) + self.init_config = self.best_config = init_config + self.cat_hp_cost = cat_hp_cost + self.prune_attr = prune_attr + self.min_resource = min_resource + self.resource_multiple_factor = resource_multiple_factor or 4 + self.max_resource = max_resource + self._resource = None + self._step_lb = np.Inf + if space: + self._init_search() + + def _init_search(self): + self._tunable_keys = [] + self._bounded_keys = [] + # choices of numeric values. integer encoding. + # value: (ordered list of choices, + # dict from choice to index in the ordered list) + self._ordered_choice_hp = {} + # choices with given cost. integer encoding. + # value: (array of choices ordered by cost, + # dict from choice to index in the ordered array) + self._ordered_cat_hp = {} + # unordered choices. value: cardinality + self._unordered_cat_hp = {} + self._cat_hp_cost = {} + for key, domain in self.space.items(): + assert not isinstance(domain, dict), \ + key+"'s domain is grid search which is not supported in FLOW2." 
+ if callable(getattr(domain, 'get_sampler', None)): + self._tunable_keys.append(key) + sampler = domain.get_sampler() + if isinstance(sampler, sample.Quantized): + sampler_inner = sampler.get_sampler() + if str(sampler_inner) == 'Uniform': + self._step_lb = min( + self._step_lb, sampler.q/(domain.upper-domain.lower)) + elif isinstance(domain, sample.Integer) and str( + sampler) == 'Uniform': + self._step_lb = min( + self._step_lb, 1.0/(domain.upper-domain.lower)) + elif isinstance(domain, sample.Categorical): + cat_hp_cost = self.cat_hp_cost + if cat_hp_cost and key in cat_hp_cost: + cost = np.array(cat_hp_cost[key]) + ind = np.argsort(cost) + l = np.array(domain.categories)[ind] + cost = self._cat_hp_cost[key] = cost[ind] + d = {} + for i, choice in enumerate(l): + d[choice] = i + self._ordered_cat_hp[key] = (l, d) + self._step_lb = min(self._step_lb, 1.0/len(l)) + elif all(isinstance(x, int) or isinstance(x, float) + for x in domain.categories): + l = sorted(domain.categories) + d = {} + for i, choice in enumerate(l): + d[choice] = i + self._ordered_choice_hp[key] = (l, d) + self._step_lb = min(self._step_lb, 1.0/len(l)) + else: + self._unordered_cat_hp[key] = l = len(domain.categories) + self._step_lb = min(self._step_lb, 1.0/l) + if str(sampler) != 'Normal': + self._bounded_keys.append(key) + self._space_keys = list(self.space.keys()) + if (self.prune_attr and self.prune_attr not in self.space and + self.max_resource): + self._space_keys.append(self.prune_attr) + self.min_resource = self.min_resource or self._min_resource() + self._resource = self._round(self.min_resource) + # logger.info(min_resource) + # logger.info(max_resource) + # logger.info(self._resource) + else: self._resource = None + self.incumbent = {} + self.incumbent = self.normalize(self.init_config) + self.best_obj = self.cost_incumbent = None + self.dim = len(self._tunable_keys) # total # tunable dimensions + self._direction_tried = None + self._num_complete4incumbent = 
self._cost_complete4incumbent = 0 + self._num_allowed4incumbent = 2 * self.dim + self._proposed_by = {} # trial_id: int -> incumbent: Dict + self.step = self.STEPSIZE * np.sqrt(self.dim) + lb = self.step_lower_bound + if lb > self.step: self.step = lb * 2 + # upper bound + self.step_ub = np.sqrt(self.dim) + if self.step > self.step_ub: self.step = self.step_ub + # maximal # consecutive no improvements + self.dir = 2**(self.dim) + self._configs = {} # dict from trial_id to config + self._K = 0 + self._iter_best_config = self.trial_count = 1 + self._reset_times = 0 + + @property + def step_lower_bound(self) -> float: + step_lb = self._step_lb + for key in self._tunable_keys: + domain = self.space[key] + sampler = domain.get_sampler() + if isinstance(sampler, sample.Quantized): + sampler_inner = sampler.get_sampler() + if str(sampler_inner) == 'LogUniform': + step_lb = min(step_lb, + np.log(1.0+sampler.q/self.best_config[key])/ + np.log(domain.upper/domain.lower)) + elif isinstance(domain, sample.Integer) and str( + sampler) == 'LogUniform': + step_lb = min(step_lb, + np.log(1.0+1.0/self.best_config[key])/ + np.log(domain.upper/domain.lower)) + if np.isinf(step_lb): step_lb = self.STEP_LOWER_BOUND + else: step_lb *= np.sqrt(self.dim) + return step_lb + + @property + def resource(self) -> float: + return self._resource + + def _min_resource(self) -> float: + ''' automatically decide minimal resource + ''' + return self.max_resource / np.power(self.resource_multiple_factor, 5) + + def _round(self, resource) -> float: + ''' round the resource to self.max_resource if close to it + ''' + if resource * self.resource_multiple_factor > self.max_resource: + return self.max_resource + return resource + + def rand_vector_gaussian(self, dim, std = 1.0): + vec = self._random.normal(0, std, dim) + return vec + + def complete_config(self, partial_config: Dict, + lower: Optional[Dict] = None, upper: Optional[Dict] = None) -> Dict: + ''' generate a complete config from the partial 
config input + add minimal resource to config if available + ''' + if self._reset_times: # not the first time, use random gaussian + normalized = self.normalize(partial_config) + for key in normalized: + # don't change unordered cat choice + if key not in self._unordered_cat_hp: + if upper and lower: + u, l = upper[key], lower[key] + gauss_std = u-l + # allowed bound + u += self.STEPSIZE + l -= self.STEPSIZE + elif key in self._bounded_keys: + u, l, gauss_std = 1, 0, 1.0 + else: u, l, gauss_std = np.Inf, -np.Inf, 1.0 + if key in self._bounded_keys: + u = min(u, 1) + l = max(l, 0) + delta = self.rand_vector_gaussian(1, gauss_std)[0] + normalized[key] = max(l, min(u, normalized[key] + delta)) + # use best config for unordered cat choice + config = self.denormalize(normalized) + else: + config = partial_config.copy() + + for key, value in self.space.items(): + if key not in config: + config[key] = value + logger.debug(f'before random {config}') + for _, generated in generate_variants({'config': config}): + config = generated['config'] + break + logger.debug(f'after random {config}') + + if self._resource: + config[self.prune_attr] = self.min_resource + self._reset_times += 1 + return config + + def create(self, init_config: Dict, obj: float, cost: float) -> Searcher: + flow2 = FLOW2(init_config, self.metric, self.mode, self._cat_hp_cost, + self.space, self.prune_attr, self.min_resource, + self.max_resource, self.resource_multiple_factor, + self._seed+1) + flow2.best_obj = obj * self.metric_op # minimize internally + flow2.cost_incumbent = cost + return flow2 + + def normalize(self, config) -> Dict: + ''' normalize each dimension in config to [0,1] + ''' + config_norm = {} + for key, value in config.items(): + if key in self.space: + # domain: sample.Categorical/Integer/Float/Function + domain = self.space[key] + if not callable(getattr(domain, 'get_sampler', None)): + config_norm[key] = value + else: + if isinstance(domain, sample.Categorical): + # normalize 
categorical + if key in self._ordered_cat_hp: + l, d = self._ordered_cat_hp[key] + config_norm[key] = d[value]/len(l) + elif key in self._ordered_choice_hp: + l, d = self._ordered_choice_hp[key] + config_norm[key] = d[value]/len(l) + elif key in self.incumbent: + config_norm[key] = self.incumbent[ + key] if value == self.best_config[ + key] else (self.incumbent[ + key]+1)%self._unordered_cat_hp[key] + else: config_norm[key] = 0 + continue + # Uniform/LogUniform/Normal/Base + sampler = domain.get_sampler() + if isinstance(sampler, sample.Quantized): + # sampler is sample.Quantized + sampler = sampler.get_sampler() + if str(sampler) == 'LogUniform': + config_norm[key] = np.log( + value/domain.lower)/np.log(domain.upper/domain.lower) + elif str(sampler) == 'Uniform': + config_norm[key] = ( + value-domain.lower)/(domain.upper-domain.lower) + elif str(sampler) == 'Normal': + # N(mean, sd) -> N(0,1) + config_norm[key] = (value - sampler.mean) / sampler.sd + else: + # TODO? elif str(sampler) == 'Base': # sample.Function._CallSampler + # e.g., {test: sample_from(lambda spec: randn(10, 2).sample() * 0.01)} + config_norm[key] = value + # print(key+"'s value is not normalized") + else: # prune_attr + config_norm[key] = value + return config_norm + + def denormalize(self, config): + ''' denormalize each dimension in config from [0,1] + ''' + config_denorm = {} + for key, value in config.items(): + if key in self.space: + # domain: sample.Categorical/Integer/Float/Function + domain = self.space[key] + if not callable(getattr(domain, 'get_sampler', None)): + config_denorm[key] = value + else: + if isinstance(domain, sample.Categorical): + # denormalize categorical + if key in self._ordered_cat_hp: + l, _ = self._ordered_cat_hp[key] + n = len(l) + config_denorm[key] = l[min(n-1,int(np.floor(value*n)))] + elif key in self._ordered_choice_hp: + l, _ = self._ordered_choice_hp[key] + n = len(l) + config_denorm[key] = l[min(n-1,int(np.floor(value*n)))] + else: + assert key in 
self.incumbent + if round(value) == self.incumbent[key]: + config_denorm[key] = self.best_config[key] + else: # ****random value each time!**** + config_denorm[key] = self._random.choice([x + for x in domain.categories + if x!=self.best_config[key]]) + continue + # Uniform/LogUniform/Normal/Base + sampler = domain.get_sampler() + if isinstance(sampler, sample.Quantized): + # sampler is sample.Quantized + sampler = sampler.get_sampler() + # Handle Log/Uniform + if str(sampler) == 'LogUniform': + config_denorm[key] = ( + domain.upper/domain.lower)**value*domain.lower + elif str(sampler) == 'Uniform': + config_denorm[key] = value * ( + domain.upper-domain.lower) + domain.lower + elif str(sampler) == 'Normal': + # denormalization for 'Normal' + config_denorm[key] = value * sampler.sd + sampler.mean + else: + config_denorm[key] = value + # Handle quantized + sampler = domain.get_sampler() + if isinstance(sampler, sample.Quantized): + config_denorm[key] = np.round( + np.divide(config_denorm[key], sampler.q)) * sampler.q + # Handle int (4.6 -> 5) + if isinstance(domain, sample.Integer): + config_denorm[key] = int(round(config_denorm[key])) + # Handle int (4.6 -> 4) + # config_denorm[key] = domain.cast(config_denorm[key]) + else: # prune_attr + config_denorm[key] = value + return config_denorm + + def set_search_properties(self, + metric: Optional[str] = None, + mode: Optional[str] = None, + config: Optional[Dict] = None) -> bool: + if metric: + self._metric = metric + if mode: + assert mode in ["min", "max"], "`mode` must be 'min' or 'max'." + if mode == "max": + self.metric_op = -1. + elif mode == "min": + self.metric_op = 1. 
+ if config: + self.space = config + self._init_search() + return True + + def on_trial_complete(self, trial_id: str, result: Optional[Dict] = None, + error: bool = False): + ''' compare with incumbent + ''' + # if better, move, reset num_complete and num_proposed + # if not better and num_complete >= 2*dim, num_allowed += 2 + self.trial_count += 1 + if not error and result: + obj = result.get(self._metric) + if obj: + obj *= self.metric_op + if obj < self.best_obj: + self.best_obj, self.best_config = obj, self._configs[ + trial_id] + self.incumbent = self.normalize(self.best_config) + self.cost_incumbent = result.get(self.cost_attr) + if self._resource: + self._resource = self.best_config[self.prune_attr] + self._num_complete4incumbent = 0 + self._cost_complete4incumbent = 0 + self._num_allowed4incumbent = 2 * self.dim + self._proposed_by.clear() + if self._K > 0: + self.step *= np.sqrt(self._K/self._oldK) + if self.step > self.step_ub: self.step = self.step_ub + self._iter_best_config = self.trial_count + return + proposed_by = self._proposed_by.get(trial_id) + if proposed_by == self.incumbent: + # proposed by current incumbent and no better + self._num_complete4incumbent += 1 + cost = result.get(self.cost_attr) + if cost: self._cost_complete4incumbent += cost + if self._num_complete4incumbent >= 2*self.dim and \ + self._num_allowed4incumbent == 0: + self._num_allowed4incumbent = 2 + if self._num_complete4incumbent == self.dir and (not self._resource + or self._resource == self.max_resource): + # check stuck condition if using max resource + if self.step >= self.step_lower_bound: + # decrease step size + self._oldK = self._K if self._K else self._iter_best_config + self._K = self.trial_count+1 + self.step *= np.sqrt(self._oldK/self._K) + # logger.info(f"step={self.step}, lb={self.step_lower_bound}") + self._num_complete4incumbent -= 2 + if self._num_allowed4incumbent < 2: + self._num_allowed4incumbent = 2 + # elif proposed_by: # proposed by older incumbent + # 
del self._proposed_by[trial_id] + + def on_trial_result(self, trial_id: str, result: Dict): + ''' early update of incumbent + ''' + if result: + obj = result.get(self._metric) + if obj: + obj *= self.metric_op + if obj < self.best_obj: + self.best_obj = obj + config = self._configs[trial_id] + if self.best_config != config: + self.best_config = config + if self._resource: + self._resource = config[self.prune_attr] + self.incumbent = self.normalize(self.best_config) + self.cost_incumbent = result.get(self.cost_attr) + self._cost_complete4incumbent = 0 + self._num_complete4incumbent = 0 + self._num_allowed4incumbent = 2 * self.dim + self._proposed_by.clear() + self._iter_best_config = self.trial_count + + def rand_vector_unit_sphere(self, dim) -> np.ndarray: + vec = self._random.normal(0, 1, dim) + mag = np.linalg.norm(vec) + return vec/mag + + def suggest(self, trial_id: str) -> Optional[Dict]: + ''' suggest a new config, one of the following cases: + 1. same incumbent, increase resource + 2. same resource, move from the incumbent to a random direction + 3. 
same resource, move from the incumbent to the opposite direction + ''' + if self._num_complete4incumbent > 0 and self.cost_incumbent and \ + self._resource and self._resource < self.max_resource and ( + self._cost_complete4incumbent >= + self.cost_incumbent * self.resource_multiple_factor): + # consider increasing resource using sum eval cost of complete + # configs + self._resource = self._round( + self._resource * self.resource_multiple_factor) + config = self.best_config.copy() + config[self.prune_attr] = self._resource + # self.incumbent[self.prune_attr] = self._resource + self._direction_tried = None + self._configs[trial_id] = config + return config + self._num_allowed4incumbent -= 1 + move = self.incumbent.copy() + if self._direction_tried is not None: + # return negative direction + for i, key in enumerate(self._tunable_keys): + move[key] -= self._direction_tried[i] + self._direction_tried = None + # propose a new direction + self._direction_tried = self.rand_vector_unit_sphere( + self.dim) * self.step + for i, key in enumerate(self._tunable_keys): + move[key] += self._direction_tried[i] + self._project(move) + config = self.denormalize(move) + self._proposed_by[trial_id] = self.incumbent + self._configs[trial_id] = config + return config + + def _project(self, config): + ''' project normalized config in the feasible region and set prune_attr + ''' + for key in self._bounded_keys: + value = config[key] + config[key] = max(0, min(1, value)) + if self._resource: config[self.prune_attr] = self._resource + + @property + def can_suggest(self) -> bool: + ''' can't suggest if 2*dim configs have been proposed for the incumbent + while fewer are completed + ''' + return self._num_allowed4incumbent > 0 + + def config_signature(self, config) -> tuple: + ''' return the signature tuple of a config + ''' + value_list = [] + for key in self._space_keys: + if key in config: + value = config[key] + if key == self.prune_attr: + value_list.append(value) + # else key must be 
in self.space + # get rid of list type or constant, + # e.g., "eval_metric": ["logloss", "error"] + elif callable(getattr(self.space[key], 'sample', None)): + if isinstance(self.space[key], sample.Integer): + value_list.append(int(round(value))) + else: + value_list.append(value) + else: + value_list.append(None) + return tuple(value_list) + + @property + def converged(self) -> bool: + ''' return whether the local search has converged + ''' + if self._num_complete4incumbent < self.dir-2: return False + # check stepsize after enough configs are completed + return self.step < self.step_lower_bound + + def reach(self, other: Searcher) -> bool: + ''' whether the incumbent can reach the incumbent of other + ''' + config1, config2 = self.best_config, other.best_config + incumbent1, incumbent2 = self.incumbent, other.incumbent + if self._resource and config1[self.prune_attr]>config2[self.prune_attr]: + # resource will not decrease + return False + for key in self._unordered_cat_hp: + # unordered cat choice is hard to reach by chance + if config1[key] != config2[key]: return False + delta = np.array([incumbent1[key]-incumbent2[key] + for key in self._tunable_keys]) + return np.linalg.norm(delta) <= self.step + diff --git a/flaml/searcher/search_thread.py b/flaml/searcher/search_thread.py new file mode 100644 index 000000000..ed280ff46 --- /dev/null +++ b/flaml/searcher/search_thread.py @@ -0,0 +1,132 @@ +'''! + * Copyright (c) 2020-2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the + * project root for license information. 
+''' +from typing import Dict, Optional +import numpy as np +try: + from ray.tune.suggest import Searcher +except ImportError: + from .suggestion import Searcher +from .flow2 import FLOW2 + +import logging +logger = logging.getLogger(__name__) + + +class SearchThread: + '''Class of global or local search thread + ''' + + cost_attr = 'time_total_s' + + def __init__(self, mode: str = "min", + search_alg: Optional[Searcher] = None): + ''' When search_alg is omitted, use local search FLOW2 + ''' + self._search_alg = search_alg + self._mode = mode + self._metric_op = 1 if mode=='min' else -1 + self.cost_best = self.cost_last = self.cost_total = self.cost_best1 = \ + getattr(search_alg, 'cost_incumbent', 0) + self.cost_best2 = 0 + self.obj_best1 = self.obj_best2 = getattr( + search_alg, 'best_obj', np.inf) # inherently minimize + # eci: expected cost for improvement + self.eci = self.cost_best + self.priority = self.speed = 0 + + def suggest(self, trial_id: str) -> Optional[Dict]: + ''' use the suggest() of the underlying search algorithm + ''' + if isinstance(self._search_alg, FLOW2): + config = self._search_alg.suggest(trial_id) + else: + try: + config = self._search_alg.suggest(trial_id) + except: + logger.warning( + f'The global search method raises error. 
' + 'Ignoring for this iteration.') + config = None + return config + + def update_priority(self, eci: Optional[float] = 0): + # optimistic projection + self.priority = eci * self.speed - self.obj_best1 + + def update_eci(self, metric_target: float, + max_speed: Optional[float] = np.inf): + # calculate eci: expected cost for improvement over metric_target; + best_obj = metric_target * self._metric_op + if not self.speed: self.speed = max_speed + self.eci = max(self.cost_total - self.cost_best1, + self.cost_best1 - self.cost_best2) + if self.obj_best1 > best_obj and self.speed > 0: + self.eci = max(self.eci, 2*(self.obj_best1-best_obj)/self.speed) + + def _update_speed(self): + # calculate speed; use 0 for invalid speed temporarily + if self.obj_best2 > self.obj_best1: + self.speed = (self.obj_best2 - self.obj_best1) / ( + self.cost_total - self.cost_best2) + else: self.speed = 0 + + def on_trial_complete(self, trial_id: str, result: Optional[Dict] = None, + error: bool = False): + ''' update the statistics of the thread + ''' + if not self._search_alg: return + if not hasattr(self._search_alg, '_ot_trials') or (not error and + trial_id in self._search_alg._ot_trials): + # optuna doesn't handle error + self._search_alg.on_trial_complete(trial_id, result, error) + if result: + if self.cost_attr in result: + self.cost_last = result[self.cost_attr] + self.cost_total += self.cost_last + # if not isinstance(self._search_alg, FLOW2): + # logger.info(f"result.metric{result[self._search_alg.metric]}") + if self._search_alg.metric in result: + obj = result[self._search_alg.metric] * self._metric_op + if obj < self.obj_best1: + self.cost_best2 = self.cost_best1 + self.cost_best1 = self.cost_total + self.obj_best2 = obj if np.isinf( + self.obj_best1) else self.obj_best1 + self.obj_best1 = obj + self.cost_best = self.cost_last + self._update_speed() + + def on_trial_result(self, trial_id: str, result: Dict): + ''' TODO update the statistics of the thread with partial result? 
+ ''' + # print('[SearchThread] on trial result') + if not self._search_alg: return + if not hasattr(self._search_alg, '_ot_trials') or ( + trial_id in self._search_alg._ot_trials): + self._search_alg.on_trial_result(trial_id, result) + if self.cost_attr in result and self.cost_last < result[self.cost_attr]: + self.cost_last = result[self.cost_attr] + # self._update_speed() + + @property + def converged(self) -> bool: + return self._search_alg.converged + + @property + def resource(self) -> float: + return self._search_alg.resource + + def reach(self, thread) -> bool: + ''' whether the incumbent can reach the incumbent of thread + ''' + return self._search_alg.reach(thread._search_alg) + + @property + def can_suggest(self) -> bool: + ''' whether the thread can suggest new configs + ''' + return self._search_alg.can_suggest + diff --git a/flaml/searcher/suggestion.py b/flaml/searcher/suggestion.py new file mode 100644 index 000000000..4fc73d9fc --- /dev/null +++ b/flaml/searcher/suggestion.py @@ -0,0 +1,661 @@ +''' +Copyright 2020 The Ray Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +This source file is adapted here because ray does not fully support Windows. +''' +import copy +import glob +import logging +import os +from typing import Dict, Optional, Union, List, Tuple + +logger = logging.getLogger(__name__) + +UNRESOLVED_SEARCH_SPACE = str( + "You passed a `{par}` parameter to {cls} that contained unresolved search " + "space definitions. 
{cls} should however be instantiated with fully " + "configured search spaces only. To use Ray Tune's automatic search space " + "conversion, pass the space definition as part of the `config` argument " + "to `tune.run()` instead.") + +UNDEFINED_SEARCH_SPACE = str( + "Trying to sample a configuration from {cls}, but no search " + "space has been defined. Either pass the `{space}` argument when " + "instantiating the search algorithm, or pass a `config` to " + "`tune.run()`.") + +UNDEFINED_METRIC_MODE = str( + "Trying to sample a configuration from {cls}, but the `metric` " + "({metric}) or `mode` ({mode}) parameters have not been set. " + "Either pass these arguments when instantiating the search algorithm, " + "or pass them to `tune.run()`.") + + +class Searcher: + """Abstract class for wrapping suggesting algorithms. + Custom algorithms can extend this class easily by overriding the + `suggest` method provide generated parameters for the trials. + Any subclass that implements ``__init__`` must also call the + constructor of this class: ``super(Subclass, self).__init__(...)``. + To track suggestions and their corresponding evaluations, the method + `suggest` will be passed a trial_id, which will be used in + subsequent notifications. + Not all implementations support multi objectives. + Args: + metric (str or list): The training result objective value attribute. If + list then list of training result objective value attributes + mode (str or list): If string One of {min, max}. If list then + list of max and min, determines whether objective is minimizing + or maximizing the metric attribute. Must match type of metric. + .. 
code-block:: python + class ExampleSearch(Searcher): + def __init__(self, metric="mean_loss", mode="min", **kwargs): + super(ExampleSearch, self).__init__( + metric=metric, mode=mode, **kwargs) + self.optimizer = Optimizer() + self.configurations = {} + def suggest(self, trial_id): + configuration = self.optimizer.query() + self.configurations[trial_id] = configuration + def on_trial_complete(self, trial_id, result, **kwargs): + configuration = self.configurations[trial_id] + if result and self.metric in result: + self.optimizer.update(configuration, result[self.metric]) + tune.run(trainable_function, search_alg=ExampleSearch()) + """ + FINISHED = "FINISHED" + CKPT_FILE_TMPL = "searcher-state-{}.pkl" + + def __init__(self, + metric: Optional[str] = None, + mode: Optional[str] = None, + max_concurrent: Optional[int] = None, + use_early_stopped_trials: Optional[bool] = None): + if use_early_stopped_trials is False: + raise DeprecationWarning( + "Early stopped trials are now always used. If this is a " + "problem, file an issue: https://github.com/ray-project/ray.") + if max_concurrent is not None: + logger.warning( + "DeprecationWarning: `max_concurrent` is deprecated for this " + "search algorithm. Use tune.suggest.ConcurrencyLimiter() " + "instead. This will raise an error in future versions of Ray.") + + self._metric = metric + self._mode = mode + + if not mode or not metric: + # Early return to avoid assertions + return + + assert isinstance( + metric, type(mode)), "metric and mode must be of the same type" + if isinstance(mode, str): + assert mode in ["min", "max" + ], "if `mode` is a str must be 'min' or 'max'!" + elif isinstance(mode, list): + assert len(mode) == len( + metric), "Metric and mode must be the same length" + assert all(mod in ["min", "max", "obs"] for mod in + mode), "All of mode must be 'min' or 'max' or 'obs'!" 
+ else: + raise ValueError("Mode most either be a list or string") + + def set_search_properties(self, metric: Optional[str], mode: Optional[str], + config: Dict) -> bool: + """Pass search properties to searcher. + This method acts as an alternative to instantiating search algorithms + with their own specific search spaces. Instead they can accept a + Tune config through this method. A searcher should return ``True`` + if setting the config was successful, or ``False`` if it was + unsuccessful, e.g. when the search space has already been set. + Args: + metric (str): Metric to optimize + mode (str): One of ["min", "max"]. Direction to optimize. + config (dict): Tune config dict. + """ + return False + + def on_trial_result(self, trial_id: str, result: Dict): + """Optional notification for result during training. + Note that by default, the result dict may include NaNs or + may not include the optimization metric. It is up to the + subclass implementation to preprocess the result to + avoid breaking the optimization process. + Args: + trial_id (str): A unique string ID for the trial. + result (dict): Dictionary of metrics for current training progress. + Note that the result dict may include NaNs or + may not include the optimization metric. It is up to the + subclass implementation to preprocess the result to + avoid breaking the optimization process. + """ + pass + + def on_trial_complete(self, + trial_id: str, + result: Optional[Dict] = None, + error: bool = False): + """Notification for the completion of trial. + Typically, this method is used for notifying the underlying + optimizer of the result. + Args: + trial_id (str): A unique string ID for the trial. + result (dict): Dictionary of metrics for current training progress. + Note that the result dict may include NaNs or + may not include the optimization metric. It is up to the + subclass implementation to preprocess the result to + avoid breaking the optimization process. Upon errors, this + may also be None. 
+ error (bool): True if the training process raised an error. + """ + raise NotImplementedError + + def suggest(self, trial_id: str) -> Optional[Dict]: + """Queries the algorithm to retrieve the next set of parameters. + Arguments: + trial_id (str): Trial ID used for subsequent notifications. + Returns: + dict | FINISHED | None: Configuration for a trial, if possible. + If FINISHED is returned, Tune will be notified that + no more suggestions/configurations will be provided. + If None is returned, Tune will skip the querying of the + searcher for this step. + """ + raise NotImplementedError + + def save(self, checkpoint_path: str): + """Save state to path for this search algorithm. + Args: + checkpoint_path (str): File where the search algorithm + state is saved. This path should be used later when + restoring from file. + Example: + .. code-block:: python + search_alg = Searcher(...) + analysis = tune.run( + cost, + num_samples=5, + search_alg=search_alg, + name=self.experiment_name, + local_dir=self.tmpdir) + search_alg.save("./my_favorite_path.pkl") + .. versionchanged:: 0.8.7 + Save is automatically called by `tune.run`. You can use + `restore_from_dir` to restore from an experiment directory + such as `~/ray_results/trainable`. + """ + raise NotImplementedError + + def restore(self, checkpoint_path: str): + """Restore state for this search algorithm + Args: + checkpoint_path (str): File where the search algorithm + state is saved. This path should be the same + as the one provided to "save". + Example: + .. code-block:: python + search_alg.save("./my_favorite_path.pkl") + search_alg2 = Searcher(...) 
+ search_alg2 = ConcurrencyLimiter(search_alg2, 1) + search_alg2.restore(checkpoint_path) + tune.run(cost, num_samples=5, search_alg=search_alg2) + """ + raise NotImplementedError + + def get_state(self) -> Dict: + raise NotImplementedError + + def set_state(self, state: Dict): + raise NotImplementedError + + def save_to_dir(self, checkpoint_dir: str, session_str: str = "default"): + """Automatically saves the given searcher to the checkpoint_dir. + This is automatically used by tune.run during a Tune job. + Args: + checkpoint_dir (str): Filepath to experiment dir. + session_str (str): Unique identifier of the current run + session. + """ + tmp_search_ckpt_path = os.path.join(checkpoint_dir, + ".tmp_searcher_ckpt") + success = True + try: + self.save(tmp_search_ckpt_path) + except NotImplementedError: + if log_once("suggest:save_to_dir"): + logger.warning( + "save not implemented for Searcher. Skipping save.") + success = False + + if success and os.path.exists(tmp_search_ckpt_path): + os.rename( + tmp_search_ckpt_path, + os.path.join(checkpoint_dir, + self.CKPT_FILE_TMPL.format(session_str))) + + def restore_from_dir(self, checkpoint_dir: str): + """Restores the state of a searcher from a given checkpoint_dir. + Typically, you should use this function to restore from an + experiment directory such as `~/ray_results/trainable`. + .. 
code-block:: python + experiment_1 = tune.run( + cost, + num_samples=5, + search_alg=search_alg, + verbose=0, + name=self.experiment_name, + local_dir="~/my_results") + search_alg2 = Searcher() + search_alg2.restore_from_dir( + os.path.join("~/my_results", self.experiment_name) + """ + + pattern = self.CKPT_FILE_TMPL.format("*") + full_paths = glob.glob(os.path.join(checkpoint_dir, pattern)) + if not full_paths: + raise RuntimeError( + "Searcher unable to find checkpoint in {}".format( + checkpoint_dir)) # TODO + most_recent_checkpoint = max(full_paths) + self.restore(most_recent_checkpoint) + + @property + def metric(self) -> str: + """The training result objective value attribute.""" + return self._metric + + @property + def mode(self) -> str: + """Specifies if minimizing or maximizing the metric.""" + return self._mode + + +class ConcurrencyLimiter(Searcher): + """A wrapper algorithm for limiting the number of concurrent trials. + Args: + searcher (Searcher): Searcher object that the + ConcurrencyLimiter will manage. + max_concurrent (int): Maximum concurrent samples from the underlying + searcher. + batch (bool): Whether to wait for all concurrent samples + to finish before updating the underlying searcher. + Example: + .. 
code-block:: python + from ray.tune.suggest import ConcurrencyLimiter + search_alg = HyperOptSearch(metric="accuracy") + search_alg = ConcurrencyLimiter(search_alg, max_concurrent=2) + tune.run(trainable, search_alg=search_alg) + """ + + def __init__(self, + searcher: Searcher, + max_concurrent: int, + batch: bool = False): + assert type(max_concurrent) is int and max_concurrent > 0 + self.searcher = searcher + self.max_concurrent = max_concurrent + self.batch = batch + self.live_trials = set() + self.cached_results = {} + super(ConcurrencyLimiter, self).__init__( + metric=self.searcher.metric, mode=self.searcher.mode) + + def suggest(self, trial_id: str) -> Optional[Dict]: + assert trial_id not in self.live_trials, ( + f"Trial ID {trial_id} must be unique: already found in set.") + if len(self.live_trials) >= self.max_concurrent: + logger.debug( + f"Not providing a suggestion for {trial_id} due to " + "concurrency limit: %s/%s.", len(self.live_trials), + self.max_concurrent) + return + + suggestion = self.searcher.suggest(trial_id) + if suggestion not in (None, Searcher.FINISHED): + self.live_trials.add(trial_id) + return suggestion + + def on_trial_complete(self, + trial_id: str, + result: Optional[Dict] = None, + error: bool = False): + if trial_id not in self.live_trials: + return + elif self.batch: + self.cached_results[trial_id] = (result, error) + if len(self.cached_results) == self.max_concurrent: + # Update the underlying searcher once the + # full batch is completed. 
+ for trial_id, (result, error) in self.cached_results.items(): + self.searcher.on_trial_complete( + trial_id, result=result, error=error) + self.live_trials.remove(trial_id) + self.cached_results = {} + else: + return + else: + self.searcher.on_trial_complete( + trial_id, result=result, error=error) + self.live_trials.remove(trial_id) + + def get_state(self) -> Dict: + state = self.__dict__.copy() + del state["searcher"] + return copy.deepcopy(state) + + def set_state(self, state: Dict): + self.__dict__.update(state) + + def save(self, checkpoint_path: str): + self.searcher.save(checkpoint_path) + + def restore(self, checkpoint_path: str): + self.searcher.restore(checkpoint_path) + + def on_pause(self, trial_id: str): + self.searcher.on_pause(trial_id) + + def on_unpause(self, trial_id: str): + self.searcher.on_unpause(trial_id) + + def set_search_properties(self, metric: Optional[str], mode: Optional[str], + config: Dict) -> bool: + return self.searcher.set_search_properties(metric, mode, config) + + +import pickle +from .variant_generator import parse_spec_vars +from ..tune.sample import Categorical, Domain, Float, Integer, LogUniform, \ + Quantized, Uniform +from ..tune.trial import flatten_dict, unflatten_dict + +try: + import optuna as ot + from optuna.samplers import BaseSampler +except ImportError: + ot = None + BaseSampler = None + + +class _Param: + def __getattr__(self, item): + def _inner(*args, **kwargs): + return (item, args, kwargs) + + return _inner + + +param = _Param() + + +# (Optional) Default (anonymous) metric when using tune.report(x) +DEFAULT_METRIC = "_metric" + +# (Auto-filled) The index of this training iteration. +TRAINING_ITERATION = "training_iteration" + + +class OptunaSearch(Searcher): + """A wrapper around Optuna to provide trial suggestions. + `Optuna `_ is a hyperparameter optimization library. + In contrast to other libraries, it employs define-by-run style + hyperparameter definitions. 
+ This Searcher is a thin wrapper around Optuna's search algorithms. + You can pass any Optuna sampler, which will be used to generate + hyperparameter suggestions. + Please note that this wrapper does not support define-by-run, so the + search space will be configured before running the optimization. You will + also need to use a Tune trainable (e.g. using the function API) with + this wrapper. + For defining the search space, use ``ray.tune.suggest.optuna.param`` + (see example). + Args: + space (list): Hyperparameter search space definition for Optuna's + sampler. This is a list, and samples for the parameters will + be obtained in order. + metric (str): The training result objective value attribute. If None + but a mode was passed, the anonymous metric `_metric` will be used + per default. + mode (str): One of {min, max}. Determines whether objective is + minimizing or maximizing the metric attribute. + points_to_evaluate (list): Initial parameter suggestions to be run + first. This is for when you already have some good parameters + you want to run first to help the algorithm make better suggestions + for future parameters. Needs to be a list of dicts containing the + configurations. + sampler (optuna.samplers.BaseSampler): Optuna sampler used to + draw hyperparameter configurations. Defaults to ``TPESampler``. + Tune automatically converts search spaces to Optuna's format: + .. code-block:: python + from ray.tune.suggest.optuna import OptunaSearch + config = { + "a": tune.uniform(6, 8) + "b": tune.uniform(10, 20) + } + optuna_search = OptunaSearch( + metric="loss", + mode="min") + tune.run(trainable, config=config, search_alg=optuna_search) + If you would like to pass the search space manually, the code would + look like this: + .. 
code-block:: python + from ray.tune.suggest.optuna import OptunaSearch, param + space = [ + param.suggest_uniform("a", 6, 8), + param.suggest_uniform("b", 10, 20) + ] + algo = OptunaSearch( + space, + metric="loss", + mode="min") + tune.run(trainable, search_alg=optuna_search) + .. versionadded:: 0.8.8 + """ + + def __init__(self, + space: Optional[Union[Dict, List[Tuple]]] = None, + metric: Optional[str] = None, + mode: Optional[str] = None, + points_to_evaluate: Optional[List[Dict]] = None, + sampler: Optional[BaseSampler] = None): + assert ot is not None, ( + "Optuna must be installed! Run `pip install optuna`.") + super(OptunaSearch, self).__init__( + metric=metric, + mode=mode, + max_concurrent=None, + use_early_stopped_trials=None) + + if isinstance(space, dict) and space: + resolved_vars, domain_vars, grid_vars = parse_spec_vars(space) + if domain_vars or grid_vars: + logger.warning( + UNRESOLVED_SEARCH_SPACE.format( + par="space", cls=type(self))) + space = self.convert_search_space(space) + + self._space = space + + self._points_to_evaluate = points_to_evaluate + + self._study_name = "optuna" # Fixed study name for in-memory storage + self._sampler = sampler or ot.samplers.TPESampler() + assert isinstance(self._sampler, BaseSampler), \ + "You can only pass an instance of `optuna.samplers.BaseSampler` " \ + "as a sampler to `OptunaSearcher`." 
+ + self._pruner = ot.pruners.NopPruner() + self._storage = ot.storages.InMemoryStorage() + + self._ot_trials = {} + self._ot_study = None + if self._space: + self._setup_study(mode) + + def _setup_study(self, mode: str): + if self._metric is None and self._mode: + # If only a mode was passed, use anonymous metric + self._metric = DEFAULT_METRIC + + self._ot_study = ot.study.create_study( + storage=self._storage, + sampler=self._sampler, + pruner=self._pruner, + study_name=self._study_name, + direction="minimize" if mode == "min" else "maximize", + load_if_exists=True) + + def set_search_properties(self, metric: Optional[str], mode: Optional[str], + config: Dict) -> bool: + if self._space: + return False + space = self.convert_search_space(config) + self._space = space + if metric: + self._metric = metric + if mode: + self._mode = mode + + self._setup_study(mode) + return True + + def suggest(self, trial_id: str) -> Optional[Dict]: + if not self._space: + raise RuntimeError( + UNDEFINED_SEARCH_SPACE.format( + cls=self.__class__.__name__, space="space")) + if not self._metric or not self._mode: + raise RuntimeError( + UNDEFINED_METRIC_MODE.format( + cls=self.__class__.__name__, + metric=self._metric, + mode=self._mode)) + + if trial_id not in self._ot_trials: + ot_trial_id = self._storage.create_new_trial( + self._ot_study._study_id) + self._ot_trials[trial_id] = ot.trial.Trial(self._ot_study, + ot_trial_id) + ot_trial = self._ot_trials[trial_id] + + if self._points_to_evaluate: + params = self._points_to_evaluate.pop(0) + else: + # getattr will fetch the trial.suggest_ function on Optuna trials + params = { + args[0] if len(args) > 0 else kwargs["name"]: getattr( + ot_trial, fn)(*args, **kwargs) + for (fn, args, kwargs) in self._space + } + return unflatten_dict(params) + + def on_trial_result(self, trial_id: str, result: Dict): + metric = result[self.metric] + step = result[TRAINING_ITERATION] + ot_trial = self._ot_trials[trial_id] + ot_trial.report(metric, step) 
+ + def on_trial_complete(self, + trial_id: str, + result: Optional[Dict] = None, + error: bool = False): + ot_trial = self._ot_trials[trial_id] + ot_trial_id = ot_trial._trial_id + self._storage.set_trial_value(ot_trial_id, result.get( + self.metric, None)) + self._storage.set_trial_state(ot_trial_id, + ot.trial.TrialState.COMPLETE) + + def save(self, checkpoint_path: str): + save_object = (self._storage, self._pruner, self._sampler, + self._ot_trials, self._ot_study, + self._points_to_evaluate) + with open(checkpoint_path, "wb") as outputFile: + pickle.dump(save_object, outputFile) + + def restore(self, checkpoint_path: str): + with open(checkpoint_path, "rb") as inputFile: + save_object = pickle.load(inputFile) + self._storage, self._pruner, self._sampler, \ + self._ot_trials, self._ot_study, \ + self._points_to_evaluate = save_object + + @staticmethod + def convert_search_space(spec: Dict) -> List[Tuple]: + resolved_vars, domain_vars, grid_vars = parse_spec_vars(spec) + + if not domain_vars and not grid_vars: + return [] + + if grid_vars: + raise ValueError( + "Grid search parameters cannot be automatically converted " + "to an Optuna search space.") + + # Flatten and resolve again after checking for grid search. + spec = flatten_dict(spec, prevent_delimiter=True) + resolved_vars, domain_vars, grid_vars = parse_spec_vars(spec) + + def resolve_value(par: str, domain: Domain) -> Tuple: + quantize = None + + sampler = domain.get_sampler() + if isinstance(sampler, Quantized): + quantize = sampler.q + sampler = sampler.sampler + + if isinstance(domain, Float): + if isinstance(sampler, LogUniform): + if quantize: + logger.warning( + "Optuna does not support both quantization and " + "sampling from LogUniform. 
Dropped quantization.") + return param.suggest_loguniform(par, domain.lower, + domain.upper) + elif isinstance(sampler, Uniform): + if quantize: + return param.suggest_discrete_uniform( + par, domain.lower, domain.upper, quantize) + return param.suggest_uniform(par, domain.lower, + domain.upper) + elif isinstance(domain, Integer): + if isinstance(sampler, LogUniform): + if quantize: + logger.warning( + "Optuna does not support both quantization and " + "sampling from LogUniform. Dropped quantization.") + return param.suggest_int( + par, domain.lower, domain.upper, log=True) + elif isinstance(sampler, Uniform): + return param.suggest_int( + par, domain.lower, domain.upper, step=quantize or 1) + elif isinstance(domain, Categorical): + if isinstance(sampler, Uniform): + return param.suggest_categorical(par, domain.categories) + + raise ValueError( + "Optuna search does not support parameters of type " + "`{}` with samplers of type `{}`".format( + type(domain).__name__, + type(domain.sampler).__name__)) + + # Parameter name is e.g. "a/b/c" for nested dicts + values = [ + resolve_value("/".join(path), domain) + for path, domain in domain_vars + ] + + return values \ No newline at end of file diff --git a/flaml/searcher/variant_generator.py b/flaml/searcher/variant_generator.py new file mode 100644 index 000000000..d1a80e0b2 --- /dev/null +++ b/flaml/searcher/variant_generator.py @@ -0,0 +1,396 @@ +''' +Copyright 2020 The Ray Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ +This source file is adapted here because ray does not fully support Windows. +''' +import copy +import logging +from collections.abc import Mapping +from typing import Any, Dict, Generator, List, Optional, Tuple + +import numpy +import random + +from ..tune.sample import Categorical, Domain, Function + +logger = logging.getLogger(__name__) + + +class TuneError(Exception): + """General error class raised by ray.tune.""" + pass + + +def generate_variants( + unresolved_spec: Dict) -> Generator[Tuple[Dict, Dict], None, None]: + """Generates variants from a spec (dict) with unresolved values. + There are two types of unresolved values: + Grid search: These define a grid search over values. For example, the + following grid search values in a spec will produce six distinct + variants in combination: + "activation": grid_search(["relu", "tanh"]) + "learning_rate": grid_search([1e-3, 1e-4, 1e-5]) + Lambda functions: These are evaluated to produce a concrete value, and + can express dependencies or conditional distributions between values. + They can also be used to express random search (e.g., by calling + into the `random` or `np` module). + "cpu": lambda spec: spec.config.num_workers + "batch_size": lambda spec: random.uniform(1, 1000) + Finally, to support defining specs in plain JSON / YAML, grid search + and lambda functions can also be defined alternatively as follows: + "activation": {"grid_search": ["relu", "tanh"]} + "cpu": {"eval": "spec.config.num_workers"} + Use `format_vars` to format the returned dict of hyperparameters. + Yields: + (Dict of resolved variables, Spec object) + """ + for resolved_vars, spec in _generate_variants(unresolved_spec): + assert not _unresolved_values(spec) + yield resolved_vars, spec + + +def grid_search(values: List) -> Dict[str, List]: + """Convenience method for specifying grid search over a value. + Arguments: + values: An iterable whose parameters will be gridded. 
+ """ + + return {"grid_search": values} + + +_STANDARD_IMPORTS = { + "random": random, + "np": numpy, +} + +_MAX_RESOLUTION_PASSES = 20 + + +def resolve_nested_dict(nested_dict: Dict) -> Dict[Tuple, Any]: + """Flattens a nested dict by joining keys into tuple of paths. + Can then be passed into `format_vars`. + """ + res = {} + for k, v in nested_dict.items(): + if isinstance(v, dict): + for k_, v_ in resolve_nested_dict(v).items(): + res[(k, ) + k_] = v_ + else: + res[(k, )] = v + return res + + +def format_vars(resolved_vars: Dict) -> str: + """Formats the resolved variable dict into a single string.""" + out = [] + for path, value in sorted(resolved_vars.items()): + if path[0] in ["run", "env", "resources_per_trial"]: + continue # TrialRunner already has these in the experiment_tag + pieces = [] + last_string = True + for k in path[::-1]: + if isinstance(k, int): + pieces.append(str(k)) + elif last_string: + last_string = False + pieces.append(k) + pieces.reverse() + out.append(_clean_value("_".join(pieces)) + "=" + _clean_value(value)) + return ",".join(out) + + +def flatten_resolved_vars(resolved_vars: Dict) -> Dict: + """Formats the resolved variable dict into a mapping of (str -> value).""" + flattened_resolved_vars_dict = {} + for pieces, value in resolved_vars.items(): + if pieces[0] == "config": + pieces = pieces[1:] + pieces = [str(piece) for piece in pieces] + flattened_resolved_vars_dict["/".join(pieces)] = value + return flattened_resolved_vars_dict + + +def _clean_value(value: Any) -> str: + if isinstance(value, float): + return "{:.5}".format(value) + else: + return str(value).replace("/", "_") + + +def parse_spec_vars(spec: Dict) -> Tuple[List[Tuple[Tuple, Any]], List[Tuple[ + Tuple, Any]], List[Tuple[Tuple, Any]]]: + resolved, unresolved = _split_resolved_unresolved_values(spec) + resolved_vars = list(resolved.items()) + + if not unresolved: + return resolved_vars, [], [] + + grid_vars = [] + domain_vars = [] + for path, value in 
unresolved.items(): + if value.is_grid(): + grid_vars.append((path, value)) + else: + domain_vars.append((path, value)) + grid_vars.sort() + + return resolved_vars, domain_vars, grid_vars + + +def count_variants(spec: Dict, presets: Optional[List[Dict]] = None) -> int: + # Helper function: Deep update dictionary + def deep_update(d, u): + for k, v in u.items(): + if isinstance(v, Mapping): + d[k] = deep_update(d.get(k, {}), v) + else: + d[k] = v + return d + + # Count samples for a specific spec + def spec_samples(spec, num_samples=1): + _, domain_vars, grid_vars = parse_spec_vars(spec) + grid_count = 1 + for path, domain in grid_vars: + grid_count *= len(domain.categories) + return num_samples * grid_count + + total_samples = 0 + total_num_samples = spec.get("num_samples", 1) + # For each preset, overwrite the spec and count the samples generated + # for this preset + for preset in presets: + preset_spec = copy.deepcopy(spec) + deep_update(preset_spec["config"], preset) + total_samples += spec_samples(preset_spec, 1) + total_num_samples -= 1 + + # Add the remaining samples + if total_num_samples > 0: + total_samples += spec_samples(spec, total_num_samples) + return total_samples + + +def _generate_variants(spec: Dict) -> Tuple[Dict, Dict]: + spec = copy.deepcopy(spec) + _, domain_vars, grid_vars = parse_spec_vars(spec) + + if not domain_vars and not grid_vars: + yield {}, spec + return + + grid_search = _grid_search_generator(spec, grid_vars) + for resolved_spec in grid_search: + resolved_vars = _resolve_domain_vars(resolved_spec, domain_vars) + for resolved, spec in _generate_variants(resolved_spec): + for path, value in grid_vars: + resolved_vars[path] = _get_value(spec, path) + for k, v in resolved.items(): + if (k in resolved_vars and v != resolved_vars[k] + and _is_resolved(resolved_vars[k])): + raise ValueError( + "The variable `{}` could not be unambiguously " + "resolved to a single value. 
Consider simplifying " + "your configuration.".format(k)) + resolved_vars[k] = v + yield resolved_vars, spec + + +def get_preset_variants(spec: Dict, config: Dict): + """Get variants according to a spec, initialized with a config. + Variables from the spec are overwritten by the variables in the config. + Thus, we may end up with less sampled parameters. + This function also checks if values used to overwrite search space + parameters are valid, and logs a warning if not. + """ + spec = copy.deepcopy(spec) + + resolved, _, _ = parse_spec_vars(config) + + for path, val in resolved: + try: + domain = _get_value(spec["config"], path) + if isinstance(domain, dict): + if "grid_search" in domain: + domain = Categorical(domain["grid_search"]) + else: + # If users want to overwrite an entire subdict, + # let them do it. + domain = None + except IndexError as exc: + raise ValueError( + f"Pre-set config key `{'/'.join(path)}` does not correspond " + f"to a valid key in the search space definition. Please add " + f"this path to the `config` variable passed to `tune.run()`." 
+ ) from exc + + if domain and not domain.is_valid(val): + logger.warning( + f"Pre-set value `{val}` is not within valid values of " + f"parameter `{'/'.join(path)}`: {domain.domain_str}") + assign_value(spec["config"], path, val) + + return _generate_variants(spec) + + +def assign_value(spec: Dict, path: Tuple, value: Any): + for k in path[:-1]: + spec = spec[k] + spec[path[-1]] = value + + +def _get_value(spec: Dict, path: Tuple) -> Any: + for k in path: + spec = spec[k] + return spec + + +def _resolve_domain_vars(spec: Dict, + domain_vars: List[Tuple[Tuple, Domain]]) -> Dict: + resolved = {} + error = True + num_passes = 0 + while error and num_passes < _MAX_RESOLUTION_PASSES: + num_passes += 1 + error = False + for path, domain in domain_vars: + if path in resolved: + continue + try: + value = domain.sample(_UnresolvedAccessGuard(spec)) + except RecursiveDependencyError as e: + error = e + except Exception: + raise ValueError( + "Failed to evaluate expression: {}: {}".format( + path, domain)) + else: + assign_value(spec, path, value) + resolved[path] = value + if error: + raise error + return resolved + + +def _grid_search_generator(unresolved_spec: Dict, + grid_vars: List) -> Generator[Dict, None, None]: + value_indices = [0] * len(grid_vars) + + def increment(i): + value_indices[i] += 1 + if value_indices[i] >= len(grid_vars[i][1]): + value_indices[i] = 0 + if i + 1 < len(value_indices): + return increment(i + 1) + else: + return True + return False + + if not grid_vars: + yield unresolved_spec + return + + while value_indices[-1] < len(grid_vars[-1][1]): + spec = copy.deepcopy(unresolved_spec) + for i, (path, values) in enumerate(grid_vars): + assign_value(spec, path, values[value_indices[i]]) + yield spec + if grid_vars: + done = increment(0) + if done: + break + + +def _is_resolved(v) -> bool: + resolved, _ = _try_resolve(v) + return resolved + + +def _try_resolve(v) -> Tuple[bool, Any]: + if isinstance(v, Domain): + # Domain to sample from + return False, 
v + elif isinstance(v, dict) and len(v) == 1 and "eval" in v: + # Lambda function in eval syntax + return False, Function( + lambda spec: eval(v["eval"], _STANDARD_IMPORTS, {"spec": spec})) + elif isinstance(v, dict) and len(v) == 1 and "grid_search" in v: + # Grid search values + grid_values = v["grid_search"] + if not isinstance(grid_values, list): + raise TuneError( + "Grid search expected list of values, got: {}".format( + grid_values)) + return False, Categorical(grid_values).grid() + return True, v + + +def _split_resolved_unresolved_values( + spec: Dict) -> Tuple[Dict[Tuple, Any], Dict[Tuple, Any]]: + resolved_vars = {} + unresolved_vars = {} + for k, v in spec.items(): + resolved, v = _try_resolve(v) + if not resolved: + unresolved_vars[(k, )] = v + elif isinstance(v, dict): + # Recurse into a dict + _resolved_children, _unresolved_children = \ + _split_resolved_unresolved_values(v) + for (path, value) in _resolved_children.items(): + resolved_vars[(k, ) + path] = value + for (path, value) in _unresolved_children.items(): + unresolved_vars[(k, ) + path] = value + elif isinstance(v, list): + # Recurse into a list + for i, elem in enumerate(v): + _resolved_children, _unresolved_children = \ + _split_resolved_unresolved_values({i: elem}) + for (path, value) in _resolved_children.items(): + resolved_vars[(k, ) + path] = value + for (path, value) in _unresolved_children.items(): + unresolved_vars[(k, ) + path] = value + else: + resolved_vars[(k, )] = v + return resolved_vars, unresolved_vars + + +def _unresolved_values(spec: Dict) -> Dict[Tuple, Any]: + return _split_resolved_unresolved_values(spec)[1] + + +def has_unresolved_values(spec: Dict) -> bool: + return True if _unresolved_values(spec) else False + + +class _UnresolvedAccessGuard(dict): + def __init__(self, *args, **kwds): + super(_UnresolvedAccessGuard, self).__init__(*args, **kwds) + self.__dict__ = self + + def __getattribute__(self, item): + value = dict.__getattribute__(self, item) + if not 
_is_resolved(value): + raise RecursiveDependencyError( + "`{}` recursively depends on {}".format(item, value)) + elif isinstance(value, dict): + return _UnresolvedAccessGuard(value) + else: + return value + + +class RecursiveDependencyError(Exception): + def __init__(self, msg: str): + Exception.__init__(self, msg) \ No newline at end of file diff --git a/flaml/space.py b/flaml/space.py deleted file mode 100644 index 8bc2cb1ad..000000000 --- a/flaml/space.py +++ /dev/null @@ -1,249 +0,0 @@ -'''! - * Copyright (c) 2020 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. -''' - - -class ConfigSearchInfo: - '''The class of the search space of a hyperparameters: - - Attributes: - name: A string of the name of the hyperparameter - type: data type of the hyperparameter - lower: A number of the lower bound of the value - upper: A number of the upper bound of the value - init: A number of the initial value. For hyperparameters related to - complexity, the init value needs to correspond to the lowest - complexity - change_tpe: A string of the change type, 'linear' or 'log' - min_change: A number of the minimal change required. 
Could be inf if - no such requirement - ''' - - def __init__(self, name, type, lower, upper, init, change_type = 'log', - complexity_related = True, min_change = None): - self.name = name - self.type = type - self.lower = lower - self.upper = upper - self.init = init - self.change_type = change_type - self.complexity_related = complexity_related - # default setting of min_change: if type is int, min_change - # should be 1, otherwise +inf - if min_change is None: - if self.type == int: - self.min_change = 1.0 #minimum change required, - else: - self.min_change = float('+inf') - else: - self.min_change = min_change - - -def config_space(estimator, data_size, objective_name = "regression"): - CS = {} - n_estimators_upper = min(32768,int(data_size)) - max_leaves_upper = min(32768,int(data_size)) - # exp_max_depth_upper = min(32768,data_size) - if 'xgboost' in estimator: - CS['n_estimators'] = ConfigSearchInfo(name = 'n_estimators', - type = int, lower = 4, init = 4, upper = n_estimators_upper, - change_type = 'log') - CS['max_leaves'] = ConfigSearchInfo(name = 'max_leaves', type =int, - lower = 4, init = 4, upper = max_leaves_upper, change_type = 'log') - CS['min_child_weight'] = ConfigSearchInfo(name = 'min_child_weight', - type = float, lower = 0.001, init = 20.0, upper = 20.0, - change_type = 'log') - - CS['learning_rate'] = ConfigSearchInfo(name = 'learning_rate', - type = float, lower = 0.01, init = 0.1, upper = 1.0, - change_type = 'log') - CS['subsample'] = ConfigSearchInfo(name = 'subsample', type = float, - lower = 0.6, init = 1.0, upper = 1.0, change_type = 'linear') - CS['reg_alpha'] = ConfigSearchInfo(name = 'reg_alpha', type = float, - lower = 1e-10, init = 1e-10, upper = 1.0, change_type = 'log', - complexity_related = True) - CS['reg_lambda'] = ConfigSearchInfo(name = 'reg_lambda', type = float, - lower = 1e-10, init = 1.0, upper = 1.0, change_type = 'log') - CS['colsample_bylevel'] = ConfigSearchInfo(name = 'colsample_bylevel', - type = float, lower = 
0.6, init = 1.0, upper = 1.0, - change_type = 'linear') - CS['colsample_bytree'] = ConfigSearchInfo(name = 'colsample_bytree', - type = float, lower = 0.7, init = 1.0, upper = 1.0, - change_type = 'linear') - elif estimator in ('rf', 'extra_tree'): - n_estimators_upper = min(2048, n_estimators_upper) - # max_leaves_upper = min(2048, max_leaves_upper) - CS['n_estimators'] = ConfigSearchInfo(name = 'n_estimators', - type = int, lower = 4, init = 4, upper = n_estimators_upper, - change_type = 'log') - if objective_name != 'regression': - CS['criterion'] = ConfigSearchInfo(name = 'criterion', - type = int, lower = 1, init = 1, upper = 2, - change_type = 'log') - - # CS['max_leaves'] = ConfigSearchInfo(name = 'max_leaves', type =int, - # lower = 4, init = 4, upper = max_leaves_upper, change_type = 'log', - # complexity_related = True) - - CS['max_features'] = ConfigSearchInfo(name = 'max_features', type = float, - lower = 0.1, init = 1.0, upper = 1.0, change_type = 'log') - # CS['min_samples_split'] = ConfigSearchInfo(name = 'min_samples_split', - # type = int, lower = 2, init = 2, upper = 20, change_type = 'log', - # complexity_related = True) - # CS['min_samples_leaf'] = ConfigSearchInfo(name = 'min_samples_leaf', - # type = int, lower = 1, init = 1, upper = 20, change_type = 'log', - # complexity_related = True) - elif 'lgbm' in estimator: - CS['n_estimators'] = ConfigSearchInfo(name = 'n_estimators', type = int, - lower = 4, init = 4, upper = n_estimators_upper, change_type = 'log') - CS['max_leaves'] = ConfigSearchInfo(name = 'max_leaves', type = int, - lower = 4, init = 4, upper = max_leaves_upper, change_type = 'log') - CS['min_child_weight'] = ConfigSearchInfo(name = 'min_child_weight', - type = float, lower = 0.001, init = 20, upper = 20.0, - change_type = 'log') - - CS['learning_rate'] = ConfigSearchInfo(name = 'learning_rate', - type = float, lower = 0.01, init = 0.1, upper = 1.0, - change_type = 'log') - CS['subsample'] = ConfigSearchInfo(name = 'subsample', 
type = float, - lower = 0.6, init = 1.0, upper = 1.0, change_type = 'log', - complexity_related = True) - CS['log_max_bin'] = ConfigSearchInfo(name = 'log_max_bin', type = int, - lower = 3, init = 8, upper = 10, change_type = 'log', - complexity_related = True) - CS['reg_alpha'] = ConfigSearchInfo(name = 'reg_alpha', type = float, - lower = 1e-10, init = 1e-10, upper = 1.0, change_type = 'log', - complexity_related = True) - CS['reg_lambda'] = ConfigSearchInfo(name = 'reg_lambda', type = float, - lower = 1e-10, init = 1.0, upper = 1.0, change_type = 'log') - CS['colsample_bytree'] = ConfigSearchInfo(name = 'colsample_bytree', - type = float, lower = 0.7, init = 1.0, upper = 1.0, - change_type = 'log') - elif 'lr' in estimator: - CS['C'] = ConfigSearchInfo(name = 'C', type =float, lower = 0.03125, - init = 1.0, upper = 32768.0, change_type = 'log', - complexity_related = True) - elif 'catboost' in estimator: - # CS['n_estimators'] = ConfigSearchInfo(name = 'n_estimators', type = int, - # lower = 4, init = 64, upper = n_estimators_upper, change_type = 'log', - # complexity_related = True) - early_stopping_rounds = max(min(round(1500000/data_size),150), 10) - CS['rounds'] = ConfigSearchInfo(name = 'rounds', type = int, - lower = 10, init = 10, - upper = early_stopping_rounds, change_type = 'log') - # CS['exp_max_depth'] = ConfigSearchInfo(name = 'exp_max_depth', type = int, - # lower = 32, init = 64, upper = 256, change_type = 'log', - # complexity_related = True) - - CS['learning_rate'] = ConfigSearchInfo(name = 'learning_rate', - type = float, lower = 0.005, init = 0.1, upper = .2, - change_type = 'log') - # CS['l2_leaf_reg'] = ConfigSearchInfo(name = 'l2_leaf_reg', - # type = float, lower = 1, init = 3, upper = 5, - # change_type = 'log') - elif 'nn' == estimator: - CS['learning_rate'] = ConfigSearchInfo(name = 'learning_rate', - type = float, lower = 1e-4, init = 3e-4, upper = 3e-2, - change_type = 'log') - CS['weight_decay'] = ConfigSearchInfo(name = 
'weight_decay', - type = float, lower = 1e-12, init = 1e-6, upper = .1, - change_type = 'log') - CS['dropout_prob'] = ConfigSearchInfo(name = 'dropout_prob', - type = float, lower = 1.0, init = 1.1, upper = 1.5, - change_type = 'log') - elif 'kneighbor' in estimator: - n_neighbors_upper = min(512,int(data_size/2)) - CS['n_neighbors'] = ConfigSearchInfo(name = 'n_neighbors', type = int, - lower = 1, init = 5, upper = n_neighbors_upper, change_type = 'log') - else: - raise NotImplementedError - - return CS - - -def estimator_size(config, estimator): - if estimator in ['xgboost', 'lgbm', 'rf', 'extra_tree']: - try: - max_leaves = int(round(config['max_leaves'])) - n_estimators = int(round(config['n_estimators'])) - model_size = float((max_leaves*3 + (max_leaves-1)*4 + 1)* - n_estimators*8) - except: - model_size = 0 - return model_size - elif 'catboost' in estimator: - # if config is None: raise Exception("config is none") - n_estimators = int(round(config.get('n_estimators',8192))) - max_leaves = int(round(config.get('exp_max_depth',64))) - model_size = float((max_leaves*3 + (max_leaves-1)*4 + 1)* - n_estimators*8) - return model_size - else: - model_size = 1.0 - # raise NotImplementedError - return model_size - - -def generate_config_ini(estimator, estimator_configspace): - - - config_dic = {} - config_dic_more = {} - config_type_dic = {} - for _, config in estimator_configspace.items(): - name, init = config.name, config.init - type_, complexity_related = config.type, config.complexity_related - config_type_dic[name] = type_ - if complexity_related: - config_dic[name] = init - else: - config_dic_more[name] = init - return config_dic, config_dic_more, {**config_dic, **config_dic_more}, \ - config_type_dic - - -def generate_config_min(estimator,estimator_configspace, max_config_size): - - - config_dic = {} - config_dic_more = {} - for _, config in estimator_configspace.items(): - name, lower = config.name, config.lower - complexity_related = config.complexity_related 
- if complexity_related: - config_dic[name] = lower - else: - config_dic_more[name] = lower - - return config_dic, config_dic_more, {**config_dic, **config_dic_more} - - -def generate_config_max(estimator, estimator_configspace, max_config_size): - - - config_dic = {} - config_dic_more = {} - for _, config in estimator_configspace.items(): - name, upper = config.name, config.upper - complexity_related = config.complexity_related - if complexity_related: - if name in ('n_estimators', 'max_leaves'): - config_dic[name] = min(upper, max_config_size) - else: - config_dic[name] = upper - else: - config_dic_more[name] = upper - return config_dic, config_dic_more, {**config_dic, **config_dic_more} - - -def get_config_values(config_dic, config_type_dic): - value_list = [] - for k in config_dic.keys(): - org_v = config_dic[k] - if config_type_dic[k] == int: - v = int(round(org_v)) - value_list.append(v) - else: - value_list.append(org_v) - return value_list diff --git a/flaml/training_log.py b/flaml/training_log.py index 6de6831b4..f8bc1d19c 100644 --- a/flaml/training_log.py +++ b/flaml/training_log.py @@ -1,5 +1,5 @@ '''! - * Copyright (c) 2020 Microsoft Corporation. All rights reserved. + * Copyright (c) 2020-2021 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. ''' diff --git a/flaml/tune/README.md b/flaml/tune/README.md new file mode 100644 index 000000000..935dbec40 --- /dev/null +++ b/flaml/tune/README.md @@ -0,0 +1,181 @@ +# Economical Hyperparameter Optimization + +`flaml.tune` is a module for economical hyperparameter tuning. It frees users from manually tuning many hyperparameters for a software, such as machine learning training procedures. +The API is compatible with ray tune. 
+ +Example: + +```python +from flaml import tune +import time + +def evaluate_config(config): + '''evaluate a hyperparameter configuration''' + # we use a toy example with 2 hyperparameters + metric = (round(config['x'])-85000)**2 - config['x']/config['y'] + # usually the evaluation takes a non-negligible cost + # and the cost could be related to certain hyperparameters + # in this example, we assume it's proportional to x + time.sleep(config['x']/100000) + # use tune.report to report the metric to optimize + tune.report(metric=metric) + +analysis = tune.run( + evaluate_config, # the function to evaluate a config + config={ + 'x': tune.qloguniform(lower=1, upper=100000, q=1), + 'y': tune.randint(lower=1, upper=100000) + }, # the search space + init_config={'x':1}, # an initial (partial) config with low cost + metric='metric', # the name of the metric used for optimization + mode='min', # the optimization mode, 'min' or 'max' + num_samples=-1, # the maximal number of configs to try, -1 means infinite + time_budget_s=60, # the time budget in seconds + local_dir='logs/', # the local directory to store logs + # verbose=0, # verbosity + # use_ray=True, # uncomment when performing parallel tuning using ray + ) + +print(analysis.best_trial.last_result) # the best trial's result +print(analysis.best_config) # the best config +``` + +Or, using ray tune's API: +```python +from ray import tune as raytune +from flaml import tune, CFO, BlendSearch +import time + +def evaluate_config(config): + '''evaluate a hyperparameter configuration''' + # we use a toy example with 2 hyperparameters + metric = (round(config['x'])-85000)**2 - config['x']/config['y'] + # usually the evaluation takes a non-negligible cost + # and the cost could be related to certain hyperparameters + # in this example, we assume it's proportional to x + time.sleep(config['x']/100000) + # use tune.report to report the metric to optimize + tune.report(metric=metric) + +analysis = raytune.run( + evaluate_config, # the
function to evaluate a config + config={ + 'x': tune.qloguniform(lower=1, upper=100000, q=1), + 'y': tune.randint(lower=1, upper=100000) + }, # the search space + metric='metric', # the name of the metric used for optimization + mode='min', # the optimization mode, 'min' or 'max' + num_samples=-1, # the maximal number of configs to try, -1 means infinite + time_budget_s=60, # the time budget in seconds + local_dir='logs/', # the local directory to store logs + search_alg=CFO(points_to_evaluate=[{'x':1}]) # or BlendSearch + # other algo example: raytune.create_searcher('optuna'), + ) + +print(analysis.best_trial.last_result) # the best trial's result +print(analysis.best_config) # the best config +``` + +For more examples, please check out +[notebooks](https://github.com/microsoft/FLAML/tree/main/notebook/). + + +`flaml` offers two HPO methods: CFO and BlendSearch. +`flaml.tune` uses BlendSearch by default. + +## CFO: Frugal Optimization for Cost-related Hyperparameters + +

+ +
+

+ +CFO uses the randomized direct search method FLOW2 with adaptive stepsize and random restart. +It requires a low-cost initial point as input if such point exists. +The search begins with the low-cost initial point and gradually moves to +the high-cost region if needed. The local search method has a provable convergence +rate and bounded cost. + +About FLOW2: FLOW2 is a simple yet effective randomized direct search method. +It is an iterative optimization method that can optimize for black-box functions. +FLOW2 only requires pairwise comparisons between function values to perform iterative update. Compared to existing HPO methods, FLOW2 has the following appealing properties: +1. It is applicable to general black-box functions with a good convergence rate in terms of loss. +2. It provides theoretical guarantees on the total evaluation cost incurred. + +The GIFs attached below demonstrate an example search trajectory of FLOW2 shown in the loss and evaluation cost (i.e., the training time) space respectively. From the demonstration, we can see that (1) FLOW2 can quickly move toward the low-loss region, showing good convergence property and (2) FLOW2 tends to avoid exploring the high-cost region until necessary. + +

+ +
+

Figure 1. FLOW2 in tuning the # of leaves and the # of trees for XGBoost. The two background heatmaps show the loss and cost distribution of all configurations. The black dots are the points evaluated in FLOW2. Black dots connected by lines are points that yield better loss performance when evaluated.
+

+ + +Example: + +```python +from flaml import CFO +tune.run(... + search_alg = CFO(points_to_evaluate=[init_config]), +) +``` + +Recommended scenario: there exist cost-related hyperparameters and a low-cost +initial point is known before optimization. +If the search space is complex and CFO gets trapped into local optima, consider +using BlendSearch. + +## BlendSearch: Economical Hyperparameter Optimization With Blended Search Strategy + +

+ +
+

+ +BlendSearch combines local search with global search. It leverages the frugality +of CFO and the space exploration ability of global search methods such as +Bayesian optimization. Like CFO, BlendSearch requires a low-cost initial point +as input if such point exists, and starts the search from there. Different from +CFO, BlendSearch will not wait for the local search to fully converge before +trying new start points. The new start points are suggested by the global search +method and filtered based on their distance to the existing points in the +cost-related dimensions. BlendSearch still gradually increases the trial cost. +It prioritizes among the global search thread and multiple local search threads +based on optimism in face of uncertainty. + +Example: + +```python +from flaml import BlendSearch +tune.run(... + search_alg = BlendSearch(points_to_evaluate=[init_config]), +) +``` + +Recommended scenario: cost-related hyperparameters exist, a low-cost +initial point is known, and the search space is complex such that local search +is prone to be stuck at local optima. + +For more technical details, please check our papers. + +* [Frugal Optimization for Cost-related Hyperparameters](https://arxiv.org/abs/2005.01571). Qingyun Wu, Chi Wang, Silu Huang. AAAI 2021. + +``` +@inproceedings{wu2021cfo, + title={Frugal Optimization for Cost-related Hyperparameters}, + author={Qingyun Wu and Chi Wang and Silu Huang}, + year={2021}, + booktitle={AAAI'21}, +} +``` + +* Economical Hyperparameter Optimization With Blended Search Strategy. Chi Wang, Qingyun Wu, Silu Huang, Amin Saied. To appear in ICLR 2021. 
+ +``` +@inproceedings{wang2021blendsearch, + title={Economical Hyperparameter Optimization With Blended Search Strategy}, + author={Chi Wang and Qingyun Wu and Silu Huang and Amin Saied}, + year={2021}, + booktitle={ICLR'21}, +} +``` \ No newline at end of file diff --git a/flaml/tune/__init__.py b/flaml/tune/__init__.py new file mode 100644 index 000000000..5aed4e2ef --- /dev/null +++ b/flaml/tune/__init__.py @@ -0,0 +1,7 @@ +try: + from ray.tune import (uniform, quniform, choice, randint, qrandint, randn, + qrandn, loguniform, qloguniform) +except: + from .sample import (uniform, quniform, choice, randint, qrandint, randn, + qrandn, loguniform, qloguniform) +from .tune import run, report \ No newline at end of file diff --git a/flaml/tune/analysis.py b/flaml/tune/analysis.py new file mode 100644 index 000000000..a9b996752 --- /dev/null +++ b/flaml/tune/analysis.py @@ -0,0 +1,180 @@ +''' +Copyright 2020 The Ray Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +This source file is adapted here because ray does not fully support Windows. +''' +from typing import Dict, Optional +import numpy as np +from .trial import Trial + +import logging +logger = logging.getLogger(__name__) + + +def is_nan_or_inf(value): + return np.isnan(value) or np.isinf(value) + + +class ExperimentAnalysis: + """Analyze results from a Tune experiment. 
+ """ + + @property + def best_trial(self) -> Trial: + """Get the best trial of the experiment + The best trial is determined by comparing the last trial results + using the `metric` and `mode` parameters passed to `tune.run()`. + If you didn't pass these parameters, use + `get_best_trial(metric, mode, scope)` instead. + """ + if not self.default_metric or not self.default_mode: + raise ValueError( + "To fetch the `best_trial`, pass a `metric` and `mode` " + "parameter to `tune.run()`. Alternatively, use the " + "`get_best_trial(metric, mode)` method to set the metric " + "and mode explicitly.") + return self.get_best_trial(self.default_metric, self.default_mode) + + @property + def best_config(self) -> Dict: + """Get the config of the best trial of the experiment + The best trial is determined by comparing the last trial results + using the `metric` and `mode` parameters passed to `tune.run()`. + If you didn't pass these parameters, use + `get_best_config(metric, mode, scope)` instead. + """ + if not self.default_metric or not self.default_mode: + raise ValueError( + "To fetch the `best_config`, pass a `metric` and `mode` " + "parameter to `tune.run()`. Alternatively, use the " + "`get_best_config(metric, mode)` method to set the metric " + "and mode explicitly.") + return self.get_best_config(self.default_metric, self.default_mode) + + def _validate_metric(self, metric: str) -> str: + if not metric and not self.default_metric: + raise ValueError( + "No `metric` has been passed and `default_metric` has " + "not been set. Please specify the `metric` parameter.") + return metric or self.default_metric + + def _validate_mode(self, mode: str) -> str: + if not mode and not self.default_mode: + raise ValueError( + "No `mode` has been passed and `default_mode` has " + "not been set. 
Please specify the `mode` parameter.") + if mode and mode not in ["min", "max"]: + raise ValueError("If set, `mode` has to be one of [min, max]") + return mode or self.default_mode + + def get_best_trial(self, + metric: Optional[str] = None, + mode: Optional[str] = None, + scope: str = "last", + filter_nan_and_inf: bool = True) -> Optional[Trial]: + """Retrieve the best trial object. + Compares all trials' scores on ``metric``. + If ``metric`` is not specified, ``self.default_metric`` will be used. + If `mode` is not specified, ``self.default_mode`` will be used. + These values are usually initialized by passing the ``metric`` and + ``mode`` parameters to ``tune.run()``. + Args: + metric (str): Key for trial info to order on. Defaults to + ``self.default_metric``. + mode (str): One of [min, max]. Defaults to ``self.default_mode``. + scope (str): One of [all, last, avg, last-5-avg, last-10-avg]. + If `scope=last`, only look at each trial's final step for + `metric`, and compare across trials based on `mode=[min,max]`. + If `scope=avg`, consider the simple average over all steps + for `metric` and compare across trials based on + `mode=[min,max]`. If `scope=last-5-avg` or `scope=last-10-avg`, + consider the simple average over the last 5 or 10 steps for + `metric` and compare across trials based on `mode=[min,max]`. + If `scope=all`, find each trial's min/max score for `metric` + based on `mode`, and compare trials based on `mode=[min,max]`. + filter_nan_and_inf (bool): If True (default), NaN or infinite + values are disregarded and these trials are never selected as + the best trial. + """ + metric = self._validate_metric(metric) + mode = self._validate_mode(mode) + + if scope not in ["all", "last", "avg", "last-5-avg", "last-10-avg"]: + raise ValueError( + "ExperimentAnalysis: attempting to get best trial for " + "metric {} for scope {} not in [\"all\", \"last\", \"avg\", " + "\"last-5-avg\", \"last-10-avg\"]. 
" + "If you didn't pass a `metric` parameter to `tune.run()`, " + "you have to pass one when fetching the best trial.".format( + metric, scope)) + best_trial = None + best_metric_score = None + for trial in self.trials: + if metric not in trial.metric_analysis: + continue + + if scope in ["last", "avg", "last-5-avg", "last-10-avg"]: + metric_score = trial.metric_analysis[metric][scope] + else: + metric_score = trial.metric_analysis[metric][mode] + + if filter_nan_and_inf and is_nan_or_inf(metric_score): + continue + + if best_metric_score is None: + best_metric_score = metric_score + best_trial = trial + continue + + if (mode == "max") and (best_metric_score < metric_score): + best_metric_score = metric_score + best_trial = trial + elif (mode == "min") and (best_metric_score > metric_score): + best_metric_score = metric_score + best_trial = trial + + if not best_trial: + logger.warning( + "Could not find best trial. Did you pass the correct `metric` " + "parameter?") + return best_trial + + def get_best_config(self, + metric: Optional[str] = None, + mode: Optional[str] = None, + scope: str = "last") -> Optional[Dict]: + """Retrieve the best config corresponding to the trial. + Compares all trials' scores on `metric`. + If ``metric`` is not specified, ``self.default_metric`` will be used. + If `mode` is not specified, ``self.default_mode`` will be used. + These values are usually initialized by passing the ``metric`` and + ``mode`` parameters to ``tune.run()``. + Args: + metric (str): Key for trial info to order on. Defaults to + ``self.default_metric``. + mode (str): One of [min, max]. Defaults to ``self.default_mode``. + scope (str): One of [all, last, avg, last-5-avg, last-10-avg]. + If `scope=last`, only look at each trial's final step for + `metric`, and compare across trials based on `mode=[min,max]`. + If `scope=avg`, consider the simple average over all steps + for `metric` and compare across trials based on + `mode=[min,max]`. 
If `scope=last-5-avg` or `scope=last-10-avg`, + consider the simple average over the last 5 or 10 steps for + `metric` and compare across trials based on `mode=[min,max]`. + If `scope=all`, find each trial's min/max score for `metric` + based on `mode`, and compare trials based on `mode=[min,max]`. + """ + best_trial = self.get_best_trial(metric, mode, scope) + return best_trial.config if best_trial else None diff --git a/flaml/tune/sample.py b/flaml/tune/sample.py new file mode 100644 index 000000000..13519928a --- /dev/null +++ b/flaml/tune/sample.py @@ -0,0 +1,535 @@ +''' +Copyright 2020 The Ray Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +This source file is included here because ray does not fully support Windows. +''' +import logging +import random +from copy import copy +from inspect import signature +from math import isclose +from typing import Any, Callable, Dict, List, Optional, Sequence, Union + +import numpy as np + +logger = logging.getLogger(__name__) + + +class Domain: + """Base class to specify a type and valid range to sample parameters from. + This base class is implemented by parameter spaces, like float ranges + (``Float``), integer ranges (``Integer``), or categorical variables + (``Categorical``). The ``Domain`` object contains information about + valid values (e.g. minimum and maximum values), and exposes methods that + allow specification of specific samplers (e.g. ``uniform()`` or + ``loguniform()``). 
+ """ + sampler = None + default_sampler_cls = None + + def cast(self, value): + """Cast value to domain type""" + return value + + def set_sampler(self, sampler, allow_override=False): + if self.sampler and not allow_override: + raise ValueError("You can only choose one sampler for parameter " + "domains. Existing sampler for parameter {}: " + "{}. Tried to add {}".format( + self.__class__.__name__, self.sampler, + sampler)) + self.sampler = sampler + + def get_sampler(self): + sampler = self.sampler + if not sampler: + sampler = self.default_sampler_cls() + return sampler + + def sample(self, spec=None, size=1): + sampler = self.get_sampler() + return sampler.sample(self, spec=spec, size=size) + + def is_grid(self): + return isinstance(self.sampler, Grid) + + def is_function(self): + return False + + def is_valid(self, value: Any): + """Returns True if `value` is a valid value in this domain.""" + raise NotImplementedError + + @property + def domain_str(self): + return "(unknown)" + + +class Sampler: + def sample(self, + domain: Domain, + spec: Optional[Union[List[Dict], Dict]] = None, + size: int = 1): + raise NotImplementedError + + +class BaseSampler(Sampler): + def __str__(self): + return "Base" + + +class Uniform(Sampler): + def __str__(self): + return "Uniform" + + +class LogUniform(Sampler): + def __init__(self, base: float = 10): + self.base = base + assert self.base > 0, "Base has to be strictly greater than 0" + + def __str__(self): + return "LogUniform" + + +class Normal(Sampler): + def __init__(self, mean: float = 0., sd: float = 0.): + self.mean = mean + self.sd = sd + + assert self.sd > 0, "SD has to be strictly greater than 0" + + def __str__(self): + return "Normal" + + +class Grid(Sampler): + """Dummy sampler used for grid search""" + + def sample(self, + domain: Domain, + spec: Optional[Union[List[Dict], Dict]] = None, + size: int = 1): + return RuntimeError("Do not call `sample()` on grid.") + + +class Float(Domain): + class _Uniform(Uniform): 
+ def sample(self, + domain: "Float", + spec: Optional[Union[List[Dict], Dict]] = None, + size: int = 1): + assert domain.lower > float("-inf"), \ + "Uniform needs a lower bound" + assert domain.upper < float("inf"), \ + "Uniform needs a upper bound" + items = np.random.uniform(domain.lower, domain.upper, size=size) + return items if len(items) > 1 else domain.cast(items[0]) + + class _LogUniform(LogUniform): + def sample(self, + domain: "Float", + spec: Optional[Union[List[Dict], Dict]] = None, + size: int = 1): + assert domain.lower > 0, \ + "LogUniform needs a lower bound greater than 0" + assert 0 < domain.upper < float("inf"), \ + "LogUniform needs a upper bound greater than 0" + logmin = np.log(domain.lower) / np.log(self.base) + logmax = np.log(domain.upper) / np.log(self.base) + + items = self.base**(np.random.uniform(logmin, logmax, size=size)) + return items if len(items) > 1 else domain.cast(items[0]) + + class _Normal(Normal): + def sample(self, + domain: "Float", + spec: Optional[Union[List[Dict], Dict]] = None, + size: int = 1): + assert not domain.lower or domain.lower == float("-inf"), \ + "Normal sampling does not allow a lower value bound." + assert not domain.upper or domain.upper == float("inf"), \ + "Normal sampling does not allow a upper value bound." + items = np.random.normal(self.mean, self.sd, size=size) + return items if len(items) > 1 else domain.cast(items[0]) + + default_sampler_cls = _Uniform + + def __init__(self, lower: Optional[float], upper: Optional[float]): + # Need to explicitly check for None + self.lower = lower if lower is not None else float("-inf") + self.upper = upper if upper is not None else float("inf") + + def cast(self, value): + return float(value) + + def uniform(self): + if not self.lower > float("-inf"): + raise ValueError( + "Uniform requires a lower bound. Make sure to set the " + "`lower` parameter of `Float()`.") + if not self.upper < float("inf"): + raise ValueError( + "Uniform requires a upper bound. 
Make sure to set the " + "`upper` parameter of `Float()`.") + new = copy(self) + new.set_sampler(self._Uniform()) + return new + + def loguniform(self, base: float = 10): + if not self.lower > 0: + raise ValueError( + "LogUniform requires a lower bound greater than 0." + f"Got: {self.lower}. Did you pass a variable that has " + "been log-transformed? If so, pass the non-transformed value " + "instead.") + if not 0 < self.upper < float("inf"): + raise ValueError( + "LogUniform requires a upper bound greater than 0. " + f"Got: {self.lower}. Did you pass a variable that has " + "been log-transformed? If so, pass the non-transformed value " + "instead.") + new = copy(self) + new.set_sampler(self._LogUniform(base)) + return new + + def normal(self, mean=0., sd=1.): + new = copy(self) + new.set_sampler(self._Normal(mean, sd)) + return new + + def quantized(self, q: float): + if self.lower > float("-inf") and not isclose(self.lower / q, + round(self.lower / q)): + raise ValueError( + f"Your lower variable bound {self.lower} is not divisible by " + f"quantization factor {q}.") + if self.upper < float("inf") and not isclose(self.upper / q, + round(self.upper / q)): + raise ValueError( + f"Your upper variable bound {self.upper} is not divisible by " + f"quantization factor {q}.") + + new = copy(self) + new.set_sampler(Quantized(new.get_sampler(), q), allow_override=True) + return new + + def is_valid(self, value: float): + return self.lower <= value <= self.upper + + @property + def domain_str(self): + return f"({self.lower}, {self.upper})" + + +class Integer(Domain): + class _Uniform(Uniform): + def sample(self, + domain: "Integer", + spec: Optional[Union[List[Dict], Dict]] = None, + size: int = 1): + items = np.random.randint(domain.lower, domain.upper, size=size) + return items if len(items) > 1 else domain.cast(items[0]) + + class _LogUniform(LogUniform): + def sample(self, + domain: "Integer", + spec: Optional[Union[List[Dict], Dict]] = None, + size: int = 1): + assert 
domain.lower > 0, \ + "LogUniform needs a lower bound greater than 0" + assert 0 < domain.upper < float("inf"), \ + "LogUniform needs a upper bound greater than 0" + logmin = np.log(domain.lower) / np.log(self.base) + logmax = np.log(domain.upper) / np.log(self.base) + + items = self.base**(np.random.uniform(logmin, logmax, size=size)) + items = np.round(items).astype(int) + return items if len(items) > 1 else domain.cast(items[0]) + + default_sampler_cls = _Uniform + + def __init__(self, lower, upper): + self.lower = lower + self.upper = upper + + def cast(self, value): + return int(value) + + def quantized(self, q: int): + new = copy(self) + new.set_sampler(Quantized(new.get_sampler(), q), allow_override=True) + return new + + def uniform(self): + new = copy(self) + new.set_sampler(self._Uniform()) + return new + + def loguniform(self, base: float = 10): + if not self.lower > 0: + raise ValueError( + "LogUniform requires a lower bound greater than 0." + f"Got: {self.lower}. Did you pass a variable that has " + "been log-transformed? If so, pass the non-transformed value " + "instead.") + if not 0 < self.upper < float("inf"): + raise ValueError( + "LogUniform requires a upper bound greater than 0. " + f"Got: {self.lower}. Did you pass a variable that has " + "been log-transformed? 
If so, pass the non-transformed value " + "instead.") + new = copy(self) + new.set_sampler(self._LogUniform(base)) + return new + + def is_valid(self, value: int): + return self.lower <= value <= self.upper + + @property + def domain_str(self): + return f"({self.lower}, {self.upper})" + + +class Categorical(Domain): + class _Uniform(Uniform): + def sample(self, + domain: "Categorical", + spec: Optional[Union[List[Dict], Dict]] = None, + size: int = 1): + + items = random.choices(domain.categories, k=size) + return items if len(items) > 1 else domain.cast(items[0]) + + default_sampler_cls = _Uniform + + def __init__(self, categories: Sequence): + self.categories = list(categories) + + def uniform(self): + new = copy(self) + new.set_sampler(self._Uniform()) + return new + + def grid(self): + new = copy(self) + new.set_sampler(Grid()) + return new + + def __len__(self): + return len(self.categories) + + def __getitem__(self, item): + return self.categories[item] + + def is_valid(self, value: Any): + return value in self.categories + + @property + def domain_str(self): + return f"{self.categories}" + + +class Function(Domain): + class _CallSampler(BaseSampler): + def sample(self, + domain: "Function", + spec: Optional[Union[List[Dict], Dict]] = None, + size: int = 1): + if domain.pass_spec: + items = [ + domain.func(spec[i] if isinstance(spec, list) else spec) + for i in range(size) + ] + else: + items = [domain.func() for i in range(size)] + + return items if len(items) > 1 else domain.cast(items[0]) + + default_sampler_cls = _CallSampler + + def __init__(self, func: Callable): + sig = signature(func) + + pass_spec = True # whether we should pass `spec` when calling `func` + try: + sig.bind({}) + except TypeError: + pass_spec = False + + if not pass_spec: + try: + sig.bind() + except TypeError as exc: + raise ValueError( + "The function passed to a `Function` parameter must be " + "callable with either 0 or 1 parameters.") from exc + + self.pass_spec = pass_spec + 
self.func = func + + def is_function(self): + return True + + def is_valid(self, value: Any): + return True # This is user-defined, so lets not assume anything + + @property + def domain_str(self): + return f"{self.func}()" + + +class Quantized(Sampler): + def __init__(self, sampler: Sampler, q: Union[float, int]): + self.sampler = sampler + self.q = q + + assert self.sampler, "Quantized() expects a sampler instance" + + def get_sampler(self): + return self.sampler + + def sample(self, + domain: Domain, + spec: Optional[Union[List[Dict], Dict]] = None, + size: int = 1): + values = self.sampler.sample(domain, spec, size) + quantized = np.round(np.divide(values, self.q)) * self.q + if not isinstance(quantized, np.ndarray): + return domain.cast(quantized) + return list(quantized) + + +# TODO (krfricke): Remove tune.function +def function(func): + logger.warning( + "DeprecationWarning: wrapping {} with tune.function() is no " + "longer needed".format(func)) + return func + + +def sample_from(func: Callable[[Dict], Any]): + """Specify that tune should sample configuration values from this function. + Arguments: + func: An callable function to draw a sample from. + """ + return Function(func) + + +def uniform(lower: float, upper: float): + """Sample a float value uniformly between ``lower`` and ``upper``. + Sampling from ``tune.uniform(1, 10)`` is equivalent to sampling from + ``np.random.uniform(1, 10))`` + """ + return Float(lower, upper).uniform() + + +def quniform(lower: float, upper: float, q: float): + """Sample a quantized float value uniformly between ``lower`` and ``upper``. + Sampling from ``tune.uniform(1, 10)`` is equivalent to sampling from + ``np.random.uniform(1, 10))`` + The value will be quantized, i.e. rounded to an integer increment of ``q``. + Quantization makes the upper bound inclusive. 
+ """ + return Float(lower, upper).uniform().quantized(q) + + +def loguniform(lower: float, upper: float, base: float = 10): + """Sugar for sampling in different orders of magnitude. + Args: + lower (float): Lower boundary of the output interval (e.g. 1e-4) + upper (float): Upper boundary of the output interval (e.g. 1e-2) + base (int): Base of the log. Defaults to 10. + """ + return Float(lower, upper).loguniform(base) + + +def qloguniform(lower: float, upper: float, q: float, base: float = 10): + """Sugar for sampling in different orders of magnitude. + The value will be quantized, i.e. rounded to an integer increment of ``q``. + Quantization makes the upper bound inclusive. + Args: + lower (float): Lower boundary of the output interval (e.g. 1e-4) + upper (float): Upper boundary of the output interval (e.g. 1e-2) + q (float): Quantization number. The result will be rounded to an + integer increment of this value. + base (int): Base of the log. Defaults to 10. + """ + return Float(lower, upper).loguniform(base).quantized(q) + + +def choice(categories: List): + """Sample a categorical value. + Sampling from ``tune.choice([1, 2])`` is equivalent to sampling from + ``random.choice([1, 2])`` + """ + return Categorical(categories).uniform() + + +def randint(lower: int, upper: int): + """Sample an integer value uniformly between ``lower`` and ``upper``. + ``lower`` is inclusive, ``upper`` is exclusive. + Sampling from ``tune.randint(10)`` is equivalent to sampling from + ``np.random.randint(10)`` + """ + return Integer(lower, upper).uniform() + + +def lograndint(lower: int, upper: int, base: float = 10): + """Sample an integer value log-uniformly between ``lower`` and ``upper``, + with ``base`` being the base of logarithm. + ``lower`` is inclusive, ``upper`` is exclusive. + """ + return Integer(lower, upper).loguniform(base) + + +def qrandint(lower: int, upper: int, q: int = 1): + """Sample an integer value uniformly between ``lower`` and ``upper``. 
+ ``lower`` is inclusive, ``upper`` is also inclusive (!). + The value will be quantized, i.e. rounded to an integer increment of ``q``. + Quantization makes the upper bound inclusive. + """ + return Integer(lower, upper).uniform().quantized(q) + + +def qlograndint(lower: int, upper: int, q: int, base: float = 10): + """Sample an integer value log-uniformly between ``lower`` and ``upper``, + with ``base`` being the base of logarithm. + ``lower`` is inclusive, ``upper`` is also inclusive (!). + The value will be quantized, i.e. rounded to an integer increment of ``q``. + Quantization makes the upper bound inclusive. + """ + return Integer(lower, upper).loguniform(base).quantized(q) + + +def randn(mean: float = 0., sd: float = 1.): + """Sample a float value normally with ``mean`` and ``sd``. + Args: + mean (float): Mean of the normal distribution. Defaults to 0. + sd (float): SD of the normal distribution. Defaults to 1. + """ + return Float(None, None).normal(mean, sd) + + +def qrandn(mean: float, sd: float, q: float): + """Sample a float value normally with ``mean`` and ``sd``. + The value will be quantized, i.e. rounded to an integer increment of ``q``. + Args: + mean (float): Mean of the normal distribution. + sd (float): SD of the normal distribution. + q (float): Quantization number. The result will be rounded to an + integer increment of this value. + """ + return Float(None, None).normal(mean, sd).quantized(q) \ No newline at end of file diff --git a/flaml/tune/trial.py b/flaml/tune/trial.py new file mode 100644 index 000000000..77ade0329 --- /dev/null +++ b/flaml/tune/trial.py @@ -0,0 +1,143 @@ +''' +Copyright 2020 The Ray Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +This source file is adapted here because ray does not fully support Windows. +''' +import uuid +import time +from numbers import Number +from collections import deque +import copy + + +def flatten_dict(dt, delimiter="/", prevent_delimiter=False): + dt = copy.deepcopy(dt) + if prevent_delimiter and any(delimiter in key for key in dt): + # Raise if delimiter is any of the keys + raise ValueError( + "Found delimiter `{}` in key when trying to flatten array." + "Please avoid using the delimiter in your specification.") + while any(isinstance(v, dict) for v in dt.values()): + remove = [] + add = {} + for key, value in dt.items(): + if isinstance(value, dict): + for subkey, v in value.items(): + if prevent_delimiter and delimiter in subkey: + # Raise if delimiter is in any of the subkeys + raise ValueError( + "Found delimiter `{}` in key when trying to " + "flatten array. Please avoid using the delimiter " + "in your specification.") + add[delimiter.join([key, str(subkey)])] = v + remove.append(key) + dt.update(add) + for k in remove: + del dt[k] + return dt + + +def unflatten_dict(dt, delimiter="/"): + """Unflatten dict. Does not support unflattening lists.""" + dict_type = type(dt) + out = dict_type() + for key, val in dt.items(): + path = key.split(delimiter) + item = out + for k in path[:-1]: + item = item.setdefault(k, dict_type()) + item[path[-1]] = val + return out + + +class Trial: + """A trial object holds the state for one model training run. 
+ Trials are themselves managed by the TrialRunner class, which implements + the event loop for submitting trial runs to a Ray cluster. + Trials start in the PENDING state, and transition to RUNNING once started. + On error it transitions to ERROR, otherwise TERMINATED on success. + Attributes: + trainable_name (str): Name of the trainable object to be executed. + config (dict): Provided configuration dictionary with evaluated params. + trial_id (str): Unique identifier for the trial. + local_dir (str): Local_dir as passed to tune.run. + logdir (str): Directory where the trial logs are saved. + evaluated_params (dict): Evaluated parameters by search algorithm, + experiment_tag (str): Identifying trial name to show in the console. + resources (Resources): Amount of resources that this trial will use. + status (str): One of PENDING, RUNNING, PAUSED, TERMINATED, ERROR/ + error_file (str): Path to the errors that this trial has raised. + """ + + PENDING = "PENDING" + RUNNING = "RUNNING" + PAUSED = "PAUSED" + TERMINATED = "TERMINATED" + ERROR = "ERROR" + + @classmethod + def generate_id(cls): + return str(uuid.uuid1().hex)[:8] + + def update_last_result(self, result): + if self.experiment_tag: + result.update(experiment_tag=self.experiment_tag) + + self.last_result = result + self.last_update_time = time.time() + + for metric, value in flatten_dict(result).items(): + if isinstance(value, Number): + if metric not in self.metric_analysis: + self.metric_analysis[metric] = { + "max": value, + "min": value, + "avg": value, + "last": value + } + self.metric_n_steps[metric] = {} + for n in self.n_steps: + key = "last-{:d}-avg".format(n) + self.metric_analysis[metric][key] = value + # Store n as string for correct restore. 
+ self.metric_n_steps[metric][str(n)] = deque( + [value], maxlen=n) + else: + step = result["training_iteration"] or 1 + self.metric_analysis[metric]["max"] = max( + value, self.metric_analysis[metric]["max"]) + self.metric_analysis[metric]["min"] = min( + value, self.metric_analysis[metric]["min"]) + self.metric_analysis[metric]["avg"] = 1 / step * ( + value + + (step - 1) * self.metric_analysis[metric]["avg"]) + self.metric_analysis[metric]["last"] = value + + for n in self.n_steps: + key = "last-{:d}-avg".format(n) + self.metric_n_steps[metric][str(n)].append(value) + self.metric_analysis[metric][key] = sum( + self.metric_n_steps[metric][str(n)]) / len( + self.metric_n_steps[metric][str(n)]) + + def set_status(self, status): + """Sets the status of the trial.""" + self.status = status + if status == Trial.RUNNING: + if self.start_time is None: + self.start_time = time.time() + + def is_finished(self): + return self.status in [Trial.ERROR, Trial.TERMINATED] diff --git a/flaml/tune/trial_runner.py b/flaml/tune/trial_runner.py new file mode 100644 index 000000000..0456c7863 --- /dev/null +++ b/flaml/tune/trial_runner.py @@ -0,0 +1,121 @@ +'''! + * Copyright (c) 2020-2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the + * project root for license information. 
+''' +from typing import Optional +try: + from ray.tune.trial import Trial +except: + from .trial import Trial +import logging +logger = logging.getLogger(__name__) + + +class Nologger(): + '''Logger without logging + ''' + + def on_result(self, result): pass + + +class SimpleTrial(Trial): + '''A simple trial class + ''' + + def __init__(self, config, trial_id = None): + self.trial_id = Trial.generate_id() if trial_id is None else trial_id + self.config = config or {} + self.status = Trial.PENDING + self.start_time = None + self.last_result = {} + self.last_update_time = -float("inf") + self.custom_trial_name = None + self.trainable_name = "trainable" + self.experiment_tag = "exp" + self.verbose = False + self.result_logger = Nologger() + self.metric_analysis = {} + self.n_steps = [5, 10] + self.metric_n_steps = {} + + +class BaseTrialRunner: + """Implementation of a simple trial runner + + Note that the caller usually should not mutate trial state directly. + """ + + def __init__(self, + search_alg = None, + scheduler = None, + metric: Optional[str] = None, + mode: Optional[str] = 'min'): + self._search_alg = search_alg + self._scheduler_alg = scheduler + self._trials = [] + self._metric = metric + self._mode = mode + + def get_trials(self): + """Returns the list of trials managed by this TrialRunner. + + Note that the caller usually should not mutate trial state directly. + """ + return self._trials + + def add_trial(self, trial): + """Adds a new trial to this TrialRunner. + + Trials may be added at any time. + + Args: + trial (Trial): Trial to queue. 
+ """ + self._trials.append(trial) + if self._scheduler_alg: + self._scheduler_alg.on_trial_add(self, trial) + + def process_trial_result(self, trial, result): + trial.update_last_result(result) + self._search_alg.on_trial_result(trial.trial_id, result) + if self._scheduler_alg: + decision = self._scheduler_alg.on_trial_result(self, trial, result) + if decision == "STOP": trial.set_status(Trial.TERMINATED) + elif decision == "PAUSE": trial.set_status(Trial.PAUSED) + + def stop_trial(self, trial): + """Stops trial. + """ + if not trial.status in [Trial.ERROR, Trial.TERMINATED]: + if self._scheduler_alg: + self._scheduler_alg.on_trial_complete(self, + trial.trial_id, trial.last_result) + self._search_alg.on_trial_complete( + trial.trial_id, trial.last_result) + trial.set_status(Trial.TERMINATED) + else: + if self._scheduler_alg: + self._scheduler_alg.on_trial_remove(self, trial) + + +class SequentialTrialRunner(BaseTrialRunner): + """Implementation of the sequential trial runner + """ + + def step(self) -> Trial: + """Runs one step of the trial event loop. + Callers should typically run this method repeatedly in a loop. They + may inspect or modify the runner's state in between calls to step(). + + returns a Trial to run + """ + trial_id = Trial.generate_id() + config = self._search_alg.suggest(trial_id) + if config: + trial = SimpleTrial(config, trial_id) + self.add_trial(trial) + trial.set_status(Trial.RUNNING) + else: trial = None + self.running_trial = trial + return trial diff --git a/flaml/tune/tune.py b/flaml/tune/tune.py new file mode 100644 index 000000000..7ce35e85d --- /dev/null +++ b/flaml/tune/tune.py @@ -0,0 +1,295 @@ +'''! + * Copyright (c) 2020-2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the + * project root for license information. 
+''' +from typing import Optional, Union +import datetime, time +try: + from ray.tune.analysis import ExperimentAnalysis as EA +except: + from .analysis import ExperimentAnalysis as EA + +import logging +logger = logging.getLogger(__name__) + + +_use_ray = True +_runner = None +_verbose = 0 + + +class ExperimentAnalysis(EA): + '''Class for storing the experiment results + ''' + + def __init__(self, trials, metric, mode): + try: + super().__init__(self, None, trials, metric, mode) + except: + self.trials = trials + self.default_metric = metric + self.default_mode = mode + + +def report(_metric=None, **kwargs): + '''A function called by the HPO application to report final or intermediate + results. + + Example: + + .. code-block:: python + + import time + from flaml import tune + + def compute_with_config(config): + current_time = time.time() + metric2minimize = (round(config['x'])-95000)**2 + time2eval = time.time() - current_time + tune.report(metric2minimize=metric2minimize, time2eval=time2eval) + + analysis = tune.run( + compute_with_config, + init_config={}, + config={ + 'x': tune.qloguniform(lower=1, upper=1000000, q=1), + 'y': tune.randint(lower=1, upper=1000000) + }, + metric='metric2minimize', mode='min', + num_samples=1000000, time_budget_s=60, use_ray=False) + + print(analysis.trials[-1].last_result) + + Args: + _metric: Optional default anonymous metric for ``tune.report(value)``. + (For compatibility with ray.tune.report) + **kwargs: Any key value pair to be reported. 
+ ''' + global _use_ray + global _verbose + if _use_ray: + from ray import tune + return tune.report(_metric, **kwargs) + else: + result = kwargs + if _verbose == 2: + logger.info(f"result: {kwargs}") + if _metric: result['_default_anonymous_metric'] = _metric + trial = _runner.running_trial + result['config'] = trial.config + for key, value in trial.config.items(): + result['config/'+key] = value + _runner.process_trial_result(_runner.running_trial, result) + result['time_total_s'] = trial.last_update_time - trial.start_time + if _verbose > 2: + logger.info(f"result: {result}") + if _runner.running_trial.is_finished(): + return None + else: return True + + +def run(training_function, + init_config: dict, + config: Optional[dict] = None, + cat_hp_cost: Optional[dict] = None, + metric: Optional[str] = None, + mode: Optional[str] = None, + time_budget_s: Union[int, float, datetime.timedelta] = None, + prune_attr: Optional[str] = None, + min_resource: Optional[float] = None, + max_resource: Optional[float] = None, + reduction_factor: Optional[float] = None, + report_intermediate_result: Optional[bool] = False, + search_alg = None, + verbose: Optional[int] = 2, + local_dir: Optional[str] = None, + num_samples: Optional[int] = 1, + resources_per_trial: Optional[dict] = None, + mem_size = None, + use_ray: Optional[bool] = False, + ): + '''The trigger for HPO. + + Example: + + .. 
code-block:: python + + import time + from flaml import tune + + def compute_with_config(config): + current_time = time.time() + metric2minimize = (round(config['x'])-95000)**2 + time2eval = time.time() - current_time + tune.report(metric2minimize=metric2minimize, time2eval=time2eval) + + analysis = tune.run( + compute_with_config, + init_config={}, + config={ + 'x': tune.qloguniform(lower=1, upper=1000000, q=1), + 'y': tune.randint(lower=1, upper=1000000) + }, + metric='metric2minimize', mode='min', + num_samples=-1, time_budget_s=60, use_ray=False) + + print(analysis.trials[-1].last_result) + + Args: + training_function: A user-defined training function. + init_config: A dictionary from a subset of controlled dimensions + to the initial low-cost values. e.g., + + .. code-block:: python + + {'epochs': 1} + + If no such dimension, pass an empty dict {}. + config: A dictionary to specify the search space. + cat_hp_cost: A dictionary from a subset of categorical dimensions + to the relative cost of each choice. + e.g., + + .. code-block:: python + + {'tree_method': [1, 1, 2]} + + i.e., the relative cost of the + three choices of 'tree_method' is 1, 1 and 2 respectively + metric: A string of the metric name to optimize for. + mode: A string in ['min', 'max'] to specify the objective as + minimization or maximization. + time_budget_s: A float of the time budget in seconds. + prune_attr: A string of the attribute used for pruning. + Not necessarily in space. + When prune_attr is in space, it is a hyperparameter, e.g., + 'n_iters', and the best value is unknown. + When prune_attr is not in space, it is a resource dimension, + e.g., 'sample_size', and the peak performance is assumed + to be at the max_resource. + min_resource: A float of the minimal resource to use for the + prune_attr; only valid if prune_attr is not in space. + max_resource: A float of the maximal resource to use for the + prune_attr; only valid if prune_attr is not in space. 
+ reduction_factor: A float of the reduction factor used for incremental + pruning. + report_intermediate_result: A boolean of whether intermediate results + are reported. If so, early stopping and pruning can be used. + search_alg: An instance of BlendSearch as the search algorithm + to be used. The same instance can be used for iterative tuning. + e.g., + + .. code-block:: python + + from flaml import BlendSearch + algo = BlendSearch(metric='val_loss', mode='min', + space=search_space, + points_to_evaluate=points_to_evaluate) + for i in range(10): + analysis = tune.run(compute_with_config, init_config=None, + search_alg=algo, use_ray=False) + print(analysis.trials[-1].last_result) + + verbose: 0, 1, 2, or 3. Verbosity mode for ray if ray backend is used. + 0 = silent, 1 = only status updates, 2 = status and brief trial + results, 3 = status and detailed trial results. Defaults to 2. + local_dir: A string of the local dir to save ray logs if ray backend is + used. + num_samples: An integer of the number of configs to try. Defaults to 1. + resources_per_trial: A dictionary of the hardware resources to allocate + per trial, e.g., `{'mem': 1024**3}`. When not using ray backend, + only 'mem' is used as approximate resource constraints + (in conjunction with mem_size). + mem_size: A function to estimate the memory size for a given config. + It is used to skip configs which do not fit in memory. 
+ use_ray: A boolean of whether to use ray as the backend + ''' + global _use_ray + global _verbose + if not use_ray: + _verbose = verbose + if verbose > 0: + import os + os.makedirs(local_dir, exist_ok=True) + logger.addHandler(logging.FileHandler(local_dir+'/tune_'+str( + datetime.datetime.now())+'.log')) + if verbose<=2: + logger.setLevel(logging.INFO) + else: + logger.setLevel(logging.DEBUG) + else: + logger.setLevel(logging.CRITICAL) + + if search_alg is None: + from ..searcher.blendsearch import BlendSearch + search_alg = BlendSearch(points_to_evaluate=[init_config], + metric=metric, mode=mode, + cat_hp_cost=cat_hp_cost, + space=config, prune_attr=prune_attr, + min_resource=min_resource, + max_resource=max_resource, + reduction_factor=reduction_factor, + resources_per_trial=resources_per_trial, + mem_size=mem_size) + if time_budget_s: + search_alg.set_search_properties(metric, mode, config={ + 'time_budget_s':time_budget_s}) + if report_intermediate_result: + params = {} + # scheduler resource_dimension=prune_attr + if prune_attr: params['time_attr'] = prune_attr + if max_resource: params['max_t'] = max_resource + if min_resource: params['grace_period'] = min_resource + if reduction_factor: params['reduction_factor'] = reduction_factor + try: + from ray.tune.schedulers import ASHAScheduler + scheduler = ASHAScheduler(**params) + except: + scheduler = None + else: + scheduler = None + + if use_ray: + try: + from ray import tune + except: + raise ImportError("Failed to import ray tune. 
" + "Please install ray[tune] or set use_ray=False") + _use_ray = True + return tune.run(training_function, + metric=metric, + mode=mode, + search_alg=search_alg, + scheduler=scheduler, + time_budget_s=time_budget_s, + verbose=verbose, + local_dir=local_dir, + num_samples=num_samples, + resources_per_trial=resources_per_trial + ) + + # simple sequential run without using tune.run() from ray + time_start = time.time() + _use_ray = False + if scheduler: + scheduler.set_search_properties(metric=metric, mode=mode) + from .trial_runner import SequentialTrialRunner + global _runner + _runner = SequentialTrialRunner( + search_alg=search_alg, + scheduler=scheduler, + metric=metric, + mode=mode, + ) + num_trials = 0 + while time.time()-time_start=3.6`. To run this notebook example, please install flaml with the `notebook` and `azureml` option:\n", + "```bash\n", + "pip install flaml[notebook,azureml]\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "import mlflow\n", + "from azureml.core import Workspace\n", + "\n", + "ws = Workspace.from_config()\n", + "mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## 2. Real Data Example\n", + "### Load data and preprocess\n", + "\n", + "Download [Airlines dataset](https://www.openml.org/d/1169) from OpenML. The task is to predict whether a given flight will be delayed, given the information of the scheduled departure." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "slideshow": { + "slide_type": "subslide" + }, + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "load dataset from ./openml_ds1169.pkl\n", + "Dataset name: airlines\n", + "X_train.shape: (404537, 7), y_train.shape: (404537,);\n", + "X_test.shape: (134846, 7), y_test.shape: (134846,)\n" + ] + } + ], + "source": [ + "from flaml.data import load_openml_dataset\n", + "X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id = 1169, data_dir = './')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "### Run FLAML\n", + "In the FLAML automl run configuration, users can specify the task type, time budget, error metric, learner list, whether to subsample, resampling strategy type, and so on. All these arguments have default values which will be used if users do not provide them. For example, the default ML learners of FLAML are `['lgbm', 'xgboost', 'catboost', 'rf', 'extra_tree', 'lrl1']`. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "''' import AutoML class from flaml package '''\n", + "from flaml import AutoML\n", + "automl = AutoML()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "settings = {\n", + " \"time_budget\": 60, # total running time in seconds\n", + " \"metric\": 'accuracy', # primary metrics can be chosen from: ['accuracy','roc_auc','f1','log_loss','mae','mse','r2']\n", + " \"estimator_list\": ['lgbm', 'rf', 'xgboost'], # list of ML learners\n", + " \"task\": 'classification', # task type \n", + " \"sample\": False, # whether to subsample training data\n", + " \"log_file_name\": 'airlines_experiment.log', # cache directory of flaml log files \n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[flaml.automl: 02-05 13:30:29] {820} INFO - Evaluation method: holdout\n", + "[flaml.automl: 02-05 13:30:29] {545} INFO - Using StratifiedKFold\n", + "[flaml.automl: 02-05 13:30:29] {841} INFO - Minimizing error metric: 1-accuracy\n", + "[flaml.automl: 02-05 13:30:29] {861} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost']\n", + "[flaml.automl: 02-05 13:30:29] {920} INFO - iteration 0 current learner lgbm\n", + "[flaml.automl: 02-05 13:30:32] {1074} INFO - at 0.9s,\tbest lgbm's error=0.3771,\tbest lgbm's error=0.3771\n", + "[flaml.automl: 02-05 13:30:32] {920} INFO - iteration 1 current learner lgbm\n", + "[flaml.automl: 02-05 13:30:32] {1074} INFO - at 3.3s,\tbest lgbm's error=0.3771,\tbest lgbm's error=0.3771\n", + "[flaml.automl: 02-05 13:30:32] {920} INFO - iteration 2 current learner lgbm\n", + "[flaml.automl: 02-05 
13:30:35] {1074} INFO - at 3.8s,\tbest lgbm's error=0.3751,\tbest lgbm's error=0.3751\n", + "[flaml.automl: 02-05 13:30:35] {920} INFO - iteration 3 current learner xgboost\n", + "[flaml.automl: 02-05 13:30:35] {1074} INFO - at 6.6s,\tbest xgboost's error=0.3753,\tbest lgbm's error=0.3751\n", + "[flaml.automl: 02-05 13:30:35] {920} INFO - iteration 4 current learner lgbm\n", + "[flaml.automl: 02-05 13:30:36] {1074} INFO - at 7.3s,\tbest lgbm's error=0.3751,\tbest lgbm's error=0.3751\n", + "[flaml.automl: 02-05 13:30:36] {920} INFO - iteration 5 current learner lgbm\n", + "[flaml.automl: 02-05 13:30:39] {1074} INFO - at 8.6s,\tbest lgbm's error=0.3558,\tbest lgbm's error=0.3558\n", + "[flaml.automl: 02-05 13:30:39] {920} INFO - iteration 6 current learner lgbm\n", + "[flaml.automl: 02-05 13:30:42] {1074} INFO - at 11.4s,\tbest lgbm's error=0.3492,\tbest lgbm's error=0.3492\n", + "[flaml.automl: 02-05 13:30:42] {920} INFO - iteration 7 current learner lgbm\n", + "[flaml.automl: 02-05 13:30:43] {1074} INFO - at 14.1s,\tbest lgbm's error=0.3492,\tbest lgbm's error=0.3492\n", + "[flaml.automl: 02-05 13:30:43] {920} INFO - iteration 8 current learner lgbm\n", + "[flaml.automl: 02-05 13:30:46] {1074} INFO - at 15.1s,\tbest lgbm's error=0.3470,\tbest lgbm's error=0.3470\n", + "[flaml.automl: 02-05 13:30:46] {920} INFO - iteration 9 current learner xgboost\n", + "[flaml.automl: 02-05 13:30:46] {1074} INFO - at 17.6s,\tbest xgboost's error=0.3753,\tbest lgbm's error=0.3470\n", + "[flaml.automl: 02-05 13:30:46] {920} INFO - iteration 10 current learner xgboost\n", + "[flaml.automl: 02-05 13:30:47] {1074} INFO - at 18.5s,\tbest xgboost's error=0.3753,\tbest lgbm's error=0.3470\n", + "[flaml.automl: 02-05 13:30:47] {920} INFO - iteration 11 current learner rf\n", + "[flaml.automl: 02-05 13:30:52] {1074} INFO - at 22.8s,\tbest rf's error=0.3861,\tbest lgbm's error=0.3470\n", + "[flaml.automl: 02-05 13:30:52] {920} INFO - iteration 12 current learner rf\n", + "[flaml.automl: 
02-05 13:30:55] {1074} INFO - at 26.7s,\tbest rf's error=0.3861,\tbest lgbm's error=0.3470\n", + "[flaml.automl: 02-05 13:30:55] {920} INFO - iteration 13 current learner rf\n", + "[flaml.automl: 02-05 13:31:01] {1074} INFO - at 32.0s,\tbest rf's error=0.3861,\tbest lgbm's error=0.3470\n", + "[flaml.automl: 02-05 13:31:01] {920} INFO - iteration 14 current learner xgboost\n", + "[flaml.automl: 02-05 13:31:02] {1074} INFO - at 32.9s,\tbest xgboost's error=0.3750,\tbest lgbm's error=0.3470\n", + "[flaml.automl: 02-05 13:31:02] {920} INFO - iteration 15 current learner lgbm\n", + "[flaml.automl: 02-05 13:31:03] {1074} INFO - at 34.2s,\tbest lgbm's error=0.3470,\tbest lgbm's error=0.3470\n", + "[flaml.automl: 02-05 13:31:03] {920} INFO - iteration 16 current learner rf\n", + "[flaml.automl: 02-05 13:31:09] {1074} INFO - at 40.2s,\tbest rf's error=0.3861,\tbest lgbm's error=0.3470\n", + "[flaml.automl: 02-05 13:31:09] {920} INFO - iteration 17 current learner lgbm\n", + "[flaml.automl: 02-05 13:31:10] {1074} INFO - at 41.4s,\tbest lgbm's error=0.3470,\tbest lgbm's error=0.3470\n", + "[flaml.automl: 02-05 13:31:10] {920} INFO - iteration 18 current learner lgbm\n", + "[flaml.automl: 02-05 13:31:12] {1074} INFO - at 42.8s,\tbest lgbm's error=0.3470,\tbest lgbm's error=0.3470\n", + "[flaml.automl: 02-05 13:31:12] {920} INFO - iteration 19 current learner lgbm\n", + "[flaml.automl: 02-05 13:31:15] {1074} INFO - at 44.4s,\tbest lgbm's error=0.3412,\tbest lgbm's error=0.3412\n", + "[flaml.automl: 02-05 13:31:15] {920} INFO - iteration 20 current learner lgbm\n", + "[flaml.automl: 02-05 13:31:19] {1074} INFO - at 48.1s,\tbest lgbm's error=0.3374,\tbest lgbm's error=0.3374\n", + "[flaml.automl: 02-05 13:31:19] {920} INFO - iteration 21 current learner lgbm\n", + "[flaml.automl: 02-05 13:31:19] {1074} INFO - at 50.5s,\tbest lgbm's error=0.3374,\tbest lgbm's error=0.3374\n", + "[flaml.automl: 02-05 13:31:19] {920} INFO - iteration 22 current learner lgbm\n", + "[flaml.automl: 
02-05 13:31:25] {1074} INFO - at 54.7s,\tbest lgbm's error=0.3311,\tbest lgbm's error=0.3311\n", + "[flaml.automl: 02-05 13:31:25] {920} INFO - iteration 23 current learner rf\n", + "[flaml.automl: 02-05 13:31:29] {1074} INFO - at 60.2s,\tbest rf's error=0.3861,\tbest lgbm's error=0.3311\n", + "[flaml.automl: 02-05 13:31:29] {1114} INFO - selected model: LGBMClassifier(colsample_bytree=0.9997863921359742,\n", + " learning_rate=0.1564464373197609, max_bin=511,\n", + " min_child_weight=7.427173668000723, n_estimators=18,\n", + " num_leaves=1846, objective='binary',\n", + " reg_alpha=6.349231150788211e-09, reg_lambda=0.8927146483558472)\n", + "[flaml.automl: 02-05 13:31:29] {875} INFO - fit succeeded\n" + ] + } + ], + "source": [ + "'''The main flaml automl API'''\n", + "mlflow.set_experiment(\"flaml\")\n", + "with mlflow.start_run() as run:\n", + " automl.fit(X_train = X_train, y_train = y_train, **settings)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "### Best model and metric" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Best ML leaner: lgbm\nBest hyperparmeter config: {'n_estimators': 18.0, 'max_leaves': 1846.0, 'min_child_weight': 7.427173668000723, 'learning_rate': 0.1564464373197609, 'subsample': 1.0, 'log_max_bin': 9.0, 'colsample_bytree': 0.9997863921359742, 'reg_alpha': 6.349231150788211e-09, 'reg_lambda': 0.8927146483558472}\nBest accuracy on validation data: 0.6689\nTraining duration of best run: 4.18 s\n" + ] + } + ], + "source": [ + "''' retrieve best config and best learner'''\n", + "print('Best ML leaner:', automl.best_estimator)\n", + "print('Best hyperparmeter config:', automl.best_config)\n", + "print('Best accuracy on validation data: {0:.4g}'.format(1-automl.best_loss))\n", + "print('Training 
duration of best run: {0:.4g} s'.format(automl.best_config_train_time))" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "LGBMClassifier(colsample_bytree=0.9997863921359742,\n", + " learning_rate=0.1564464373197609, max_bin=511,\n", + " min_child_weight=7.427173668000723, n_estimators=18,\n", + " num_leaves=1846, objective='binary',\n", + " reg_alpha=6.349231150788211e-09, reg_lambda=0.8927146483558472)" + ] + }, + "metadata": {}, + "execution_count": 22 + } + ], + "source": [ + "automl.model" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "''' pickle and save the best model '''\n", + "import pickle\n", + "with open('best_model.pkl', 'wb') as f:\n", + " pickle.dump(automl.model, f, pickle.HIGHEST_PROTOCOL)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Predicted labels [1 0 1 ... 1 0 0]\nTrue labels [0 0 0 ... 
0 1 0]\n" + ] + } + ], + "source": [ + "''' compute predictions of testing dataset ''' \n", + "y_pred = automl.predict(X_test)\n", + "print('Predicted labels', y_pred)\n", + "print('True labels', y_test)\n", + "y_pred_proba = automl.predict_proba(X_test)[:,1]" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "accuracy = 0.6681918633107397\nroc_auc = 0.7208412179342409\nlog_loss = 0.6064652793713222\nf1 = 0.5838518559855651\n" + ] + } + ], + "source": [ + "''' compute different metric values on testing dataset'''\n", + "from flaml.ml import sklearn_metric_loss_score\n", + "print('accuracy', '=', 1 - sklearn_metric_loss_score('accuracy', y_pred, y_test))\n", + "print('roc_auc', '=', 1 - sklearn_metric_loss_score('roc_auc', y_pred_proba, y_test))\n", + "print('log_loss', '=', sklearn_metric_loss_score('log_loss', y_pred_proba, y_test))\n", + "print('f1', '=', 1 - sklearn_metric_loss_score('f1', y_pred, y_test))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "### Log history" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "slideshow": { + "slide_type": "subslide" + }, + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "{'Current Learner': 'lgbm', 'Current Sample': 364083, 'Current Hyper-parameters': {'n_estimators': 4, 'max_leaves': 4, 'min_child_weight': 20.0, 'learning_rate': 0.1, 'subsample': 1.0, 'log_max_bin': 8, 'colsample_bytree': 1.0, 'reg_alpha': 1e-10, 'reg_lambda': 1.0}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 4, 'max_leaves': 4, 'min_child_weight': 20.0, 'learning_rate': 0.1, 'subsample': 1.0, 'log_max_bin': 8, 'colsample_bytree': 1.0, 'reg_alpha': 1e-10, 'reg_lambda': 1.0}}\n{'Current Learner': 'lgbm', 
'Current Sample': 364083, 'Current Hyper-parameters': {'n_estimators': 4.0, 'max_leaves': 4.0, 'min_child_weight': 20.0, 'learning_rate': 0.46335414315327306, 'subsample': 0.9339389930838808, 'log_max_bin': 10.0, 'colsample_bytree': 0.9904286645657556, 'reg_alpha': 2.841147337412889e-10, 'reg_lambda': 0.12000833497054482}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 4.0, 'max_leaves': 4.0, 'min_child_weight': 20.0, 'learning_rate': 0.46335414315327306, 'subsample': 0.9339389930838808, 'log_max_bin': 10.0, 'colsample_bytree': 0.9904286645657556, 'reg_alpha': 2.841147337412889e-10, 'reg_lambda': 0.12000833497054482}}\n{'Current Learner': 'lgbm', 'Current Sample': 364083, 'Current Hyper-parameters': {'n_estimators': 23.0, 'max_leaves': 4.0, 'min_child_weight': 20.0, 'learning_rate': 1.0, 'subsample': 0.9917683183663918, 'log_max_bin': 10.0, 'colsample_bytree': 0.9858892907525497, 'reg_alpha': 3.8783982645515837e-10, 'reg_lambda': 0.36607431863072826}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 23.0, 'max_leaves': 4.0, 'min_child_weight': 20.0, 'learning_rate': 1.0, 'subsample': 0.9917683183663918, 'log_max_bin': 10.0, 'colsample_bytree': 0.9858892907525497, 'reg_alpha': 3.8783982645515837e-10, 'reg_lambda': 0.36607431863072826}}\n{'Current Learner': 'lgbm', 'Current Sample': 364083, 'Current Hyper-parameters': {'n_estimators': 11.0, 'max_leaves': 17.0, 'min_child_weight': 14.947587304572773, 'learning_rate': 0.6092558236172073, 'subsample': 0.9659256891661986, 'log_max_bin': 10.0, 'colsample_bytree': 1.0, 'reg_alpha': 3.816590663384559e-08, 'reg_lambda': 0.4482946615262561}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 11.0, 'max_leaves': 17.0, 'min_child_weight': 14.947587304572773, 'learning_rate': 0.6092558236172073, 'subsample': 0.9659256891661986, 'log_max_bin': 10.0, 'colsample_bytree': 1.0, 'reg_alpha': 3.816590663384559e-08, 'reg_lambda': 0.4482946615262561}}\n{'Current Learner': 'lgbm', 
'Current Sample': 364083, 'Current Hyper-parameters': {'n_estimators': 7.0, 'max_leaves': 51.0, 'min_child_weight': 20.0, 'learning_rate': 0.8834537640176922, 'subsample': 1.0, 'log_max_bin': 10.0, 'colsample_bytree': 0.9837052481490312, 'reg_alpha': 4.482246955743696e-08, 'reg_lambda': 0.028657570201141073}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 7.0, 'max_leaves': 51.0, 'min_child_weight': 20.0, 'learning_rate': 0.8834537640176922, 'subsample': 1.0, 'log_max_bin': 10.0, 'colsample_bytree': 0.9837052481490312, 'reg_alpha': 4.482246955743696e-08, 'reg_lambda': 0.028657570201141073}}\n{'Current Learner': 'lgbm', 'Current Sample': 364083, 'Current Hyper-parameters': {'n_estimators': 15.0, 'max_leaves': 165.0, 'min_child_weight': 11.09973081317571, 'learning_rate': 1.0, 'subsample': 0.9847553005974036, 'log_max_bin': 9.0, 'colsample_bytree': 0.9508927355861483, 'reg_alpha': 2.031936014930936e-06, 'reg_lambda': 0.00624701632609755}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 15.0, 'max_leaves': 165.0, 'min_child_weight': 11.09973081317571, 'learning_rate': 1.0, 'subsample': 0.9847553005974036, 'log_max_bin': 9.0, 'colsample_bytree': 0.9508927355861483, 'reg_alpha': 2.031936014930936e-06, 'reg_lambda': 0.00624701632609755}}\n{'Current Learner': 'lgbm', 'Current Sample': 364083, 'Current Hyper-parameters': {'n_estimators': 6.0, 'max_leaves': 1073.0, 'min_child_weight': 5.630999649172112, 'learning_rate': 0.32864729892819683, 'subsample': 1.0, 'log_max_bin': 10.0, 'colsample_bytree': 0.99236562733598, 'reg_alpha': 1.978160373587824e-09, 'reg_lambda': 1.0}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 6.0, 'max_leaves': 1073.0, 'min_child_weight': 5.630999649172112, 'learning_rate': 0.32864729892819683, 'subsample': 1.0, 'log_max_bin': 10.0, 'colsample_bytree': 0.99236562733598, 'reg_alpha': 1.978160373587824e-09, 'reg_lambda': 1.0}}\n{'Current Learner': 'lgbm', 'Current Sample': 364083, 'Current 
from flaml.data import get_output_from_log
# Replay the search log written during automl.fit: per-iteration
# wall-clock times, best-so-far validation losses, raw validation
# losses, the evaluated configs, and training losses.
time_history, best_valid_loss_history, valid_loss_history, config_history, train_loss_history = \
    get_output_from_log(filename = settings['log_file_name'], time_budget = 60)

# Each entry records the current and best-so-far hyperparameter configs.
for config in config_history:
    print(config)
", + "image/svg+xml": "\r\n\r\n\r\n\r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n\r\n", + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYgAAAEWCAYAAAB8LwAVAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8GearUAAAgAElEQVR4nO3deZxU5Zn3/8+XZmt2sEHZFFDADRUlLol7oqAxLtGo8cky5jcaZ7KYmJjRzGSbjM8k4ck60ThqjFlcogYVE5a4a4wbBKRBQBGVtVmEZmmapbuv3x91Gou2uimgq6ur6vt+vfpFn/vcp+o6InXVuVdFBGZmZk11yHcAZmbWPjlBmJlZRk4QZmaWkROEmZll5ARhZmYZOUGYmVlGThBme0HSKZIW5jsOs1xygrCCI+ltSR/JZwwR8VxEjM7V60saL+lZSZskrZH0jKTzc/V+Zpk4QZhlIKksj+99CfAA8DtgCLA/8G3gY3vxWpLkf+e2V/w/jhUNSR0k3SDpTUnvSrpfUr+08w9IqpK0Ifl2fkTaubsk/UrSFEk1wBnJk8rXJc1JrvmjpK5J/dMlLUu7vtm6yflvSFopaYWkf5YUkg7JcA8CfgJ8PyLuiIgNEdEQEc9ExFVJne9K+kPaNcOS1+uYHD8t6SZJzwNbgG9KmtHkfb4qaXLyexdJ/0/SEkmrJN0qqXwf/zqsCDhBWDH5MnAhcBowCFgP3Jx2fiowEhgA/AO4u8n1VwA3AT2BvyVllwITgOHAUcA/tfD+GetKmgBcB3wEOCSJrzmjgaHAgy3UycangatJ3cv/AKMljUw7fwVwT/L7D4FRwDFJfINJPbFYiXOCsGLyeeDfI2JZRGwDvgtc0vjNOiLujIhNaeeOltQ77fpHIuL55Bv71qTsFxGxIiLWAY+S+hBtTnN1LwV+ExHzImIL8L0WXmO/5M+VWd91Zncl71cXERuAR4BPAiSJ4lBgcvLEchXw1YhYFxGbgP8LXL6P729FwAnCislBwEOSqiVVA/OBemB/SWWSfpA0P20E3k6uqUi7fmmG16xK+30L0KOF92+u7qAmr53pfRq9m/w5sIU62Wj6HveQJAhSTw8PJ8mqP9ANmJn2321aUm4lzgnCislS4JyI6JP20zUilpP6ULyAVDNPb2BYco3Srs/V0sYrSXU2NxraQt2FpO7j4hbq1JD6UG90QIY6Te/lr0CFpGNIJYrG5qW1QC1wRNp/s94R0VIitBLhBGGFqpOkrmk/HYFbgZskHQQgqb+kC5L6PYFtpL6hdyPVjNJW7geulHSYpG600L4fqfX3rwO+JelKSb2SzveTJd2WVJsNnCrpwKSJ7MbdBRARdaT6NSYC/YDHkvIG4Hbgp5IGAEgaLGn8Xt+tFQ0nCCtUU0h98238+S7wc2Ay8FdJm4AXgROS+r8D3gGWA68l59pEREwFfgE8BSwCXkhObWum/oPAZcDngBXAKuC/SPUjEBGPAX8E5gAzgT9nGco9pJ6gHkgSRqN/S+J6MWl+e5xUZ7mVOHnDILO2JekwYC7QpckHtVm74icIszYg6SJJnSX1JTWs9FEnB2vvnCDM2sbngTXAm6RGVv1LfsMx2z03MZmZWUZ+gjAzs4w65juA1lRRURHDhg3LdxhmZgVj5syZayMi48TIokoQw4YNY8aMGbuvaGZmAEh6p7lzOW1ikjRB0kJJiyTd0Eyd0yXNljRP0jNJ2eikrPFno6Sv5DJWMzPbVc6eIJL19G8GzgKWAa9ImhwRr6XV6QPcAkyIiCWNMzkjYiHJQmfJ6ywHHspVrGZm9n65fII4HlgUEYsjYjtwH6m1cNJdAUyKiCUAEbE6w+t8GHgzIpp9DDIzs9aXywQxmF1XlFyWlKUbBfRNNjiZKekzGV7ncuDeHMVoZmbNyGUntTKUNZ100RE4jtRTQjnwgqQXI+J1AEmdgfNpYTEySVeT2hiFAw88sBXCNjMzyG2CWMauyxo
PIbXwWNM6ayOiBqiR9CxwNPB6cv4c4B8Rsaq5N4mI24DbAMaNG+dZf2ZWMh6etZyJ0xeyorqWQX3KuX78aC4c27ShZu/lsonpFWCkpOHJk8DlpFbaTPcIcIqkjskyyCeQ2uSl0Sdx85KZ2fs8PGs5N06qZHl1LQEsr67lxkmVPDxreau9R84SRLIQ2ReB6aQ+9O+PiHmSrpF0TVJnPqndq+YALwN3RMRcgCRhnAVMylWMZmaFauL0hdTuqN+lrHZHPROnL2y198jpRLmImEJq3f70slubHE8ktYlJ02u38N7+vGZmlmZ5dW3G8hXNlO8Nr8VkZlZgVlTX0rks88f3oD7lrfY+ThBmZgVk2twqzvn5c0jQqWzXwaLlncq4fnzrbQZYVGsxmZkVq9rt9fzXX17j7peWMGZwb37xybG8urQ6p6OYnCDMzNq5BVUb+dI9s3hj9WY+f+oIvnb2aDp37MDwiu6tmhCacoIwM2unIoLfv/gO//WX+fTq2onffe54Th2VcWXunHCCMDNrh9bVbOcbD87h8fmrOGN0fyZ+4mgqenRp0xicIMzM2pm/v7mWr/5xNutrdvDt8w7nyg8NQ8q0elFuOUGYmbUTO+ob+Nnjr3PL028yvKI7v/7sBzhycO+8xeMEYWbWDixdt4Uv3TuL2UuruWzcUL5z/uF065zfj2gnCDOzPHtk9nL+46G5IPjlFWM576hB+Q4JcIIwM8ubmm11fGfyPB6cuYzjDurLzy8/hiF9u+U7rJ2cIMzM8qBy2Qa+fN8s3nm3hi9/eCRfPvMQOjazfEa+OEGYmbWhhobg1397ix9NX0BFjy7ce9WJnDCifa5L6gRhZtZGVm/aytfuf5Xn3ljLhCMO4AcXj6FPt875DqtZThBmZm3gqYWruf6BV9m0tY6bLjqSK44/MC9zG/aEE4SZWQ5tq6vnR9MW8uu/vcWhB/TknqtOZNT+PfMdVlacIMzMcuTNNZv58r2zmLdiI5896SBuPPcwunYqy3dYWXOCMDNrZRHBAzOW8Z3J8+jaqQO3f2YcZx2+f77D2mNOEGZmrWhD7Q7+/aFK/jxnJSeN2I+fXnYMB/Tumu+w9ooThJlZK5n5zjq+fO9sqjZu5frxo7nmtIMp69C+O6Jb4gRhZraP6huCW55axM+eeINBfbry4DUnMfbAvvkOa585QZiZ7YOVG2r5yn2zeemtdVxwzCC+f+GR9OraKd9htQonCDOzvTR9XhX/9qc5bK9r4MefOJqPHzu43c9t2BNOEGZmWXh41nImTl/IiupaBvbuyvCK7jz/5ruMGdybX3xyLMMruuc7xFbnBGFmthsPz1rOjZMqqd1RD8CKDVtZsWErZx46gFs/dRydO7avRfZaS3HelZlZK5o4feHO5JBuYdWmok0O4ARhZrZby6trM5avaKa8WDhBmJm14J13a+jYzFyGQX3K2ziatuUEYWbWjKcWruZj//M3OnfsQOcmm/mUdyrj+vGj8xRZ23CCMDNroqEh+MUTb/C5u15hSN9uTP/KqfzokqMY3KccAYP7lPPfHx/DhWMH5zvUnPIoJjOzNBtqd/C1+2fz+PzVfHzsYG66aAzlncsY2q9b0SeEppwgzMwSC6s2cc0fZrJ03Ra+d/4RfOakg4pq4tuecoIwMwP+PGcF33hwDt27dOS+q09k3LB++Q4p75wgzKyk1dU38MNpC7j9ubcYd1Bfbvk/xzKgV2Euz93actpJLWmCpIWSFkm6oZk6p0uaLWmepGfSyvtIelDSAknzJZ2Uy1jNrPSs3byNT//6ZW5/7i0+e9JB3HPViU4OaXL2BCGpDLgZOAtYBrwiaXJEvJZWpw9wCzAhIpZIGpD2Ej8HpkXEJZI6A91yFauZlZ7ZS6v5lz/MZF3Ndn78iaO5+Lgh+Q6p3cllE9PxwKKIWAwg6T7gAuC1tDpXAJMiYglARKxO6vYCTgX+KSnfDmzPYaxmVkLue3kJ335kHgN6deFP//JBjhzcO98
htUu5bGIaDCxNO16WlKUbBfSV9LSkmZI+k5SPANYAv5E0S9IdkopvqUQza1Pb6uq5cdIcbphUyQkj+vHoF092cmhBLhNEprFh0eS4I3Ac8FFgPPAtSaOS8mOBX0XEWKAGaK4P42pJMyTNWLNmTasFb2bFZUV1LZf+74vc+/JSvnDGwdx15fH07d4532G1a7lsYloGDE07HgKsyFBnbUTUADWSngWOBp4DlkXES0m9B2kmQUTEbcBtAOPGjWuagMzM+Puba/nSPbPYVtfA/376OMYfcUC+QyoIuUwQrwAjJQ0HlgOXk+pzSPcI8EtJHYHOwAnATyOiStJSSaMjYiHwYXbtuzAza1bj5j7Lq2vp1bUjm7fVMaJ/D/7308dxcP8e+Q6vYOQsQUREnaQvAtOBMuDOiJgn6Zrk/K0RMV/SNGAO0ADcERFzk5f4EnB3MoJpMXBlrmI1s+LRdHOfjVvr6CD455OHOznsIUUUT6vMuHHjYsaMGfkOw8zaUENDsHLjVt5aU8PitZv54dQF1Gx//+Y+g/uU8/wNZ+YhwvZN0syIGJfpnGdSm1lBWF+zncVra1i8ZjNvra3Z5WdbXcNury/2zX1ywQnCzNqN2u31aR/8m1mclgSqt+zYWa9jB3Hgft0YUdGdU0ZWMLyiB8MrujOif3cuuvl5VmzY+r7XLvbNfXLBCcLM2lRdfQPL1tfy1tqaJAEkTwRrat73wT6wd1eGV3Tno2MG7kwAIyp6MKRvOR3LMo/S/8aEQ3fpg4DS2NwnF5wgzKzVRQRrNm3b5Qlg8ZpUMliybgs76t/r++zVtSMj+vfgxBH7JUkg9TQwrKIb3Trv+UdU454NE6cvZEV1LYP6lHP9+NElt5dDa3CCMLO9tnHrDt7eJQGkOorfWlOzS0dx544dGL5fd0YO6Mn4Iw7Y+TQwvKIHfbt1avU9Fy4cO9gJoRU4QZhZi7bV1bN03Zb3EsDORFDD2s3bdtaTYEjfcoZX9GDcQf2SBJD6GdS7nA4dSnfjnULlBGFm7xsq2pgE3lpbw7L1W2hIGw1f0aMzwyu68+FDBzA8SQIjKroztF83unYqy99NWKvbbYKQ1C8i1rVFMGaWW9kOFe3euYzh/btz9NA+XDh2MCOSJ4FhFd3pXd4pj3dgbSmbJ4iXJM0GfgNMjWKaWWdWhFpjqOiAnl1Kei9mS8kmQYwCPgJ8DvgfSX8E7oqI13MamZk1a1+Hig6v6MHQFoaKmkEWCSJ5YngMeEzSGcAfgH+V9CpwQ0S8kOMYzUrSvgwVHZ7WQbw3Q0XNILs+iP2ATwGfBlaRWkRvMnAM8AAwPJcBmhW7vRkqenYyVPTgHA4VNcvmq8ULwO+BCyNiWVr5DEm35iYss+LioaJWiLJJEKOb65iOiB+2cjxmBctDRa3YZJMg/irpExFRDSCpL3BfRIzPbWhm7VO2Q0W7dS5jeIWHilrhyiZB9G9MDgARsV7SgBzGZJZ3Hipqll2CqJd0YEQsAZB0EOC5EFbwWmOo6JC+5XTyUFErUtkkiH8H/ibpmeT4VODq3IVk1no8VNRs72UzD2KapGOBEwEBX42ItTmPzGwP7MtQ0RHJEtMeKmq2q2y/FtUDq4GuwOGSiIhncxeW2fvt7VDR95qEPFTUbE9kM1Hun4FrgSHAbFJPEi8A3v3bWt3eDBU989D+OzeZ8VBRs9aTzRPEtcAHgBcj4gxJhwLfy21YVuz2dKjoUUN6e6ioWRvLJkFsjYitkpDUJSIWSPLmrgbAw7OWN7u14x4NFe3XjRH9PVTUrD3JJkEsk9QHeJjUgn3rgRW5DcsKwcOzlu+yOfzy6lq+9sCr3PL0IjZvrfNQUbMCl80opouSX78r6SmgNzAtp1FZQbjpL/N3JodG9Q3BW2tr+NhRg3YZKjpsv+507+KhomaFpMV/sZI6AHMi4kiAiHimpfpW/CKCZ15fw23PLmZN2sihdHX1wU8uO6aNIzOz1tZigoiIBkmvps+kttK0va6
BR19dwe3PLWZB1SYO6NWVXl07snFr3fvqDupTnocIzay1ZfPMPxCYJ+lloKaxMCLOz1lU1m5s2rqDe19ewp1/e5uqjVsZvX9P/t8njub8owcxpXLlLn0QAOWdyrh+vMcwmBWDbBKEh7SWoKoNW/nN829xz0tL2LStjpNG7Md/XzyG00f13zmqqHG0UnOjmMyssGXTSe1+hxKyoGojtz27mMmzV9AQwbljBvL5Uw9mzJDeGetfOHawE4JZkcpmJvUm3lu9tTPQCaiJiF65DMzaTkTwwuJ3ue3ZxTy9cA3lncr41IkH8f+dPJyh/brlOzwzy5NsniB6ph9LuhA4PmcRWc40ndR23Vmj6NSxA7c/u5jK5Ruo6NGZr589ik+deBB9unXOd7hmlmdqZjfRli+SXoyIE3MQzz4ZN25czJgxI99htEtNJ7VBamneAEZUdOeqU0dw0djBXsPIrMRImhkR4zKdy6aJ6eNphx2AcXjDoIIzcfrC901qC6Bf9848ft1pXuHUzN4nm1FMH0v7vQ54G7ggmxeXNAH4OVAG3BERP8hQ53TgZ6T6NtZGxGlJ+dvAJlJLjdc1l+EsOyuqazOWr6/Z7uRgZhll0wdx5d68sKQy4GbgLGAZ8IqkyRHxWlqdPsAtwISIWJJhr+szvDlR6xjUp5zlGZKEJ7WZWXN2u0KapN8mH+SNx30l3ZnFax8PLIqIxRGxHbiP9z95XAFMapylHRGrsw/d9sTnTh72vjJPajOzlmSzhOZREVHdeBAR64GxWVw3GFiadrwsKUs3Cugr6WlJMyV9Ju1cAH9NypvdA1vS1ZJmSJqxZs2aLMIqTa8u3UCnMnFAr64IGNynnP/++BjPYTCzZmXTB9FBUt8kMSCpX5bXZWrYbtq53RE4DvgwUA68kIyQeh34UESsSJqdHpO0INM2pxFxG3AbpEYxZRFXyZm9tJrJr67gS2cewtfO9hODmWUnmw/6HwN/l/QgqQ/4S4GbsrhuGTA07XgI799HYhmpjukaoEbSs8DRwOsRsQJSzU6SHiLVZOV9sPdQRHDTX16jokdnPn/awfkOx8wKyG6bmCLid8DFwCpgDfDxiPh9Fq/9CjBS0nBJnYHLgclN6jwCnCKpo6RuwAnAfEndJfUEkNQdOBuYm+1N2Xumz1vFK2+v56tnjaKH92Mwsz2QzTyIE4F5EfHL5LinpBMi4qWWrouIOklfBKaTGuZ6Z0TMk3RNcv7WiJgvaRowB2ggNRR2rqQRwEPJonAdgXsiwpsU7aHtdQ38YOp8Rg7owWXjhu7+AjOzNNl8pfwVcGzacU2GsowiYgowpUnZrU2OJwITm5QtJtXUZPvg7pfe4e13t/Cbf/oAHb2lp5ntoWw+NRRp63FERAPZJRbLow21O/j5E2/woUP24/TR/fMdjpkVoGwSxGJJX5bUKfm5Flic68Bs39zy1CI21O7gm+cetnP/BjOzPZFNgrgG+CCwnNSooxOAq3IZlO2bpeu28Jvn3+biY4dwxKDM+ziYme1ONkttrCY1AgkASeXAecADOYzL9sGPpi+kQwf4uuc8mNk+yKrnUlKZpHMk/Q54C7gst2HZ3pq1ZD2PvrqCq08ZwQG9u+Y7HDMrYC0+QUg6ldR6SR8FXgY+BIyIiC1tEJvtodSkuPlU9OjC1Z4UZ2b7qNknCEnLgB8AzwOHR8TFQK2TQ/s1fV4VM95Zz3WeFGdmraClJqY/kVpc7zLgY8mMZq911E6lJsUtYNT+Pbh03JB8h2NmRaDZBBER1wLDgJ8AZwCvA/0lXSqpR9uEZ9n6w4upSXE3nnuYJ8WZWato8ZMkUp6MiKtIJYsrgAtJ7Spn7cSGLTv4xZNvcPIhFZw+ypPizKx1ZN1QHRE7gEeBR5OhrtZO3Py0J8WZWevbq7aIiMi8wbG1uaXrtnDX829zybFDOHxQr3yHY2ZFxI3VBe6H0xZQ1kHeCMjMWp0TRAH7x5L1/HnOSq461ZPizKz1ZbMfxCjgeuC
g9PoRcWYO47LdiAj+bzIp7vOnjsh3OGZWhLLppH4AuBW4HajPbTiWrWlzU5Pi/vvjY+juSXFmlgPZfLLURcSvch6J7dbDs5YzcfpCllfXUtZBDOzdlUu9U5yZ5Ug2fRCPSvpXSQMl9Wv8yXlktouHZy3nxkmVLK9ODSCrbwje3bydR19dkefIzKxYZfME8dnkz+vTygJww3cbmjh9IbU7dm3h217fwMTpC7lw7OA8RWVmxSyb/SCGt0Ug1rIV1ZmnnjRXbma2r7IZxdQJ+Bfg1KToaeB/k5nV1kYG9Snf2bzUtNzMLBey6YP4FXAccEvyc1xSZm3o+vGj6dJx17+u8k5lXD/eE+TMLDey6YP4QEQcnXb8pKRXcxWQZXbh2ME8uWA1k5NO6cF9yrl+/Gj3P5hZzmSTIOolHRwRbwJIGoHnQ+TFknVbGDO4N49+6eR8h2JmJSCbBHE98JSkxYBIzai+MqdR2fssr65l9tJqvjHBTUpm1jayGcX0hKSRwGhSCWJBRGzLeWS2i6mVKwE458iBeY7EzEpFswlC0pkR8aSkjzc5dbAkImJSjmOzNFPnVnHYwF4Mr+ie71DMrES09ARxGvAk8LEM5wJwgmgjVRu2MvOd9XztrFH5DsXMSkizCSIivpP8+p8R8Vb6OUmePNeGps1NmpfGuHnJzNpONvMg/pSh7MHWDsSaN2VuFaP378khA3rkOxQzKyEt9UEcChwB9G7SD9EL8O40bWT1pq288vY6rv3wyHyHYmYlpqU+iNHAeUAfdu2H2ARclcug7D3T560iAs5185KZtbGW+iAeAR6RdFJEvNCGMVmaqZUrObh/d0a6ecnM2lg2E+VmSfoCqeamnU1LEfG5nEVlALy7eRsvLn6XL5xxCJLyHY6ZlZhsOql/DxwAjAeeAYaQambaLUkTJC2UtEjSDc3UOV3SbEnzJD3T5FyZpFmS/pzN+xWbv762iobw5Dgzy49sEsQhEfEtoCYifgt8FBizu4sklQE3A+cAhwOflHR4kzp9SK0Qe35EHAF8osnLXAvMzyLGojSlciXD9uvGYQN75jsUMytB2SSIxn0fqiUdCfQGhmVx3fHAoohYHBHbgfuAC5rUuQKYFBFLACJideMJSUNIJaM7snivorO+Zjt/f/Ndzhkz0M1LZpYX2SSI2yT1Bb4FTAZeA36UxXWDgaVpx8uSsnSjgL6SnpY0U9Jn0s79DPgG0NDSm0i6WtIMSTPWrFmTRViF4bH5q6hvCM5185KZ5Uk2i/U1foN/hj3bhzrT197I8P7HAR8GyoEXJL1IKnGsjoiZkk7fTXy3AbcBjBs3runrF6wplSsZ0recIwf3yncoZlaiWpood11LF0bET3bz2suAoWnHQ4AVGeqsjYgaoEbSs8DRwLHA+ZLOJTVyqpekP0TEp3bznkVhw5YdPL9oLVd+aLibl8wsb1pqYuqZ/IwjtSf14OTnGlKdzrvzCjBS0nBJnYHLSTVRpXsEOEVSR0ndgBOA+RFxY0QMiYhhyXVPlkpyAHh8/ip21AfnHHlAvkMxsxLW0kS57wFI+itwbERsSo6/CzywuxeOiDpJXwSmA2XAnRExT9I1yflbI2K+pGnAHFJ9DXdExNx9vKeCN3XuSgb17soxQ/vkOxQzK2HZTJQ7ENiedryd7EYxERFTgClNym5tcjwRmNjCazwNPJ3N+xWDTVt38Ozra/nUiQe5ecnM8iqbBPF74GVJD5HqZL4I+F1OoyphTy5Yzfb6Bj56lJuXzCy/shnFdJOkqcApSdGVETErt2GVrimVK9m/VxfGDu2b71DMrMS1NIqpV0RslNQPeDv5aTzXLyLW5T680lKzrY6nF67hk8cfSIcObl4ys/xq6QniHlLLfc9k1/kLSo73ZE6EZeGphavZVtfg0Utm1i60NIrpvORPby/aRqZWVlHRowvjhvXLdyhmZi02MR3b0oUR8Y/WD6d01W6v58kFq7n4uMGUuXnJzNqBlpqYftzCuQD
ObOVYStozr6+mdke9114ys3ajpSamM9oykFI3pbKKft07c/xwNy+ZWfuQzTwIkmW+D2fXHeU8F6KVbN1RzxPzV3H+MYPoWJbNArtmZrm32wQh6TvA6aQSxBRSGwD9DU+WazXPvr6Gmu313jnOzNqVbL6uXkJqOe6qiLiS1GqrXXIaVYmZOreK3uWdOOng/fIdipnZTtkkiNqIaADqJPUCVuM5EK1mW109j7+2irMP359Obl4ys3Ykmz6IGcne0beTmjS3GXg5p1GVkOcXrWXTtjrOHePmJTNrX1qaB/FL4J6I+Nek6NZkae5eETGnTaIrAVMqq+jZtSMfOqQi36GYme2ipSeIN4AfSxoI/BG4NyJmt01YpWF7XQN/nVfFWYfvT+eObl4ys/al2U+liPh5RJwEnAasA34jab6kb0sa1WYRFrEXFr/Lxq11nhxnZu3Sbr+2RsQ7EfHDiBgLXEFqP4j5OY+sBEytXEmPLh05eaSbl8ys/dltgpDUSdLHJN0NTAVeBy7OeWRFrq6+genzqvjwYQPo2qks3+GYmb1PS53UZwGfBD5KatTSfcDVEVHTRrEVtZfeWsf6LTs8Oc7M2q2WOqm/SWpPiK97c6DWN6VyJd06l3H66P75DsXMLCMv1pcH9Q3B9HlVnHGom5fMrP3y2Mo8eOXtdazdvN2jl8ysXXOCyIMplSvp2qmDm5fMrF1zgmhjDQ3B1LlVnD5qAN27ZLXauplZXjhBtLGZS9azZtM2zhlzQL5DMTNrkRNEG5tSuZLOHTtw5qED8h2KmVmLnCDaUENDMG1uFaeO7E/Prp3yHY6ZWYucINrQ7GXVrNywlXPdvGRmBcAJog1NrVxJpzLxkcP3z3coZma75QTRRiKCKZVVnDKyP73cvGRmBcAJoo1ULt/A8upazjnSzUtmVhicINrIlMoqOnYQZ7l5ycwKhBNEG4gIps5dyQcPqaBPt875DsfMLCtOEG3gtZUbeefdLZzr5iUzKyA5TRCSJkhaKGmRpBuaqXO6pNmS5kl6JinrKullSa8m5d/LZZy5NrWyirIO4uwjnCDMrHDkbDEgSWXAzcBZwDLgFUmTI+K1tDp9gFuACRGxRFLj9OJtwJkRseTVAJAAAAuqSURBVFlSJ+BvkqZGxIu5ijdXUqOXVnLiiH706+7mJTMrHLl8gjgeWBQRiyNiO6kd6S5oUucKYFJELAGIiNXJnxERm5M6nZKfyGGsObNw1SYWr63xznFmVnBymSAGA0vTjpclZelGAX0lPS1ppqTPNJ6QVCZpNrAaeCwiXsr0JpKuljRD0ow1a9a08i3suymVVUgw3s1LZlZgcpkglKGs6VNAR+A4Uvtejwe+JWkUQETUR8QxwBDgeElHZnqTiLgtIsZFxLj+/dvf/gpTK1dy/LB+9O/ZJd+hmJntkVwmiGXA0LTjIcCKDHWmRURNRKwFngWOTq8QEdXA08CE3IWaG2+s2sQbqzdz7hg3L5lZ4cllgngFGClpuKTOwOXA5CZ1HgFOkdRRUjfgBGC+pP5JBzaSyoGPAAtyGGtOTJ1bBcAED281swKUs1FMEVEn6YvAdKAMuDMi5km6Jjl/a0TMlzQNmAM0AHdExFxJRwG/TUZCdQDuj4g/5yrWXJlSuZJxB/Vl/15d8x2Kmdkey+melxExBZjSpOzWJscTgYlNyuYAY3MZW64tXrOZBVWb+PZ5h+c7FDOzveKZ1Dni5iUzK3ROEDkyde5Kxh7Yh0F9yvMdipnZXnGCyIEl725h7vKNnOvJcWZWwJwgcmDq3JWAm5fMrLA5QeTAlLlVHDWkN0P7dct3KGZme80JopUtW7+FV5dWe+0lMyt4ThCtbFoyeslbi5pZoXOCaGVTKldy+MBeDKvonu9QzMz2iRNEK1q5oZZ/LKnm3DF+ejCzwucE0Yp2Ni95cT4zKwJOEK1oamUVo/fvycH9e+Q7FDOzfeYE0UpWb9zKK++s4xw3L5lZkXCCaCXT51URgfd+MLO
ikdPVXEvBw7OWM3H6QpZX19Kxg3htxUZG7d8z32GZme0zJ4h98PCs5dw4qZLaHfUA1DUEN06qBODCsU233zYzKyxuYtoHE6cv3JkcGtXuqGfi9IV5isjMrPU4QeyD5dW1GctXNFNuZlZI3MS0FxZWbWrxKcF7QJhZMXCC2ANL123hp4+/zkOzltOjc0c+OmYgT8xfxda6hp11yjuVcf340XmM0sysdThBZGHt5m388slF3P3SO3SQuPqUEVxz2sH07d555yimFdW1DOpTzvXjR7uD2syKghNECzZt3cHtz73FHc8tZuuOei4dN5RrPzKSgb3fa0K6cOxgJwQzK0pOEBls3VHPH158h5ufWsT6LTs4d8wBXHfWaA4Z4CU0zKx0lHyCSG8iGti7K6eN7s+zr69leXUtp4ys4PrxozlqSJ98h2lm1uZKOkE0nei2YsNW7n15KUP7lnP3P5/Ahw6pyHOEZmb5U9LzIDJNdAOobwgnBzMreSWdIJqb0LZyw9Y2jsTMrP0p6QTR3IQ2T3QzMyvxBHH9+NGUdyrbpcwT3czMUkq6k7px/oInupmZvV9JJwjwRDczs+aUdBOTmZk1zwnCzMwycoIwM7OMnCDMzCwjJwgzM8tIEZHvGFqNpDXAO1lUrQDW5jicfCr2+4Piv8divz/wPbYXB0VE/0wniipBZEvSjIgYl+84cqXY7w+K/x6L/f7A91gI3MRkZmYZOUGYmVlGpZogbst3ADlW7PcHxX+PxX5/4Hts90qyD8LMzHavVJ8gzMxsN5wgzMwso5JKEJImSFooaZGkG/IdT2uQdKek1ZLmppX1k/SYpDeSP/vmM8Z9IWmopKckzZc0T9K1SXkx3WNXSS9LejW5x+8l5UVzjwCSyiTNkvTn5LjY7u9tSZWSZkuakZQV9D2WTIKQVAbcDJwDHA58UtLh+Y2qVdwFTGhSdgPwRESMBJ5IjgtVHfC1iDgMOBH4QvL3Vkz3uA04MyKOBo4BJkg6keK6R4Brgflpx8V2fwBnRMQxaXMfCvoeSyZBAMcDiyJicURsB+4DLshzTPssIp4F1jUpvgD4bfL7b4EL2zSoVhQRKyPiH8nvm0h9wAymuO4xImJzctgp+QmK6B4lDQE+CtyRVlw099eCgr7HUkoQg4GlacfLkrJitH9ErITUBywwIM/xtApJw4CxwEsU2T0mzS+zgdXAYxFRbPf4M+AbQENaWTHdH6SS+l8lzZR0dVJW0PdYSjvKKUOZx/gWCEk9gD8BX4mIjVKmv87CFRH1wDGS+gAPSToy3zG1FknnAasjYqak0/MdTw59KCJWSBoAPCZpQb4D2lel9ASxDBiadjwEWJGnWHJtlaSBAMmfq/Mczz6R1IlUcrg7IiYlxUV1j40iohp4mlS/UrHc44eA8yW9Tapp90xJf6B47g+AiFiR/LkaeIhUs3ZB32MpJYhXgJGShkvqDFwOTM5zTLkyGfhs8vtngUfyGMs+UepR4dfA/Ij4SdqpYrrH/smTA5LKgY8ACyiSe4yIGyNiSEQMI/Xv7smI+BRFcn8AkrpL6tn4O3A2MJcCv8eSmkkt6VxSbaFlwJ0RcVOeQ9pnku4FTie1rPAq4DvAw8D9wIHAEuATEdG0I7sgSDoZeA6o5L3262+S6ocolns8ilQHZhmpL233R8R/StqPIrnHRkkT09cj4rxiuj9JI0g9NUCq6f6eiLip0O+xpBKEmZllr5SamMzMbA84QZiZWUZOEGZmlpEThJmZZeQEYWZmGTlBWMGQ9FNJX0k7ni7pjrTjH0u6roXr75J0SfL705Let5m8pE6SfpCsvjk3WWX1nOTc25Iq9iLune/bzPmbkxVAX5NUm/w+W9IlkqY0zpFoTZIGNq6q2sz5zpKelVRKqy1YE04QVkj+DnwQQFIHUnM/jkg7/0Hg+X18j+8DA4EjI+JI4GNAz318zRZFxBci4hjgXODNZDXQYyLiwYg4N5ld3dquA25vIabtpFYfvSwH720
FwgnCCsnzJAmCVGKYC2yS1FdSF+AwYJakb0t6JXkCuE1ZLtwkqRtwFfCliNgGEBGrIuL+DHWvS15/bpOnms9ImpPs7fD7DNd9P3miyOrfXuNTi6RhkhZIuiN5z7slfUTS88nTzvFJ/e5K7RHyilJ7LzS3YvHFwLTkmiOSJ6XZSewjkzoPA/8nmzitOPnx0QpGshBanaQDSSWKF0ityHsSsAGYExHbJf0yIv4TIPmQPg94NIu3OARYEhEbW6ok6TjgSuAEUotAviTpGWA78O+kFm1bK6lfk+t+BPQGroy9m6F6CPAJ4GpSS8dcAZwMnE9qdvmFyfs/GRGfS5qmXpb0eETUpMUxHFjfmASBa4CfR8TdyTI0ZUn5XOADexGnFQk/QVihaXyKaEwQL6Qd/z2pc4aklyRVAmeyazNUazgZeCgiapJ9HCYBpyTv9WBErAVosqTCt4A+EfH5vUwOAG9FRGVENADzSG1EE6SWIRmW1DkbuEGppcOfBrqSWuYh3UBgTdrxC8A3Jf0bcFBE1Cbx1wPbG9cYstLjBGGFprEfYgypb7gvknqC+CDwvKSuwC3AJRExhlQ7e9csX3sRcGAWH4jNNVmJ5peQfwU4rulTxR7alvZ7Q9pxA++1Bgi4OK0f48CISN/FDaCWtP8mEXEPqaeQWmC6pDPT6nYBtu5DzFbAnCCs0DxPqsloXUTUJ9/S+5BKEi/w3gffWqX2kGh29FBTEbGF1Mqxv0iaWhpH+3yqSdVngQsldUtW7ryI1IKCTwCXJgu00SQZTAN+APwlx9/IpwNfaux3kTQ2Q53Xee+Jo3GhucUR8QtSq48elZTvB6yJiB05jNfaMScIKzSVpEYvvdikbENErE1G/NyelD1M6pv7nvgPUs0vr0mam7xGenMMyRaodwEvk1pV9o6ImBUR84CbgGckvQr8pMl1DySxTU6W9c6F75PasnROEv/3m1ZI+iPelHRIUnQZMDdpljoU+F1SfgYwJUdxWgHwaq5mJUjSRcBxEfEfLdSZBNwYEQvbLjJrTzyKyawERcRDjU1hmSRNbA87OZQ2P0GYmVlG7oMwM7OMnCDMzCwjJwgzM8vICcLMzDJygjAzs4z+f1uGjPL173e5AAAAAElFTkSuQmCC\n" + }, + "metadata": { + "needs_background": "light" + } + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "\n", + "plt.title('Learning Curve')\n", + "plt.xlabel('Wall Clock Time (s)')\n", + "plt.ylabel('Validation Accuracy')\n", + "plt.scatter(time_history, 1-np.array(valid_loss_history))\n", + "plt.plot(time_history, 1-np.array(best_valid_loss_history))\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## 3. Customized Learner" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Some experienced automl users may have a preferred model to tune or may already have a reasonably by-hand-tuned model before launching the automl experiment. 
They need to select optimal configurations for the customized model mixed with standard built-in learners. \n", + "\n", + "FLAML can easily incorporate customized/new learners (preferably with sklearn API) provided by users in a real-time manner, as demonstrated below." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "### Example of Regularized Greedy Forest\n", + "\n", + "[Regularized Greedy Forest](https://arxiv.org/abs/1109.0887) (RGF) is a machine learning method currently not included in FLAML. The RGF has many tuning parameters, the most critical of which are: `[max_leaf, n_iter, n_tree_search, opt_interval, min_samples_leaf]`. To run a customized/new learner, the user needs to provide the following information:\n", + "* an implementation of the customized/new learner\n", + "* a list of hyperparameter names and types\n", + "* rough ranges of hyperparameters (i.e., upper/lower bounds)\n", + "* choose initial value corresponding to low cost for cost-related hyperparameters (e.g., initial value for max_leaf and n_iter should be small)\n", + "\n", + "In this example, the above information for RGF is wrapped in a python class called *MyRegularizedGreedyForest* that exposes the hyperparameters." 
''' SKLearnEstimator is the super class for a sklearn learner '''
from flaml.model import SKLearnEstimator
from flaml import tune
from rgf.sklearn import RGFClassifier, RGFRegressor


class MyRegularizedGreedyForest(SKLearnEstimator):
    '''Custom FLAML learner wrapping Regularized Greedy Forest (RGF).

    Maps a FLAML hyperparameter config onto the ``rgf.sklearn``
    estimators and exposes the tunable hyperparameters via
    ``search_space``.
    '''

    # Integer-valued hyperparameters. FLAML's qloguniform sampler may
    # return them as floats, so they are cast to int in __init__.
    _INT_HYPERPARAMS = ('max_leaf', 'n_iter', 'n_tree_search',
                        'opt_interval', 'min_samples_leaf')

    def __init__(self, task = 'binary:logistic', n_jobs = 1, **params):
        '''Constructor

        Args:
            task: A string of the task type, one of
                'binary:logistic', 'multi:softmax', 'regression'
            n_jobs: An integer of the number of parallel threads
            params: A dictionary of the hyperparameter names and values
        '''
        super().__init__(task, **params)

        # task='regression' selects RGFRegressor;
        # 'binary:logistic' and 'multi:softmax' select RGFClassifier
        # (the original comment said 'multiclass:softmax', which does not
        # match the task strings documented above).
        if 'regression' in task:
            self.estimator_class = RGFRegressor
        else:
            self.estimator_class = RGFClassifier

        # Build the kwargs forwarded to the underlying RGF estimator.
        # Only forward hyperparameters that were actually provided, so
        # the learner can also be constructed with RGF's own defaults
        # (the original indexed params[...] directly and raised KeyError
        # when any hyperparameter was missing).
        config = {'n_jobs': n_jobs}
        for name in self._INT_HYPERPARAMS:
            if name in params:
                # convert to int for integer hyperparameters
                config[name] = int(params[name])
        if 'learning_rate' in params:
            config['learning_rate'] = params['learning_rate']
        self.params = config

    @classmethod
    def search_space(cls, data_size, task):
        '''[required method] search space

        Returns:
            A dictionary of the search space.
            Each key is the name of a hyperparameter, and value is a dict
            with its domain and init_value (optional), cat_hp_cost (optional)
            e.g., {'domain': tune.randint(lower=1, upper=10), 'init_value': 1}
        '''
        # Small init_value for max_leaf / n_iter keeps the first trials cheap.
        space = {
            'max_leaf': {
                'domain': tune.qloguniform(lower=4, upper=data_size, q=1),
                'init_value': 4,
            },
            'n_iter': {
                'domain': tune.qloguniform(lower=1, upper=data_size, q=1),
                'init_value': 1,
            },
            'n_tree_search': {
                'domain': tune.qloguniform(lower=1, upper=32768, q=1),
                'init_value': 1,
            },
            'opt_interval': {
                'domain': tune.qloguniform(lower=1, upper=10000, q=1),
                'init_value': 100,
            },
            'learning_rate': {
                'domain': tune.loguniform(lower=0.01, upper=20.0),
            },
            'min_samples_leaf': {
                'domain': tune.qloguniform(lower=1, upper=20, q=1),
                'init_value': 20,
            },
        }
        return space

    @classmethod
    def size(cls, config):
        '''[optional method] memory size of the estimator in bytes

        Args:
            config: the dict of the hyperparameter config

        Returns:
            A float of the memory size required by the estimator to train
            the given config
        '''
        max_leaves = int(round(config['max_leaf']))
        n_estimators = int(round(config['n_iter']))
        # Rough per-tree accounting: 3 values per leaf plus 4 per internal
        # node, 8 bytes each, times the number of trees.
        return (max_leaves * 3 + (max_leaves - 1) * 4 + 1.0) * n_estimators * 8

    @classmethod
    def cost_relative2lgbm(cls):
        '''[optional method] relative cost compared to lightgbm'''
        return 1.0
# Create a fresh AutoML instance and register the custom RGF learner
# under the name 'RGF'; the subsequent fit log shows it competing with
# the built-in learners ('lgbm', 'rf', 'xgboost').
automl = AutoML()
automl.add_learner(learner_name = 'RGF', learner_class = MyRegularizedGreedyForest)
02-05 13:31:41] {1074} INFO - at 9.7s,\tbest lgbm's error=0.3777,\tbest RGF's error=0.3751\n", + "[flaml.automl: 02-05 13:31:41] {920} INFO - iteration 6 current learner lgbm\n", + "[flaml.automl: 02-05 13:31:44] {1074} INFO - at 9.7s,\tbest lgbm's error=0.3669,\tbest lgbm's error=0.3669\n", + "[flaml.automl: 02-05 13:31:44] {920} INFO - iteration 7 current learner lgbm\n", + "[flaml.automl: 02-05 13:31:44] {1074} INFO - at 12.7s,\tbest lgbm's error=0.3669,\tbest lgbm's error=0.3669\n", + "[flaml.automl: 02-05 13:31:44] {920} INFO - iteration 8 current learner lgbm\n", + "[flaml.automl: 02-05 13:31:46] {1074} INFO - at 12.7s,\tbest lgbm's error=0.3662,\tbest lgbm's error=0.3662\n", + "[flaml.automl: 02-05 13:31:46] {920} INFO - iteration 9 current learner lgbm\n", + "[flaml.automl: 02-05 13:31:48] {1074} INFO - at 14.9s,\tbest lgbm's error=0.3636,\tbest lgbm's error=0.3636\n", + "[flaml.automl: 02-05 13:31:48] {920} INFO - iteration 10 current learner lgbm\n", + "[flaml.automl: 02-05 13:31:50] {1074} INFO - at 16.8s,\tbest lgbm's error=0.3621,\tbest lgbm's error=0.3621\n", + "[flaml.automl: 02-05 13:31:50] {920} INFO - iteration 11 current learner lgbm\n", + "[flaml.automl: 02-05 13:31:50] {1074} INFO - at 18.7s,\tbest lgbm's error=0.3621,\tbest lgbm's error=0.3621\n", + "[flaml.automl: 02-05 13:31:50] {920} INFO - iteration 12 current learner lgbm\n", + "[flaml.automl: 02-05 13:31:50] {1074} INFO - at 18.7s,\tbest lgbm's error=0.3621,\tbest lgbm's error=0.3621\n", + "[flaml.automl: 02-05 13:31:50] {920} INFO - iteration 13 current learner lgbm\n", + "[flaml.automl: 02-05 13:31:50] {1074} INFO - at 18.8s,\tbest lgbm's error=0.3621,\tbest lgbm's error=0.3621\n", + "[flaml.automl: 02-05 13:31:50] {920} INFO - iteration 14 current learner lgbm\n", + "[flaml.automl: 02-05 13:31:50] {1074} INFO - at 18.9s,\tbest lgbm's error=0.3621,\tbest lgbm's error=0.3621\n", + "[flaml.automl: 02-05 13:31:50] {920} INFO - iteration 15 current learner lgbm\n", + "[flaml.automl: 02-05 
13:31:50] {1074} INFO - at 18.9s,\tbest lgbm's error=0.3621,\tbest lgbm's error=0.3621\n", + "[flaml.automl: 02-05 13:31:50] {920} INFO - iteration 16 current learner lgbm\n", + "[flaml.automl: 02-05 13:31:50] {1074} INFO - at 19.1s,\tbest lgbm's error=0.3621,\tbest lgbm's error=0.3621\n", + "[flaml.automl: 02-05 13:31:50] {920} INFO - iteration 17 current learner lgbm\n", + "[flaml.automl: 02-05 13:31:51] {1074} INFO - at 20.1s,\tbest lgbm's error=0.3621,\tbest lgbm's error=0.3621\n", + "[flaml.automl: 02-05 13:31:51] {920} INFO - iteration 18 current learner xgboost\n", + "[flaml.automl: 02-05 13:31:51] {1074} INFO - at 20.1s,\tbest xgboost's error=0.3768,\tbest lgbm's error=0.3621\n", + "[flaml.automl: 02-05 13:31:51] {920} INFO - iteration 19 current learner xgboost\n", + "[flaml.automl: 02-05 13:31:51] {1074} INFO - at 20.2s,\tbest xgboost's error=0.3768,\tbest lgbm's error=0.3621\n", + "[flaml.automl: 02-05 13:31:51] {920} INFO - iteration 20 current learner xgboost\n", + "[flaml.automl: 02-05 13:31:51] {1074} INFO - at 20.3s,\tbest xgboost's error=0.3768,\tbest lgbm's error=0.3621\n", + "[flaml.automl: 02-05 13:31:51] {920} INFO - iteration 21 current learner xgboost\n", + "[flaml.automl: 02-05 13:31:51] {1074} INFO - at 20.3s,\tbest xgboost's error=0.3768,\tbest lgbm's error=0.3621\n", + "[flaml.automl: 02-05 13:31:51] {920} INFO - iteration 22 current learner xgboost\n", + "[flaml.automl: 02-05 13:31:51] {1074} INFO - at 20.4s,\tbest xgboost's error=0.3621,\tbest lgbm's error=0.3621\n", + "[flaml.automl: 02-05 13:31:51] {920} INFO - iteration 23 current learner xgboost\n", + "[flaml.automl: 02-05 13:31:51] {1074} INFO - at 20.5s,\tbest xgboost's error=0.3621,\tbest lgbm's error=0.3621\n", + "[flaml.automl: 02-05 13:31:51] {920} INFO - iteration 24 current learner xgboost\n", + "[flaml.automl: 02-05 13:31:52] {1074} INFO - at 20.6s,\tbest xgboost's error=0.3621,\tbest lgbm's error=0.3621\n", + "[flaml.automl: 02-05 13:31:52] {920} INFO - iteration 25 
current learner xgboost\n", + "[flaml.automl: 02-05 13:31:52] {1074} INFO - at 20.7s,\tbest xgboost's error=0.3621,\tbest lgbm's error=0.3621\n", + "[flaml.automl: 02-05 13:31:52] {920} INFO - iteration 26 current learner xgboost\n", + "[flaml.automl: 02-05 13:31:52] {1074} INFO - at 20.8s,\tbest xgboost's error=0.3621,\tbest lgbm's error=0.3621\n", + "[flaml.automl: 02-05 13:31:52] {920} INFO - iteration 27 current learner xgboost\n", + "[flaml.automl: 02-05 13:31:55] {1074} INFO - at 21.1s,\tbest xgboost's error=0.3611,\tbest xgboost's error=0.3611\n", + "[flaml.automl: 02-05 13:31:55] {920} INFO - iteration 28 current learner xgboost\n", + "[flaml.automl: 02-05 13:31:55] {1074} INFO - at 24.1s,\tbest xgboost's error=0.3611,\tbest xgboost's error=0.3611\n", + "[flaml.automl: 02-05 13:31:55] {920} INFO - iteration 29 current learner lgbm\n", + "[flaml.automl: 02-05 13:31:56] {1074} INFO - at 24.8s,\tbest lgbm's error=0.3618,\tbest xgboost's error=0.3611\n", + "[flaml.automl: 02-05 13:31:56] {920} INFO - iteration 30 current learner RGF\n", + "[flaml.automl: 02-05 13:31:57] {1074} INFO - at 25.8s,\tbest RGF's error=0.3751,\tbest xgboost's error=0.3611\n", + "[flaml.automl: 02-05 13:31:57] {920} INFO - iteration 31 current learner xgboost\n", + "[flaml.automl: 02-05 13:31:57] {1074} INFO - at 26.2s,\tbest xgboost's error=0.3611,\tbest xgboost's error=0.3611\n", + "[flaml.automl: 02-05 13:31:57] {920} INFO - iteration 32 current learner xgboost\n", + "[flaml.automl: 02-05 13:32:00] {1074} INFO - at 27.2s,\tbest xgboost's error=0.3523,\tbest xgboost's error=0.3523\n", + "[flaml.automl: 02-05 13:32:00] {920} INFO - iteration 33 current learner xgboost\n", + "[flaml.automl: 02-05 13:32:00] {1074} INFO - at 29.5s,\tbest xgboost's error=0.3523,\tbest xgboost's error=0.3523\n", + "[flaml.automl: 02-05 13:32:00] {920} INFO - iteration 34 current learner xgboost\n", + "[flaml.automl: 02-05 13:32:01] {1074} INFO - at 30.2s,\tbest xgboost's error=0.3523,\tbest xgboost's 
error=0.3523\n", + "[flaml.automl: 02-05 13:32:01] {920} INFO - iteration 35 current learner xgboost\n", + "[flaml.automl: 02-05 13:32:05] {1074} INFO - at 32.3s,\tbest xgboost's error=0.3503,\tbest xgboost's error=0.3503\n", + "[flaml.automl: 02-05 13:32:05] {920} INFO - iteration 36 current learner rf\n", + "[flaml.automl: 02-05 13:32:06] {1074} INFO - at 34.8s,\tbest rf's error=0.3998,\tbest xgboost's error=0.3503\n", + "[flaml.automl: 02-05 13:32:06] {920} INFO - iteration 37 current learner rf\n", + "[flaml.automl: 02-05 13:32:06] {1074} INFO - at 35.2s,\tbest rf's error=0.3998,\tbest xgboost's error=0.3503\n", + "[flaml.automl: 02-05 13:32:06] {920} INFO - iteration 38 current learner rf\n", + "[flaml.automl: 02-05 13:32:07] {1074} INFO - at 35.6s,\tbest rf's error=0.3998,\tbest xgboost's error=0.3503\n", + "[flaml.automl: 02-05 13:32:07] {920} INFO - iteration 39 current learner rf\n", + "[flaml.automl: 02-05 13:32:07] {1074} INFO - at 35.9s,\tbest rf's error=0.3998,\tbest xgboost's error=0.3503\n", + "[flaml.automl: 02-05 13:32:07] {920} INFO - iteration 40 current learner rf\n", + "[flaml.automl: 02-05 13:32:07] {1074} INFO - at 36.4s,\tbest rf's error=0.3998,\tbest xgboost's error=0.3503\n", + "[flaml.automl: 02-05 13:32:07] {920} INFO - iteration 41 current learner xgboost\n", + "[flaml.automl: 02-05 13:32:08] {1074} INFO - at 37.3s,\tbest xgboost's error=0.3503,\tbest xgboost's error=0.3503\n", + "[flaml.automl: 02-05 13:32:08] {920} INFO - iteration 42 current learner RGF\n", + "[flaml.automl: 02-05 13:32:09] {1074} INFO - at 38.4s,\tbest RGF's error=0.3751,\tbest xgboost's error=0.3503\n", + "[flaml.automl: 02-05 13:32:09] {920} INFO - iteration 43 current learner xgboost\n", + "[flaml.automl: 02-05 13:32:11] {1074} INFO - at 39.9s,\tbest xgboost's error=0.3503,\tbest xgboost's error=0.3503\n", + "[flaml.automl: 02-05 13:32:11] {920} INFO - iteration 44 current learner xgboost\n", + "[flaml.automl: 02-05 13:32:12] {1074} INFO - at 41.4s,\tbest 
xgboost's error=0.3503,\tbest xgboost's error=0.3503\n", + "[flaml.automl: 02-05 13:32:12] {920} INFO - iteration 45 current learner xgboost\n", + "[flaml.automl: 02-05 13:32:13] {1074} INFO - at 42.4s,\tbest xgboost's error=0.3503,\tbest xgboost's error=0.3503\n", + "[flaml.automl: 02-05 13:32:13] {920} INFO - iteration 46 current learner rf\n", + "[flaml.automl: 02-05 13:32:14] {1074} INFO - at 43.3s,\tbest rf's error=0.3954,\tbest xgboost's error=0.3503\n", + "[flaml.automl: 02-05 13:32:14] {920} INFO - iteration 47 current learner xgboost\n", + "[flaml.automl: 02-05 13:32:15] {1074} INFO - at 43.7s,\tbest xgboost's error=0.3503,\tbest xgboost's error=0.3503\n", + "[flaml.automl: 02-05 13:32:15] {920} INFO - iteration 48 current learner RGF\n", + "[flaml.automl: 02-05 13:32:32] {1074} INFO - at 60.7s,\tbest RGF's error=0.3572,\tbest xgboost's error=0.3503\n", + "[flaml.automl: 02-05 13:32:32] {1114} INFO - selected model: XGBClassifier(base_score=0.5, booster='gbtree',\n", + " colsample_bylevel=0.9320142747883016, colsample_bynode=1,\n", + " colsample_bytree=0.9700688784239055, gamma=0, gpu_id=-1,\n", + " grow_policy='lossguide', importance_type='gain',\n", + " interaction_constraints=None, learning_rate=0.1803601073103824,\n", + " max_delta_step=0, max_depth=0, max_leaves=22,\n", + " min_child_weight=14.640318514250904, missing=nan,\n", + " monotone_constraints=None, n_estimators=143, n_jobs=-1,\n", + " num_parallel_tree=1, random_state=0,\n", + " reg_alpha=1.986243711660331e-08, reg_lambda=0.19460138956942644,\n", + " scale_pos_weight=1, subsample=1.0, tree_method='hist',\n", + " validate_parameters=False, verbosity=0)\n", + "[flaml.automl: 02-05 13:32:32] {875} INFO - fit succeeded\n" + ] + } + ], + "source": [ + "settings = {\n", + " \"time_budget\": 60, # total running time in seconds\n", + " \"metric\": 'accuracy', \n", + " \"estimator_list\": ['RGF', 'lgbm', 'rf', 'xgboost'], # list of ML learners\n", + " \"task\": 'classification', # task type \n", + " 
\"sample\": True, # whether to subsample training data\n", + " \"log_file_name\": 'airlines_experiment.log', # cache directory of flaml log files \n", + " \"log_training_metric\": True, # whether to log training metric\n", + "}\n", + "\n", + "mlflow.set_experiment(\"flaml_custom\")\n", + "with mlflow.start_run() as run:\n", + " '''The main flaml automl API'''\n", + " automl.fit(X_train = X_train, y_train = y_train, **settings)" + ] + } + ], + "metadata": { + "kernelspec": { + "name": "python3", + "display_name": "Python 3.7.7 64-bit ('flaml': conda)", + "metadata": { + "interpreter": { + "hash": "bfcd9a6a9254a5e160761a1fd7a9e444f011592c6770d9f4180dde058a9df5dd" + } + } + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7-final" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/notebook/flaml_demo.ipynb b/notebook/flaml_demo.ipynb index cde79fa37..ff8ccc345 100644 --- a/notebook/flaml_demo.ipynb +++ b/notebook/flaml_demo.ipynb @@ -8,7 +8,7 @@ } }, "source": [ - "Copyright (c) 2020 Microsoft Corporation. All rights reserved. \n", + "Copyright (c) 2020-2021 Microsoft Corporation. All rights reserved. \n", "\n", "Licensed under the MIT License.\n", "\n", @@ -27,7 +27,7 @@ "\n", "In this notebook, we use one real data example (binary classification) to showcase how to ues FLAML library.\n", "\n", - "FLAML requires `Python>=3.6`. To run this notebook example, please install flaml with the [notebook] option:\n", + "FLAML requires `Python>=3.6`. 
To run this notebook example, please install flaml with the `notebook` option:\n", "```bash\n", "pip install flaml[notebook]\n", "```" @@ -61,10 +61,7 @@ "output_type": "stream", "name": "stdout", "text": [ - "load dataset from ./openml_ds1169.pkl\n", - "Dataset name: airlines\n", - "X_train.shape: (404537, 7), y_train.shape: (404537,);\n", - "X_test.shape: (134846, 7), y_test.shape: (134846,)\n" + "load dataset from ./openml_ds1169.pkl\nDataset name: airlines\nX_train.shape: (404537, 7), y_train.shape: (404537,);\nX_test.shape: (134846, 7), y_test.shape: (134846,)\n" ] } ], @@ -134,46 +131,64 @@ "output_type": "stream", "name": "stderr", "text": [ - "[flaml.automl: 12-15 07:41:38] {660} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost']\n", - "[flaml.automl: 12-15 07:41:38] {665} INFO - Evaluation method: holdout\n", - "[flaml.automl: 12-15 07:41:38] {683} INFO - Minimizing error metric: 1-accuracy\n", - "[flaml.automl: 12-15 07:41:39] {327} INFO - Using StratifiedKFold\n", - "[flaml.automl: 12-15 07:41:39] {728} INFO - iteration 0 current learner lgbm\n", - "[flaml.automl: 12-15 07:41:41] {793} INFO - at 3.6s,\tbest lgbm's error=0.3748,\tbest lgbm's error=0.3748\n", - "[flaml.automl: 12-15 07:41:41] {728} INFO - iteration 1 current learner lgbm\n", - "[flaml.automl: 12-15 07:41:45] {793} INFO - at 7.5s,\tbest lgbm's error=0.3735,\tbest lgbm's error=0.3735\n", - "[flaml.automl: 12-15 07:41:45] {728} INFO - iteration 2 current learner lgbm\n", - "[flaml.automl: 12-15 07:41:47] {793} INFO - at 9.2s,\tbest lgbm's error=0.3668,\tbest lgbm's error=0.3668\n", - "[flaml.automl: 12-15 07:41:47] {728} INFO - iteration 3 current learner lgbm\n", - "[flaml.automl: 12-15 07:41:49] {793} INFO - at 11.4s,\tbest lgbm's error=0.3613,\tbest lgbm's error=0.3613\n", - "[flaml.automl: 12-15 07:41:49] {728} INFO - iteration 4 current learner lgbm\n", - "[flaml.automl: 12-15 07:41:53] {793} INFO - at 15.0s,\tbest lgbm's error=0.3613,\tbest lgbm's error=0.3613\n", - 
"[flaml.automl: 12-15 07:41:53] {728} INFO - iteration 5 current learner xgboost\n", - "[flaml.automl: 12-15 07:41:56] {793} INFO - at 18.1s,\tbest xgboost's error=0.3740,\tbest lgbm's error=0.3613\n", - "[flaml.automl: 12-15 07:41:56] {728} INFO - iteration 6 current learner lgbm\n", - "[flaml.automl: 12-15 07:42:00] {793} INFO - at 22.7s,\tbest lgbm's error=0.3613,\tbest lgbm's error=0.3613\n", - "[flaml.automl: 12-15 07:42:00] {728} INFO - iteration 7 current learner xgboost\n", - "[flaml.automl: 12-15 07:42:02] {793} INFO - at 24.8s,\tbest xgboost's error=0.3659,\tbest lgbm's error=0.3613\n", - "[flaml.automl: 12-15 07:42:02] {728} INFO - iteration 8 current learner lgbm\n", - "[flaml.automl: 12-15 07:42:11] {793} INFO - at 33.0s,\tbest lgbm's error=0.3544,\tbest lgbm's error=0.3544\n", - "[flaml.automl: 12-15 07:42:11] {728} INFO - iteration 9 current learner rf\n", - "[flaml.automl: 12-15 07:42:20] {793} INFO - at 41.9s,\tbest rf's error=0.3895,\tbest lgbm's error=0.3544\n", - "[flaml.automl: 12-15 07:42:20] {728} INFO - iteration 10 current learner xgboost\n", - "[flaml.automl: 12-15 07:42:24] {793} INFO - at 45.8s,\tbest xgboost's error=0.3659,\tbest lgbm's error=0.3544\n", - "[flaml.automl: 12-15 07:42:24] {728} INFO - iteration 11 current learner lgbm\n", - "[flaml.automl: 12-15 07:42:29] {793} INFO - at 51.5s,\tbest lgbm's error=0.3410,\tbest lgbm's error=0.3410\n", - "[flaml.automl: 12-15 07:42:29] {728} INFO - iteration 12 current learner rf\n", - "[flaml.automl: 12-15 07:42:29] {793} INFO - at 51.5s,\tbest rf's error=0.3895,\tbest lgbm's error=0.3410\n", - "[flaml.automl: 12-15 07:42:29] {728} INFO - iteration 13 current learner lgbm\n", - "[flaml.automl: 12-15 07:42:35] {793} INFO - at 57.1s,\tbest lgbm's error=0.3383,\tbest lgbm's error=0.3383\n", - "[flaml.automl: 12-15 07:42:35] {728} INFO - iteration 14 current learner xgboost\n", - "[flaml.automl: 12-15 07:42:38] {793} INFO - at 60.4s,\tbest xgboost's error=0.3659,\tbest lgbm's error=0.3383\n", 
- "[flaml.automl: 12-15 07:42:38] {814} INFO - LGBMClassifier(learning_rate=0.5482637744255212, max_bin=1023,\n", - " min_child_weight=1.1930700595990091, n_estimators=76,\n", - " num_leaves=67, objective='binary',\n", - " reg_alpha=3.668052110134859e-10, reg_lambda=0.49371485228257217,\n", - " subsample=0.6)\n", - "[flaml.automl: 12-15 07:42:38] {702} INFO - fit succeeded\n" + "[flaml.automl: 01-31 05:20:44] {816} INFO - Evaluation method: holdout\n", + "[flaml.automl: 01-31 05:20:44] {541} INFO - Using StratifiedKFold\n", + "[flaml.automl: 01-31 05:20:44] {837} INFO - Minimizing error metric: 1-accuracy\n", + "[flaml.automl: 01-31 05:20:44] {857} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost']\n", + "[flaml.automl: 01-31 05:20:44] {916} INFO - iteration 0 current learner lgbm\n", + "[flaml.automl: 01-31 05:20:45] {1046} INFO - at 0.9s,\tbest lgbm's error=0.3771,\tbest lgbm's error=0.3771\n", + "[flaml.automl: 01-31 05:20:45] {916} INFO - iteration 1 current learner lgbm\n", + "[flaml.automl: 01-31 05:20:45] {1046} INFO - at 1.6s,\tbest lgbm's error=0.3771,\tbest lgbm's error=0.3771\n", + "[flaml.automl: 01-31 05:20:45] {916} INFO - iteration 2 current learner lgbm\n", + "[flaml.automl: 01-31 05:20:46] {1046} INFO - at 2.7s,\tbest lgbm's error=0.3751,\tbest lgbm's error=0.3751\n", + "[flaml.automl: 01-31 05:20:46] {916} INFO - iteration 3 current learner xgboost\n", + "[flaml.automl: 01-31 05:20:49] {1046} INFO - at 5.8s,\tbest xgboost's error=0.3753,\tbest lgbm's error=0.3751\n", + "[flaml.automl: 01-31 05:20:49] {916} INFO - iteration 4 current learner rf\n", + "[flaml.automl: 01-31 05:20:57] {1046} INFO - at 13.5s,\tbest rf's error=0.3850,\tbest lgbm's error=0.3751\n", + "[flaml.automl: 01-31 05:20:57] {916} INFO - iteration 5 current learner lgbm\n", + "[flaml.automl: 01-31 05:20:58] {1046} INFO - at 14.5s,\tbest lgbm's error=0.3751,\tbest lgbm's error=0.3751\n", + "[flaml.automl: 01-31 05:20:58] {916} INFO - iteration 6 current learner 
lgbm\n", + "[flaml.automl: 01-31 05:21:00] {1046} INFO - at 16.2s,\tbest lgbm's error=0.3558,\tbest lgbm's error=0.3558\n", + "[flaml.automl: 01-31 05:21:00] {916} INFO - iteration 7 current learner lgbm\n", + "[flaml.automl: 01-31 05:21:01] {1046} INFO - at 17.8s,\tbest lgbm's error=0.3492,\tbest lgbm's error=0.3492\n", + "[flaml.automl: 01-31 05:21:01] {916} INFO - iteration 8 current learner lgbm\n", + "[flaml.automl: 01-31 05:21:03] {1046} INFO - at 19.2s,\tbest lgbm's error=0.3492,\tbest lgbm's error=0.3492\n", + "[flaml.automl: 01-31 05:21:03] {916} INFO - iteration 9 current learner lgbm\n", + "[flaml.automl: 01-31 05:21:05] {1046} INFO - at 20.9s,\tbest lgbm's error=0.3470,\tbest lgbm's error=0.3470\n", + "[flaml.automl: 01-31 05:21:05] {916} INFO - iteration 10 current learner lgbm\n", + "[flaml.automl: 01-31 05:21:06] {1046} INFO - at 22.4s,\tbest lgbm's error=0.3470,\tbest lgbm's error=0.3470\n", + "[flaml.automl: 01-31 05:21:06] {916} INFO - iteration 11 current learner lgbm\n", + "[flaml.automl: 01-31 05:21:08] {1046} INFO - at 23.9s,\tbest lgbm's error=0.3470,\tbest lgbm's error=0.3470\n", + "[flaml.automl: 01-31 05:21:08] {916} INFO - iteration 12 current learner rf\n", + "[flaml.automl: 01-31 05:21:12] {1046} INFO - at 28.8s,\tbest rf's error=0.3843,\tbest lgbm's error=0.3470\n", + "[flaml.automl: 01-31 05:21:12] {916} INFO - iteration 13 current learner lgbm\n", + "[flaml.automl: 01-31 05:21:14] {1046} INFO - at 30.3s,\tbest lgbm's error=0.3470,\tbest lgbm's error=0.3470\n", + "[flaml.automl: 01-31 05:21:14] {916} INFO - iteration 14 current learner xgboost\n", + "[flaml.automl: 01-31 05:21:16] {1046} INFO - at 32.0s,\tbest xgboost's error=0.3753,\tbest lgbm's error=0.3470\n", + "[flaml.automl: 01-31 05:21:16] {916} INFO - iteration 15 current learner lgbm\n", + "[flaml.automl: 01-31 05:21:19] {1046} INFO - at 35.0s,\tbest lgbm's error=0.3412,\tbest lgbm's error=0.3412\n", + "[flaml.automl: 01-31 05:21:19] {916} INFO - iteration 16 current learner 
xgboost\n", + "[flaml.automl: 01-31 05:21:20] {1046} INFO - at 36.8s,\tbest xgboost's error=0.3753,\tbest lgbm's error=0.3412\n", + "[flaml.automl: 01-31 05:21:20] {916} INFO - iteration 17 current learner lgbm\n", + "[flaml.automl: 01-31 05:21:24] {1046} INFO - at 40.6s,\tbest lgbm's error=0.3374,\tbest lgbm's error=0.3374\n", + "[flaml.automl: 01-31 05:21:24] {916} INFO - iteration 18 current learner xgboost\n", + "[flaml.automl: 01-31 05:21:26] {1046} INFO - at 42.3s,\tbest xgboost's error=0.3750,\tbest lgbm's error=0.3374\n", + "[flaml.automl: 01-31 05:21:26] {916} INFO - iteration 19 current learner lgbm\n", + "[flaml.automl: 01-31 05:21:27] {1046} INFO - at 43.6s,\tbest lgbm's error=0.3374,\tbest lgbm's error=0.3374\n", + "[flaml.automl: 01-31 05:21:27] {916} INFO - iteration 20 current learner lgbm\n", + "[flaml.automl: 01-31 05:21:35] {1046} INFO - at 51.5s,\tbest lgbm's error=0.3311,\tbest lgbm's error=0.3311\n", + "[flaml.automl: 01-31 05:21:35] {916} INFO - iteration 21 current learner lgbm\n", + "[flaml.automl: 01-31 05:21:41] {1046} INFO - at 56.9s,\tbest lgbm's error=0.3311,\tbest lgbm's error=0.3311\n", + "[flaml.automl: 01-31 05:21:41] {916} INFO - iteration 22 current learner rf\n", + "[flaml.automl: 01-31 05:21:41] {1048} INFO - no enough budget for learner rf\n", + "[flaml.automl: 01-31 05:21:41] {916} INFO - iteration 23 current learner xgboost\n", + "[flaml.automl: 01-31 05:21:41] {1048} INFO - no enough budget for learner xgboost\n", + "[flaml.automl: 01-31 05:21:41] {1086} INFO - selected model: LGBMClassifier(colsample_bytree=0.9997863921359742,\n", + " learning_rate=0.1564464373197609, max_bin=511,\n", + " min_child_weight=7.427173668000723, n_estimators=18,\n", + " num_leaves=1846, objective='binary',\n", + " reg_alpha=6.349231150788211e-09, reg_lambda=0.8927146483558472)\n", + "[flaml.automl: 01-31 05:21:41] {871} INFO - fit succeeded\n" ] } ], @@ -207,7 +222,7 @@ "output_type": "stream", "name": "stdout", "text": [ - "Best ML leaner: 
lgbm\nBest hyperparmeter config: {'n_estimators': 76.23660313632638, 'max_leaves': 66.93360726547702, 'min_child_weight': 1.1930700595990091, 'learning_rate': 0.5482637744255212, 'subsample': 0.6, 'log_max_bin': 10.0, 'reg_alpha': 3.668052110134859e-10, 'reg_lambda': 0.49371485228257217, 'colsample_bytree': 1.0}\nBest accuracy on validation data: 0.6617\nTraining duration of best run: 5.522 s\n" + "Best ML leaner: lgbm\nBest hyperparmeter config: {'n_estimators': 18.0, 'max_leaves': 1846.0, 'min_child_weight': 7.427173668000723, 'learning_rate': 0.1564464373197609, 'subsample': 1.0, 'log_max_bin': 9.0, 'colsample_bytree': 0.9997863921359742, 'reg_alpha': 6.349231150788211e-09, 'reg_lambda': 0.8927146483558472}\nBest accuracy on validation data: 0.6689\nTraining duration of best run: 7.89 s\n" ] } ], @@ -232,11 +247,11 @@ "output_type": "execute_result", "data": { "text/plain": [ - "LGBMClassifier(learning_rate=0.5482637744255212, max_bin=1023,\n", - " min_child_weight=1.1930700595990091, n_estimators=76,\n", - " num_leaves=67, objective='binary',\n", - " reg_alpha=3.668052110134859e-10, reg_lambda=0.49371485228257217,\n", - " subsample=0.6)" + "LGBMClassifier(colsample_bytree=0.9997863921359742,\n", + " learning_rate=0.1564464373197609, max_bin=511,\n", + " min_child_weight=7.427173668000723, n_estimators=18,\n", + " num_leaves=1846, objective='binary',\n", + " reg_alpha=6.349231150788211e-09, reg_lambda=0.8927146483558472)" ] }, "metadata": {}, @@ -303,10 +318,10 @@ "output_type": "stream", "name": "stdout", "text": [ - "accuracy = 0.6666493629770256\n", - "roc_auc = 0.7173397375696496\n", - "log_loss = 0.6095801351363471\n", - "f1 = 0.580528363863719\n" + "accuracy = 0.6681918633107397\n", + "roc_auc = 0.7208412179342409\n", + "log_loss = 0.6064652793713222\n", + "f1 = 0.5838518559855651\n" ] } ], @@ -344,7 +359,7 @@ "output_type": "stream", "name": "stdout", "text": [ - "{'Current Learner': 'lgbm', 'Current Sample': 364083, 'Current Hyper-parameters': 
{'n_estimators': 4, 'max_leaves': 4, 'min_child_weight': 20, 'learning_rate': 0.1, 'subsample': 1.0, 'log_max_bin': 8, 'reg_alpha': 1e-10, 'reg_lambda': 1.0, 'colsample_bytree': 1.0}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 4, 'max_leaves': 4, 'min_child_weight': 20, 'learning_rate': 0.1, 'subsample': 1.0, 'log_max_bin': 8, 'reg_alpha': 1e-10, 'reg_lambda': 1.0, 'colsample_bytree': 1.0}}\n{'Current Learner': 'lgbm', 'Current Sample': 364083, 'Current Hyper-parameters': {'n_estimators': 4.345841756255061, 'max_leaves': 10.353390566270846, 'min_child_weight': 20.0, 'learning_rate': 0.04742496726415123, 'subsample': 0.9045133325444861, 'log_max_bin': 10.0, 'reg_alpha': 1e-10, 'reg_lambda': 1.0, 'colsample_bytree': 0.9407474408255333}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 4, 'max_leaves': 4, 'min_child_weight': 20, 'learning_rate': 0.1, 'subsample': 1.0, 'log_max_bin': 8, 'reg_alpha': 1e-10, 'reg_lambda': 1.0, 'colsample_bytree': 1.0}}\n{'Current Learner': 'lgbm', 'Current Sample': 364083, 'Current Hyper-parameters': {'n_estimators': 4.0, 'max_leaves': 4.0, 'min_child_weight': 9.874086709908818, 'learning_rate': 0.21085939699865755, 'subsample': 1.0, 'log_max_bin': 3.0, 'reg_alpha': 2.6875093824678297e-10, 'reg_lambda': 0.7230542131309051, 'colsample_bytree': 1.0}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 4.0, 'max_leaves': 4.0, 'min_child_weight': 9.874086709908818, 'learning_rate': 0.21085939699865755, 'subsample': 1.0, 'log_max_bin': 3.0, 'reg_alpha': 2.6875093824678297e-10, 'reg_lambda': 0.7230542131309051, 'colsample_bytree': 1.0}}\n{'Current Learner': 'lgbm', 'Current Sample': 364083, 'Current Hyper-parameters': {'n_estimators': 6.30703808576676, 'max_leaves': 4.615126183980338, 'min_child_weight': 5.419442970309873, 'learning_rate': 0.45611181052279925, 'subsample': 1.0, 'log_max_bin': 3.0, 'reg_alpha': 1e-10, 'reg_lambda': 0.5948168429421155, 'colsample_bytree': 1.0}, 'Best 
Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 6.30703808576676, 'max_leaves': 4.615126183980338, 'min_child_weight': 5.419442970309873, 'learning_rate': 0.45611181052279925, 'subsample': 1.0, 'log_max_bin': 3.0, 'reg_alpha': 1e-10, 'reg_lambda': 0.5948168429421155, 'colsample_bytree': 1.0}}\n{'Current Learner': 'lgbm', 'Current Sample': 364083, 'Current Hyper-parameters': {'n_estimators': 13.346655408225933, 'max_leaves': 7.128882408907543, 'min_child_weight': 3.5378687932000563, 'learning_rate': 0.27022645132691947, 'subsample': 1.0, 'log_max_bin': 3.9062497595361734, 'reg_alpha': 4.798429666191569e-10, 'reg_lambda': 0.31076883570242425, 'colsample_bytree': 1.0}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 13.346655408225933, 'max_leaves': 7.128882408907543, 'min_child_weight': 3.5378687932000563, 'learning_rate': 0.27022645132691947, 'subsample': 1.0, 'log_max_bin': 3.9062497595361734, 'reg_alpha': 4.798429666191569e-10, 'reg_lambda': 0.31076883570242425, 'colsample_bytree': 1.0}}\n{'Current Learner': 'lgbm', 'Current Sample': 364083, 'Current Hyper-parameters': {'n_estimators': 9.168255249166949, 'max_leaves': 16.406314436487644, 'min_child_weight': 1.2440119163470513, 'learning_rate': 0.34085789038743874, 'subsample': 0.8622669492242545, 'log_max_bin': 3.9088586623653176, 'reg_alpha': 6.716698258358434e-10, 'reg_lambda': 0.08971222222676836, 'colsample_bytree': 0.7}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 13.346655408225933, 'max_leaves': 7.128882408907543, 'min_child_weight': 3.5378687932000563, 'learning_rate': 0.27022645132691947, 'subsample': 1.0, 'log_max_bin': 3.9062497595361734, 'reg_alpha': 4.798429666191569e-10, 'reg_lambda': 0.31076883570242425, 'colsample_bytree': 1.0}}\n{'Current Learner': 'lgbm', 'Current Sample': 364083, 'Current Hyper-parameters': {'n_estimators': 19.429346778070144, 'max_leaves': 4.0, 'min_child_weight': 10.061411336518901, 'learning_rate': 0.21423102429501803, 
'subsample': 1.0, 'log_max_bin': 3.903642597975916, 'reg_alpha': 3.428012749081665e-10, 'reg_lambda': 1.0, 'colsample_bytree': 1.0}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 13.346655408225933, 'max_leaves': 7.128882408907543, 'min_child_weight': 3.5378687932000563, 'learning_rate': 0.27022645132691947, 'subsample': 1.0, 'log_max_bin': 3.9062497595361734, 'reg_alpha': 4.798429666191569e-10, 'reg_lambda': 0.31076883570242425, 'colsample_bytree': 1.0}}\n{'Current Learner': 'xgboost', 'Current Sample': 364083, 'Current Hyper-parameters': {'n_estimators': 4, 'max_leaves': 4, 'min_child_weight': 20.0, 'learning_rate': 0.1, 'subsample': 1.0, 'reg_alpha': 1e-10, 'reg_lambda': 1.0, 'colsample_bylevel': 1.0, 'colsample_bytree': 1.0}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 13.346655408225933, 'max_leaves': 7.128882408907543, 'min_child_weight': 3.5378687932000563, 'learning_rate': 0.27022645132691947, 'subsample': 1.0, 'log_max_bin': 3.9062497595361734, 'reg_alpha': 4.798429666191569e-10, 'reg_lambda': 0.31076883570242425, 'colsample_bytree': 1.0}}\n{'Current Learner': 'lgbm', 'Current Sample': 364083, 'Current Hyper-parameters': {'n_estimators': 5.008309383948613, 'max_leaves': 9.693976070518184, 'min_child_weight': 2.0342098563400848, 'learning_rate': 0.8024873058142261, 'subsample': 0.6512672999141046, 'log_max_bin': 4.485581916675402, 'reg_alpha': 4.235615166719706e-10, 'reg_lambda': 0.1209714816813433, 'colsample_bytree': 0.7}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 13.346655408225933, 'max_leaves': 7.128882408907543, 'min_child_weight': 3.5378687932000563, 'learning_rate': 0.27022645132691947, 'subsample': 1.0, 'log_max_bin': 3.9062497595361734, 'reg_alpha': 4.798429666191569e-10, 'reg_lambda': 0.31076883570242425, 'colsample_bytree': 1.0}}\n{'Current Learner': 'lgbm', 'Current Sample': 364083, 'Current Hyper-parameters': {'n_estimators': 35.567533259194164, 'max_leaves': 5.242530415831202, 
'min_child_weight': 6.15301098797069, 'learning_rate': 0.09099500324512855, 'subsample': 1.0, 'log_max_bin': 3.4017408370474773, 'reg_alpha': 5.436029090248796e-10, 'reg_lambda': 0.7983474113199597, 'colsample_bytree': 1.0}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 13.346655408225933, 'max_leaves': 7.128882408907543, 'min_child_weight': 3.5378687932000563, 'learning_rate': 0.27022645132691947, 'subsample': 1.0, 'log_max_bin': 3.9062497595361734, 'reg_alpha': 4.798429666191569e-10, 'reg_lambda': 0.31076883570242425, 'colsample_bytree': 1.0}}\n{'Current Learner': 'xgboost', 'Current Sample': 364083, 'Current Hyper-parameters': {'n_estimators': 4.0, 'max_leaves': 7.946393543064438, 'min_child_weight': 20.0, 'learning_rate': 0.06505010684115302, 'subsample': 0.6, 'reg_alpha': 2.879414788721035e-10, 'reg_lambda': 0.9747843231355767, 'colsample_bylevel': 1.0, 'colsample_bytree': 1.0}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 13.346655408225933, 'max_leaves': 7.128882408907543, 'min_child_weight': 3.5378687932000563, 'learning_rate': 0.27022645132691947, 'subsample': 1.0, 'log_max_bin': 3.9062497595361734, 'reg_alpha': 4.798429666191569e-10, 'reg_lambda': 0.31076883570242425, 'colsample_bytree': 1.0}}\n{'Current Learner': 'lgbm', 'Current Sample': 364083, 'Current Hyper-parameters': {'n_estimators': 26.66408504703253, 'max_leaves': 31.775028586333367, 'min_child_weight': 2.101032324057992, 'learning_rate': 0.3750121217006764, 'subsample': 0.8711935510039006, 'log_max_bin': 3.1949625175875354, 'reg_alpha': 5.201043116468452e-10, 'reg_lambda': 0.6849921466924215, 'colsample_bytree': 1.0}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 26.66408504703253, 'max_leaves': 31.775028586333367, 'min_child_weight': 2.101032324057992, 'learning_rate': 0.3750121217006764, 'subsample': 0.8711935510039006, 'log_max_bin': 3.1949625175875354, 'reg_alpha': 5.201043116468452e-10, 'reg_lambda': 0.6849921466924215, 
'colsample_bytree': 1.0}}\n{'Current Learner': 'rf', 'Current Sample': 364083, 'Current Hyper-parameters': {'n_estimators': 4, 'criterion': 1, 'max_features': 1.0}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 26.66408504703253, 'max_leaves': 31.775028586333367, 'min_child_weight': 2.101032324057992, 'learning_rate': 0.3750121217006764, 'subsample': 0.8711935510039006, 'log_max_bin': 3.1949625175875354, 'reg_alpha': 5.201043116468452e-10, 'reg_lambda': 0.6849921466924215, 'colsample_bytree': 1.0}}\n{'Current Learner': 'xgboost', 'Current Sample': 364083, 'Current Hyper-parameters': {'n_estimators': 5.908417243275515, 'max_leaves': 5.7730103299390825, 'min_child_weight': 20.0, 'learning_rate': 0.18663315779626963, 'subsample': 1.0, 'reg_alpha': 1.2498021235418823e-10, 'reg_lambda': 1.0, 'colsample_bylevel': 0.6, 'colsample_bytree': 1.0}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 26.66408504703253, 'max_leaves': 31.775028586333367, 'min_child_weight': 2.101032324057992, 'learning_rate': 0.3750121217006764, 'subsample': 0.8711935510039006, 'log_max_bin': 3.1949625175875354, 'reg_alpha': 5.201043116468452e-10, 'reg_lambda': 0.6849921466924215, 'colsample_bytree': 1.0}}\n{'Current Learner': 'xgboost', 'Current Sample': 364083, 'Current Hyper-parameters': {'n_estimators': 4.0, 'max_leaves': 10.937997116302148, 'min_child_weight': 9.37137175953572, 'learning_rate': 0.022672907912025912, 'subsample': 0.6, 'reg_alpha': 6.633873770360544e-10, 'reg_lambda': 0.7590775750138271, 'colsample_bylevel': 1.0, 'colsample_bytree': 0.7}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 26.66408504703253, 'max_leaves': 31.775028586333367, 'min_child_weight': 2.101032324057992, 'learning_rate': 0.3750121217006764, 'subsample': 0.8711935510039006, 'log_max_bin': 3.1949625175875354, 'reg_alpha': 5.201043116468452e-10, 'reg_lambda': 0.6849921466924215, 'colsample_bytree': 1.0}}\n{'Current Learner': 'lgbm', 'Current Sample': 
364083, 'Current Hyper-parameters': {'n_estimators': 23.158867908507276, 'max_leaves': 22.03591314927381, 'min_child_weight': 3.8955300015298784, 'learning_rate': 0.24051679800463044, 'subsample': 0.6, 'log_max_bin': 3.0, 'reg_alpha': 3.0792188923459856e-10, 'reg_lambda': 0.9016340605593407, 'colsample_bytree': 1.0}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 26.66408504703253, 'max_leaves': 31.775028586333367, 'min_child_weight': 2.101032324057992, 'learning_rate': 0.3750121217006764, 'subsample': 0.8711935510039006, 'log_max_bin': 3.1949625175875354, 'reg_alpha': 5.201043116468452e-10, 'reg_lambda': 0.6849921466924215, 'colsample_bytree': 1.0}}\n{'Current Learner': 'lgbm', 'Current Sample': 364083, 'Current Hyper-parameters': {'n_estimators': 30.699835337556014, 'max_leaves': 45.81849795934486, 'min_child_weight': 1.1331800358366897, 'learning_rate': 0.5847162966959815, 'subsample': 1.0, 'log_max_bin': 7.2980887397386915, 'reg_alpha': 8.784971268721483e-10, 'reg_lambda': 0.5204042987675161, 'colsample_bytree': 0.7}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 30.699835337556014, 'max_leaves': 45.81849795934486, 'min_child_weight': 1.1331800358366897, 'learning_rate': 0.5847162966959815, 'subsample': 1.0, 'log_max_bin': 7.2980887397386915, 'reg_alpha': 8.784971268721483e-10, 'reg_lambda': 0.5204042987675161, 'colsample_bytree': 0.7}}\n{'Current Learner': 'lgbm', 'Current Sample': 364083, 'Current Hyper-parameters': {'n_estimators': 76.23660313632638, 'max_leaves': 66.93360726547702, 'min_child_weight': 1.1930700595990091, 'learning_rate': 0.5482637744255212, 'subsample': 0.6, 'log_max_bin': 10.0, 'reg_alpha': 3.668052110134859e-10, 'reg_lambda': 0.49371485228257217, 'colsample_bytree': 1.0}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 76.23660313632638, 'max_leaves': 66.93360726547702, 'min_child_weight': 1.1930700595990091, 'learning_rate': 0.5482637744255212, 'subsample': 0.6, 'log_max_bin': 
10.0, 'reg_alpha': 3.668052110134859e-10, 'reg_lambda': 0.49371485228257217, 'colsample_bytree': 1.0}}\n" + "{'Current Learner': 'lgbm', 'Current Sample': 364083, 'Current Hyper-parameters': {'n_estimators': 4, 'max_leaves': 4, 'min_child_weight': 20.0, 'learning_rate': 0.1, 'subsample': 1.0, 'log_max_bin': 8, 'colsample_bytree': 1.0, 'reg_alpha': 1e-10, 'reg_lambda': 1.0}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 4, 'max_leaves': 4, 'min_child_weight': 20.0, 'learning_rate': 0.1, 'subsample': 1.0, 'log_max_bin': 8, 'colsample_bytree': 1.0, 'reg_alpha': 1e-10, 'reg_lambda': 1.0}}\n{'Current Learner': 'lgbm', 'Current Sample': 364083, 'Current Hyper-parameters': {'n_estimators': 4.0, 'max_leaves': 4.0, 'min_child_weight': 20.0, 'learning_rate': 0.46335414315327306, 'subsample': 0.9339389930838808, 'log_max_bin': 10.0, 'colsample_bytree': 0.9904286645657556, 'reg_alpha': 2.841147337412889e-10, 'reg_lambda': 0.12000833497054482}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 4.0, 'max_leaves': 4.0, 'min_child_weight': 20.0, 'learning_rate': 0.46335414315327306, 'subsample': 0.9339389930838808, 'log_max_bin': 10.0, 'colsample_bytree': 0.9904286645657556, 'reg_alpha': 2.841147337412889e-10, 'reg_lambda': 0.12000833497054482}}\n{'Current Learner': 'lgbm', 'Current Sample': 364083, 'Current Hyper-parameters': {'n_estimators': 23.0, 'max_leaves': 4.0, 'min_child_weight': 20.0, 'learning_rate': 1.0, 'subsample': 0.9917683183663918, 'log_max_bin': 10.0, 'colsample_bytree': 0.9858892907525497, 'reg_alpha': 3.8783982645515837e-10, 'reg_lambda': 0.36607431863072826}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 23.0, 'max_leaves': 4.0, 'min_child_weight': 20.0, 'learning_rate': 1.0, 'subsample': 0.9917683183663918, 'log_max_bin': 10.0, 'colsample_bytree': 0.9858892907525497, 'reg_alpha': 3.8783982645515837e-10, 'reg_lambda': 0.36607431863072826}}\n{'Current Learner': 'lgbm', 'Current Sample': 364083, 'Current 
Hyper-parameters': {'n_estimators': 11.0, 'max_leaves': 17.0, 'min_child_weight': 14.947587304572773, 'learning_rate': 0.6092558236172073, 'subsample': 0.9659256891661986, 'log_max_bin': 10.0, 'colsample_bytree': 1.0, 'reg_alpha': 3.816590663384559e-08, 'reg_lambda': 0.4482946615262561}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 11.0, 'max_leaves': 17.0, 'min_child_weight': 14.947587304572773, 'learning_rate': 0.6092558236172073, 'subsample': 0.9659256891661986, 'log_max_bin': 10.0, 'colsample_bytree': 1.0, 'reg_alpha': 3.816590663384559e-08, 'reg_lambda': 0.4482946615262561}}\n{'Current Learner': 'lgbm', 'Current Sample': 364083, 'Current Hyper-parameters': {'n_estimators': 7.0, 'max_leaves': 51.0, 'min_child_weight': 20.0, 'learning_rate': 0.8834537640176922, 'subsample': 1.0, 'log_max_bin': 10.0, 'colsample_bytree': 0.9837052481490312, 'reg_alpha': 4.482246955743696e-08, 'reg_lambda': 0.028657570201141073}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 7.0, 'max_leaves': 51.0, 'min_child_weight': 20.0, 'learning_rate': 0.8834537640176922, 'subsample': 1.0, 'log_max_bin': 10.0, 'colsample_bytree': 0.9837052481490312, 'reg_alpha': 4.482246955743696e-08, 'reg_lambda': 0.028657570201141073}}\n{'Current Learner': 'lgbm', 'Current Sample': 364083, 'Current Hyper-parameters': {'n_estimators': 15.0, 'max_leaves': 165.0, 'min_child_weight': 11.09973081317571, 'learning_rate': 1.0, 'subsample': 0.9847553005974036, 'log_max_bin': 9.0, 'colsample_bytree': 0.9508927355861483, 'reg_alpha': 2.031936014930936e-06, 'reg_lambda': 0.00624701632609755}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 15.0, 'max_leaves': 165.0, 'min_child_weight': 11.09973081317571, 'learning_rate': 1.0, 'subsample': 0.9847553005974036, 'log_max_bin': 9.0, 'colsample_bytree': 0.9508927355861483, 'reg_alpha': 2.031936014930936e-06, 'reg_lambda': 0.00624701632609755}}\n{'Current Learner': 'lgbm', 'Current Sample': 364083, 'Current 
Hyper-parameters': {'n_estimators': 6.0, 'max_leaves': 1073.0, 'min_child_weight': 5.630999649172112, 'learning_rate': 0.32864729892819683, 'subsample': 1.0, 'log_max_bin': 10.0, 'colsample_bytree': 0.99236562733598, 'reg_alpha': 1.978160373587824e-09, 'reg_lambda': 1.0}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 6.0, 'max_leaves': 1073.0, 'min_child_weight': 5.630999649172112, 'learning_rate': 0.32864729892819683, 'subsample': 1.0, 'log_max_bin': 10.0, 'colsample_bytree': 0.99236562733598, 'reg_alpha': 1.978160373587824e-09, 'reg_lambda': 1.0}}\n{'Current Learner': 'lgbm', 'Current Sample': 364083, 'Current Hyper-parameters': {'n_estimators': 18.0, 'max_leaves': 1846.0, 'min_child_weight': 7.427173668000723, 'learning_rate': 0.1564464373197609, 'subsample': 1.0, 'log_max_bin': 9.0, 'colsample_bytree': 0.9997863921359742, 'reg_alpha': 6.349231150788211e-09, 'reg_lambda': 0.8927146483558472}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 18.0, 'max_leaves': 1846.0, 'min_child_weight': 7.427173668000723, 'learning_rate': 0.1564464373197609, 'subsample': 1.0, 'log_max_bin': 9.0, 'colsample_bytree': 0.9997863921359742, 'reg_alpha': 6.349231150788211e-09, 'reg_lambda': 0.8927146483558472}}\n" ] } ], @@ -370,8 +385,8 @@ "output_type": "display_data", "data": { "text/plain": "
", - "image/svg+xml": "\r\n\r\n\r\n\r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n\r\n", - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYgAAAEWCAYAAAB8LwAVAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8GearUAAAgAElEQVR4nO3deXxW9Zn38c+XECQoGBdAFhWogOIGJVVb64ZVcLdWuzjddKaOM1PHeWydaqeddqbjjDNM7fLU1sc6HWtb2yog0kpBW7eOopKIsog4iIgJIKCCiAFCcj1/nBN7E+4kN5CTO7nzfb9evJJzzu/c5zptzJXfrojAzMyspV7FDsDMzLomJwgzM8vLCcLMzPJygjAzs7ycIMzMLC8nCDMzy8sJwmwPSDpF0rJix2GWJScI63YkrZT0kWLGEBF/jIixWX2+pMmSHpe0WdJ6SY9JujCr55nl4wRhloeksiI++1LgXuAuYDgwGPhH4II9+CxJ8n/ntkf8g2MlQ1IvSTdIelnSG5LukXRgzvV7Ja2VtCn96/zonGt3SvqRpNmStgBnpDWVL0tamN7za0l90/KnS6rNub/Vsun1v5e0RtJqSX8hKSQdkecdBNwCfCsi7oiITRHRFBGPRcQX0jLflPTznHtGpJ/XOz1+VNJNkp4A3gW+Kqm6xXP+j6RZ6ff7SPpPSaskvS7pNkkVe/l/h5UAJwgrJX8LXAycBgwF3gJuzbn+O2A0MAh4FvhFi/svB24C+gP/k577ODAFGAkcB3y+jefnLStpCnAd8BHgiDS+1owFDgWmtVGmEJ8BriJ5l/8LjJU0Ouf65cDd6ff/DowBxqfxDSOpsVgP5wRhpeQvgX+IiNqI2AZ8E7i0+S/riPhJRGzOuXa8pP1z7r8/Ip5I/2Lfmp77fkSsjog3gd+Q/BJtTWtlPw78d0QsiYh3gX9q4zMOSr+uKfit87szfd6OiNgE3A98CiBNFEcCs9IayxeA/xMRb0bEZuBfgU/u5fOtBDhBWCk5HLhP0kZJG4GlQCMwWFKZpJvT5qe3gZXpPQfn3P9ans9cm/P9u8B+bTy/tbJDW3x2vuc0eyP9OqSNMoVo+Yy7SRMESe1hZpqsBgL9gJqc/93mpOeth3OCsFLyGnBORFTm/OsbEXUkvxQvImnm2R8Ykd6jnPuzWtp4DUlnc7ND2yi7jOQ9PtZGmS0kv9SbHZKnTMt3eRA4WNJ4kkTR3Ly0AagHjs7532z/iGgrEVoP4QRh3VW5pL45/3oDtwE3STocQNJASRel5fsD20j+Qu9H0ozSWe4BrpB0lKR+tNG+H8n6+9cBX5d0haQBaef7hyXdnhZ7DjhV0mFpE9mN7QUQETtI+jWmAgcCD6Xnm4AfA9+RNAhA0jBJk/f4ba1kOEFYdzWb5C/f5n/fBL4HzAIelLQZeAo4MS1/F/AqUAe8kF7rFBHxO+D7wCPAcmBeemlbK+WnAZ8ArgRWA68D/0LSj0BEPAT8GlgI1AC/LTCUu0lqUPemCaPZV9K4nkqb335P0lluPZy8YZBZ55J0FLAY2KfFL2qzLsU1CLNOIOmjkvpIOoBkWOlvnBysq3OCMOscfwmsB14mGVn1V8UNx6x9bmIyM7O8XIMwM7O8ehc7gI508MEHx4gRI4odhplZt1FTU7MhIvJOjCypBDFixAiqq6vbL2hmZgBIerW1a25iMjOzvJwgzMwsLycIMzPLywnCzMzycoIwM7O8SmoUk5lZTzJzQR1T5y5j9cZ6hlZWcP3ksVw8YViHfb4ThJlZNzRzQR03zlhEfUMjAHUb67lxxiKADksSbmIyM+uGps5d9l5yaFbf0MjUucs67BmuQZiZdSNbGxp5dNk66jbW572+upXze8IJwsysi9va0MhjL63ngYVr+P3S13l3eyO9BE151lodWlnRYc91gjAz64K27Wjk8Zc28MDC1fx+6Tre2baDA/qVc9H4YZx/3BDWbtrK12Y
u3qmZqaK8jOsnd9xmgE4QZmZdxPYdTfzP8vX8duEaHlryOpu37aCyXznnHzeE844bwkmjDqK87E9dx2W91H1HMUmaQrJPcBlwR0TcnKfM6cB3gXJgQ0Sclp6vBO4AjgECuDIi5rW838ysO9u+o4knXt7AAwvX8OCStby9dQcD+vZmyjGHcN5xQzj5iIN3Sgq5Lp4wrEMTQkuZJQhJZcCtwFlALTBf0qyIeCGnTCXwQ2BKRKySNCjnI74HzImISyX1AfplFauZWWdqaGziyZff4IGFq5m75HU21TfQv29vzh53COenSaFP7+IPMs2yBnECsDwiVgBI+hVwEfBCTpnLgRkRsQogItalZQcApwKfT89vB7ZnGKuZWaZ2NDYxb8UbPLBwDXOWrGXjuw3st09vzh43mPOOG8KHRx/MPr3Lih3mTrJMEMOA13KOa4ETW5QZA5RLehToD3wvIu4CRpHs3/vfko4HaoBrI2JLy4dIugq4CuCwww7r6HcwM9tjOxqbeOaVN/ntojXMWbyWN7dsZ98+ZXxk3GDOO3YIp44ZSN/yrpUUcmWZIJTnXMtBWb2BicCZQAUwT9JT6fn3A9dExNOSvgfcAHx9lw+MuB24HaCqqsobbJtZp2q53MWXzhrDkMoKHli0mjmL17Lhne3061PGmUclSeH0sV07KeTKMkHUAofmHA8HVucpsyGtGWyR9DhwPPBHoDYink7LTSNJEGZmXUa+5S6uu/d5IBlyOumoQZx/7BBOHzuIij7dIynkyjJBzAdGSxoJ1AGfJOlzyHU/8ANJvYE+JE1Q34mItZJekzQ2IpaR1DBewMysC8m33AXAAf3KeeKGSfTr071nEmQWfUTskPRFYC7JMNefRMQSSVen12+LiKWS5gALgSaSobCL04+4BvhFOoJpBXBFVrGame2J1pa12PhuQ7dPDpDxPIiImA3MbnHuthbHU4Gpee59DqjKMj4zs70xtLIi75pIHbncRTEVf6CtmVk3dcXJI3Y519HLXRSTE4SZ2R56e+sOJDhkQF8EDKus4N8uOTbT2c2dqfs3kpmZFUFTUzC9ppZTRg/kritPKHY4mXANwsxsDzz1yhvUbazn0onDix1KZpwgzMz2wLTq2nT9pMHFDiUzThBmZrtp89YGZi9ewwXHD+02s6L3hBOEmdlu+t2itWxtaCrp5iVwgjAz223TamoZNXBfJhxaWexQMuUEYWa2G1Zu2MIzK9/k0onDkfKtSVo6nCDMzHbDjGdr6SW4ZEJpNy+BE4SZWcGamoLpz9bx4dEDOWT/vsUOJ3NOEGZmBXpqRenPfcjlBGFmVqBpNaU/9yGXE4SZWQGa5z5cWOJzH3I5QZiZFWD2ojU9Yu5DLicIM7MCTKup5X0D92V8ic99yOUEYWbWjpUbtjB/5VtcOvHQkp/7kMsJwsysHdPTuQ8fLZF9HgrlBGFm1obcfR96wtyHXE4QZmZtmLfiDVZv2tqjOqebOUGYmbWhee7DWT1k7kMuJwgzs1Zs3trA73rY3IdcThBmZq14YGEy9+GyqkOLHUpROEGYmbViWk0tRwzaj+OH71/sUIrCCcLMLI9XNmyh+tW3esS+D61xgjAzy2N6Tc+c+5DLCcLMrIXGpmD6s7WcOmYggwf0rLkPuZwgzMxamPfyG6zpoXMfcjlBmJm1MK3mNQb07c1Hjup5cx9yZZogJE2RtEzSckk3tFLmdEnPSVoi6bGc8yslLUqvVWcZp5lZs7e3NjBnyVouHN8z5z7k6p3VB0sqA24FzgJqgfmSZkXECzllKoEfAlMiYpWkQS0+5oyI2JBVjGZmLc1e2LzvQ8+c+5AryxrECcDyiFgREduBXwEXtShzOTAjIlYBRMS6DOMxM2vXvT187kOuLBPEMOC1nOPa9FyuMcABkh6VVCPpsznXAngwPX9VhnGamQGwYv071Lz6Fpf14LkPuTJrYgLy/a8beZ4/ETgTqADmSXoqIl4CTo6I1Wmz00OSXoyIx3d
5SJI8rgI47LDDOvQFzKxn6an7PrQmyxpELZDbiDccWJ2nzJyI2JL2NTwOHA8QEavTr+uA+0iarHYREbdHRFVEVA0cOLCDX8HMeorGpmDGs3WcNmYgg3rw3IdcWSaI+cBoSSMl9QE+CcxqUeZ+4BRJvSX1A04ElkraV1J/AEn7AmcDizOM1cx6uCdf3pDOfXDndLPMmpgiYoekLwJzgTLgJxGxRNLV6fXbImKppDnAQqAJuCMiFksaBdyXtgH2Bu6OiDlZxWpmNq2mlv0ryjnzqJaDKXuuLPsgiIjZwOwW525rcTwVmNri3ArSpiYzs6y9vbWBOYvX8vGqQ3v83Idc7TYxSTqwMwIxMyuWBxauYduOph6/tEZLhfRBPC3pXknnyuO+zKwETaupZfSg/TjOcx92UkiCGAPcDnwGWC7pXyWNyTYsM7PO8XI696En7/vQmnYTRCQeiohPAX8BfA54RtJjkj6YeYRmZhnyvg+ta7eTWtJBwKdJahCvA9eQDFcdD9wLjMwyQDOzrHjuQ9sKaWKaBwwALo6I8yJiRkTsiIhq4LZ27jUz67KeWL6BtW9v5bIqz33Ip5BhrmMjouUSGQBExL93cDxmZp3Gcx/aVkgN4sF0WW4AJB0gaW6GMZmZZW5TfQNzl6zlovFD2ae35z7kU0iCGBgRG5sPIuItwOnWzLo1z31oXyFNTI2SDmves0HS4ey6KquZdbCZC+qYOncZqzfWM7Sygusnj+Vij7TpMNNqXmPM4P04dpjnPrSmkATxD8D/5GwHeirp8tpmlo2ZC+q4ccYi6hsaAajbWM9Xpi1k1RvvctpYr1q8tza8s41nV23kq+ce6bkPbVAr/c87F5IOBk4i2eNhXlfdBrSqqiqqq719tXV/J9/8MHUb64sdRkkrLxNPfGVSjx/eKqkmIqryXSt0sb5GYB3QFxgniXyb95jZ3lv1xrttJoeffD7vf8u2mw4ZUNHjk0N7Cpko9xfAtSQb/jxHUpOYB0zKNjSznmVTfQM/fGQ5//3ESkT+jr5hlRVMOnJwZ4dmPVQho5iuBT4AvBoRZwATgPWZRmXWg+xobOJn81Zyxn8+yu1/XMGF44fyjQvGUdFi2emK8jKunzy2OEFaj1RIE9PWiNgqCUn7RMSLkvxTaraXIoJHl63nptlLWb7uHU4ceSBfP38cx6Sjair79fEoJiuqQhJEbTpRbibwkKS32HVvaTPbDUvXvM2/zl7KH/93AyMP3pfbPzORs8YN3mlEzcUThjkhWFG1myAi4qPpt9+U9AiwP+DtP832wLrNW7nlwZe4p/o1+vct5x/PH8enTzqcPr2z3B7ebM+0mSAk9QIWRsQxABHxWFvlzSy/rQ2N3PHHFfzo0ZfZtqOJz39oJH975hFU9utT7NDMWtVmgoiIJknP586kNrPCNTUFs55fzX/MeZHVm7Zy9rjB3HDOkYwauF+xQzNrVyF9EEOAJZKeAbY0n4yICzOLyqwEzF/5Jv/y2xd4vnYTRw8dwLc/Pp4Pvu+gYofV43kJk8IVkiD+KfMozErIqjfe5eY5S5m9aC2DB+zDf152PJdMGEavXl7SodjyLWFy44xFAE4SeRTSSe1+B7MCbKpv4NZHlnPnEysp6yX+7iOjuerUUfTrU+iCBZa1qXOXvZccmtU3NDJ17jIniDwKmUm9mT9N6uwDlANbImJAloGZdRcNjU388plVfOehl9hY38Cl7x/OlyePZbCXcehyVreyhElr53u6QmoQ/XOPJV0MnJBZRGbdRETwyLJ13PTAUl5ev4WTRh3I187700Q363qGVlbkXedqaGVFEaLp+nZ78HVEzMTrMFkPt3TN23zmv57hyjuraQr48Wer+OUXTnJy6OKunzzWS5jshkKamC7JOewFVOENg6yHap7o9uvq1xjQt5xvXDCOPzvRE926i+Z+Bo9iKkwhvWcX5Hy/A1gJXJRJNGZdVP32dKLbYy/T0NjElSeP5JpJnujWHXkJk8IV0gd
xRWcEYtYVNTUF9z9fx3/MWcaaTVuZfPRgbjjnKEYevG+xQzPLXCFNTD8Fro2IjenxAcC3I+LKrIMzK6ZnXnmTf3ngBRbWbuLYYfvz3U+M58RRnuhmPUchTUzHNScHgIh4S9KEQj5c0hTge0AZcEdE3JynzOnAd0mGz26IiNNyrpUB1UBdRJxfyDPN9tarb2zh5t+9yO8Wr+WQAX255ePHc/F4T3SznqeQBNFL0gER8RaApAMLuS/95X4rcBZQC8yXNCsiXsgpUwn8EJgSEaskDWrxMdcCSwHPubDMbapv4AcP/y93PrmS3r16cd1ZY/jCKaOo6FPW/s1mJaiQBPFt4ElJ00hGL30cuKmA+04AlkfECgBJvyLp3H4hp8zlwIzmhQAjYl3zBUnDgfPSZ11XwPPM9khDYxN3P72K7/4+meh22cThfOlsT3QzK6ST+i5J1SRzHwRcklsLaMMw4LWc41rgxBZlxgDlkh4F+gPfi4i70mvfBf4+Pd8qSVcBVwEcdthhBYRllogIHn5xHTfNXsqK9Vv44KiD+Nr5R3H0UM9lMIPCmopOApZExA/S4/6SToyIp9u7Nc+5lvMnegMTgTOBCmCepKdIEse6iKhJ+yhaFRG3A7cDVFVVeX6GFeSF1W9z0+wXeGL5G4w6eF/u+GwVZx41aKcd3cx6ukKamH4EvD/neEuec/nUAofmHA9n161Ka0k6prcAWyQ9DhyffvaFks4F+gIDJP08Ij5dQLwlzUsV7511b2/l2w++xD01r7F/RTnfvGAcf3bS4ZSXeaKbWUuFJAhFxHt/maebCBVy33xgtKSRQB3wSZI+h1z3Az9IP68PSRPUdyLiXuBGeG+U05edHLxU8d5oOdHtz08eyTWTRrN/v/Jih2bWZRXyi36FpL8lqTUA/DWwor2bImKHpC8Cc0mGuf4kIpZIujq9fltELJU0B1gINJEMhV28Jy/SE7S2VPH1057nzidXFieobqJuYz3rN29jytGHcMM5RzLCE93M2qWcykH+AsnQ0++TdFIH8AeSiXPrsw9v91RVVUV1dXWxw8jMyBseaHURrFPHDOzUWLqbfuVlXHHyCE90M2tBUk1EVOW7VsgopnUkzUPNH1YBnA/c22ERWkFaW6p4WGUFd13pFdjNrGMV1DMnqUzSOZLuAl4BPpFtWJbPX546apdzXqrYzLLSZg1C0qkkHcvnAc8AJwOjIuLdTojNWni+dhNlvcRB+/Zh/eZt7Y5i8ognM9sbrSYISbXAKpLO6esjYrOkV5wciqPm1beY/mwtV5/2Pm4458h2y3vEk5ntrbaamKaTzIb+BHCBpH3xRkFF0dgUfHPWEgYP2IdrJh1R0D1tbc5uZlaIVhNERFwLjABuAc4AXgIGSvq4pP06JzwDuKf6NRbVbeKr5x7FvvsUMjJ59zZnn7mgjpNvfpiRNzzAyTc/zMwFdXsVr5mVhjY7qSPxcER8gSRZXA5cTLKrnHWCTe82MHXuMk4YcSAXHj+04Pta24S95fnmpqi6jfUEf2qKcpIws4LXF4iIhoj4TURczs5LaFiGbnloGRvf3c43Lzx6t9YJKnRzdjdFmVlrCmuvaCEi8rdfWIdauuZtfvbUq/zZiYczbujubYlR6Obsu9MUZWY9yx4lCMteRNIxvX9FOV86e8wefUYhm7O3NvmutSYqM+s5vIRlF/XbhWt4+pU3+fLksVT265PZcwptijKznqeQ/SDGANcDh+eWj4hJGcbVo727fQf/OnspxwwbwCc/kO0mSIU2RWXBE/nMurZCmpjuBW4Dfgw0tlPWOsCtjyxnzaat/ODyCZT1yn4Dm0KaojqaJ/KZdX2FJIgdEfGj9otZR1i5YQs/fvwVLpkwjImHH1jscDLT1ugpJwizrqGQBPEbSX8N3Adsaz4ZEW9mFlUP9q3fvkCf3r0KWk6jO+us0VNuxjLbc4UkiM+lX6/PORfArkuL2l555MV1/OHFdXz13CMZNKBvscPJVGeMnnI
zltneaXcUU0SMzPPPyaGDbdvRyD/9ZgmjBu7L5z80stjhZK4zRk95EmDp8zIx2SpkFFM58FfAqempR4H/FxENGcbVYzQ3gTT/NX31aaPo07v0Rx93xugpTwIsba4hZq+QJqYfAeXAD9Pjz6Tn/iKroHqKlj/gAD998lWOPGRAj/gBz3r0lCcBljYPdMheIX+qfiAiPpcu2vdwRFwBfCDrwHoCN4Fky5MAS5triNkrJEE0Snpf84GkUXg+RIfwD3i2Lp4wjH+75FiGVVYgkr27/+2SY/3XZYkodMVi23OFNDFdDzwiaQUgkhnVV2QaVQ/hJpDsFWMSoHWO6yeP3aWJ1jXEjtVugoiIP0gaDYwlSRAvRsS2dm6zAnzp7DF86Z7nd9qmzz/g7fPcBoPiLhPTU7S1J/WkiHhY0iUtLr1PEhExI+PYSt7QygoCOKBfORvfbfAPeAE8csVyuYaYrbZqEKcBDwMX5LkWgBPEXppeU8t++/TmyRvOpKJPWfs3mEeumHWiVhNERHwj/fafI+KV3GuSSn8mV8be3b6D2YvWcN5xQ5wcdoM79s06TyGjmKbnOTetowPpaeYuWcuW7Y187P3Dix1Kt+KRK2adp60+iCOBo4H9W/RDDABKe6GgTjC9po5DD6zgAyNKd8XWLHjkilnnaasPYixwPlDJzv0Qm4EvZBlUqVu9sZ4nXt7A304aTa9O2O+hlHjkilnnaasP4n7gfkkfjIh5nRhTybtvQR0RuHlpD3nkilnnKGSi3AJJf0PS3PRe01JEXNnejZKmAN8DyoA7IuLmPGVOB75Lst7Thog4TVJf4HFgnzTGaTmd5t1aRDC9ppYTRhzIYQf1K3Y4ZmatKqST+mfAIcBk4DFgOEkzU5sklQG3AucA44BPSRrXokwlySKAF0bE0cBl6aVtwKSIOB4YD0yRdFJBb9TFLXhtIys2bOFjE/0XsJl1bYUkiCMi4uvAloj4KXAecGwB950ALI+IFRGxHfgVcFGLMpcDMyJiFUBErEu/RkS8k5YpT/8FJWB6TS19y3tx7rFDih2KmVmbCkkQzfs+bJR0DLA/MKKA+4YBr+Uc16bnco0BDpD0qKQaSZ9tviCpTNJzwDrgoYh4Ot9DJF0lqVpS9fr16wsIq3i2NjTym+dXM/noQ+jft7zY4ZiZtamQBHG7pAOArwOzgBeA/yjgvnzDc1rWAnoDE0lqJZOBr0saAxARjRExnqRJ64Q0Oe36gRG3R0RVRFQNHDiwgLCK5w9L1/H21h3unDazbqGQxfruSL99jN3bh7oWODTneDiwOk+ZDRGxBdgi6XHgeOClnOdvlPQoMAVYvBvP73KmP1vLIQP6cvIRBxc7FDOzdrU1Ue66tm6MiFva+ez5wOh0WY464JMkfQ657gd+IKk30Ac4EfiOpIFAQ5ocKoCPAP/ezvO6tPWbt/HYS+v5wimjKPPcBzPrBtqqQfRPv44l2UFuVnp8AckQ1DZFxA5JXwTmkgxz/UlELJF0dXr9tohYKmkOsBBoIhkKu1jSccBP05FQvYB7IuK3e/B+Xcb9z9XR2BRc6tFLZtZNKKLtwUGSHgQ+FhGb0+P+wL0RMaUT4tstVVVVUV1dXeww8pry3cfZp3cv7v/ih4sdipnZeyTVRERVvmuFdFIfBmzPOd5OYaOYLLVk9SZeXLuZj01057SZdR+FzKT+GfCMpPtIRiF9FLgr06hKzPSaOvqU9eKC44YWOxQzs4IVMorpJkm/A05JT10REQuyDat0NDQ2cf9zdZx51CAO2LdPscMxMytYW6OYBkTE25IOBFam/5qvHRgRb2YfXvf32LL1vLFlu+c+mFm301YN4m6S5b5r2HmCm9Lj3ZkT0ePMXFDH1LnLqNtYTy/BpvqG9m8yM+tC2lru+/z0q7cX3U0zF9TttKlNU8DXZi6mrJe8TLWZdRttNTG9v60bI+LZjg+nNEydu2ynHc8A6hsamTp3WckmiOYakzfxsY7in6nia6uJ6dt
tXAtgUgfHUjLqNtbnPb+6lfPdXcsaU93Gem6csQjA/0HbHvHPVNfQVhPTGZ0ZSCloagp+/vSr73XStDS0sqKzQ+oUPbHGZNnyz1TXUMg8CNKVVMex845yJTEXoqOqsSvWv8NXpi9k/sq3GDu4Pyvf2MK2HU3vXa8oL+P6yWM7MvQuo7WaUanWmCx7/pnqGtpNEJK+AZxOkiBmk+wQ9z+UwGS5jqjG7mhs4sd/fIXv/P4l+vbuxdRLj+PSicO5/7nVPab9dGhlRd5mtVKtMVn2/DPVNRRSg7iUZAnuBRFxhaTBwB3t3NMttFaN/easJbyzbUe790cE91TXsqhuE5OPHsy3LjqGQQOSStbFE4aVbEJo6frJY3dKtFDaNSbLnn+muoZCEkR9RDRJ2iFpAMkObyUxB6K16urG+ga+NrOwrScO3q8Pt17+fs499hCknrmMd3Mi7Ck1Jsuef6a6hkISRLWkSuDHJJPm3gGeyTSqTtJaNfaQAX2Zdc3JBX3G/hXl7NO7rKND63Z6Uo3JOod/poqvrXkQPwDujoi/Tk/dlu7dMCAiFnZKdBlrrRp7wzlHMqh/3zbuNDMrfW3VIP4X+LakIcCvgV9GxHOdE1bncDXWzKx1hWwYdDjJdqGfJBnm+kvgVxHxUps3FkFX3jDIzKwr2qsNgyLi1Yj494iYQLKn9EeBpR0co5mZdTHtJghJ5ZIukPQL4HfAS8DHMo/MzMyKqq1O6rOATwHnkYxa+hVwVURs6aTYzMysiNrqpP4qyZ4QX/bmQGZmPY8X6zMzs7za7YMwM7OeyQnCzMzycoIwM7O8nCDMzCwvJwgzM8vLCcLMzPJygjAzs7wyTRCSpkhaJmm5pBtaKXO6pOckLZH0WHruUEmPSFqanr82yzjNzGxXhWwYtEcklQG3AmcBtcB8SbMi4oWcMpXAD4EpEbFK0qD00g7gSxHxrKT+QI2kh3LvNTOzbGVZgzgBWB4RKyJiO8laThe1KHM5MCMiVgFExLr065qIeDb9fjPJ6rHepMHMrBNlmSCGAa/lHNey6y/5McABkh6VVCPpsy0/RNIIYALwdL6HSLpKUrWk6vXr13dI4GZmlm2CUJ5zLXcn6g1MJFkxdsb+kOUAAAoxSURBVDLwdUlj3vsAaT9gOvB3EfF2vodExO0RURURVQMHDuyYyM3MLLs+CJIaw6E5x8OB1XnKbEiXEN8i6XHgeOAlSeUkyeEXETEjwzjNzCyPLGsQ84HRkkZK6kOyZemsFmXuB06R1FtSP+BEYKkkAf8FLI2IWzKM0czMWpFZDSIidkj6IjAXKAN+EhFLJF2dXr8tIpZKmgMsBJqAOyJisaQPA58BFkl6Lv3Ir0bE7KziNTOznSmiZbdA91VVVRXV1dXFDsPMrNuQVBMRVfmueSa1mZnl5QRhZmZ5OUGYmVleThBmZpaXE4SZmeXlBGFmZnk5QZiZWV5OEGZmlpcThJmZ5eUEYWZmeTlBmJlZXk4QZmaWlxOEmZnl5QRhZmZ5OUGYmVleThBmZpaXE4SZmeXlBGFmZnk5QZiZWV5OEGZmlpcThJmZ5dW72AF0VzMX1DF17jJWb6xnaGUF108ey8UThhU7LDOzDuMEsQdmLqjjxhmLqG9oBKBuYz03zlgE4CRhZiXDTUx7YOrcZe8lh2b1DY1MnbusSBGZmXU8J4g9sHpj/W6dNzPrjpwg9sDQyordOm9m1h05QeyB6yePpaK8bKdzFeVlXD95bJEiMjPreO6k3gPNHdEexWRmpcwJYg9dPGGYE4JZgTwsvHtygjCzTHlYePeVaR+EpCmSlklaLumGVsqcLuk5SUskPZZz/ieS1klanGWMZpYtDwvvvjJLEJLKgFuBc4BxwKckjWtRphL4IXBhRBwNXJZz+U5gSlbxmVnn8LDw7ivLGsQJwPKIWBER24FfARe1KHM5MCMiVgFExLrmCxHxOPBmhvGZWSfwsPDuK8sEMQx4Lee4Nj2XawxwgKRHJdVI+uz
uPkTSVZKqJVWvX79+L8I1syx4WHj3lWUntfKcizzPnwicCVQA8yQ9FREvFfqQiLgduB2gqqqq5eebWZF5WHj3lWWCqAUOzTkeDqzOU2ZDRGwBtkh6HDgeKDhBmFnX52Hh3VOWTUzzgdGSRkrqA3wSmNWizP3AKZJ6S+oHnAgszTAmMzMrUGYJIiJ2AF8E5pL80r8nIpZIulrS1WmZpcAcYCHwDHBHRCwGkPRLYB4wVlKtpD/PKlYzM9uVIkqn2b6qqiqqq6uLHYaZWbchqSYiqvJd82J9ZmaWlxOEmZnlVVJNTJLWA68WO44OcjCwodhBZMzvWBr8jt3b4RExMN+FkkoQpURSdWvtgqXC71ga/I6ly01MZmaWlxOEmZnl5QTRdd1e7AA6gd+xNPgdS5T7IMzMLC/XIMzMLC8nCDMzy8sJogvIt72qpAMlPSTpf9OvBxQzxr0l6VBJj0hamm4ve216vmTeU1JfSc9Iej59x39Kz5fMO0KyW6SkBZJ+mx6X2vutlLQo3Qq5Oj1XUu9YKCeIruFOdt1e9QbgDxExGvhDetyd7QC+FBFHAScBf5NuQVtK77kNmBQRxwPjgSmSTqK03hHgWnZedbnU3g/gjIgYnzP3oRTfsV1OEF1AK9urXgT8NP3+p8DFnRpUB4uINRHxbPr9ZpJfMMMoofeMxDvpYXn6Lyihd5Q0HDgPuCPndMm8Xxt6wjvuwgmi6xocEWsg+eUKDCpyPB1G0ghgAvA0JfaeafPLc8A64KGIKLV3/C7w90BTzrlSej9IkvqD6TbIV6XnSu0dC5LljnJmu5C0HzAd+LuIeFvKtzNt9xURjcB4SZXAfZKOKXZMHUXS+cC6iKiRdHqx48nQyRGxWtIg4CFJLxY7oGJxDaLrel3SEID067oix7PXJJWTJIdfRMSM9HTJvSdARGwEHiXpWyqVdzwZuFDSSuBXwCRJP6d03g+AiFidfl0H3AecQIm9Y6GcILquWcDn0u8/R7I9a7elpKrwX8DSiLgl51LJvKekgWnNAUkVwEeAFymRd4yIGyNieESMINlC+OGI+DQl8n4AkvaV1L/5e+BsYDEl9I67wzOpu4B0e9XTSZYUfh34BjATuAc4DFgFXBYRLTuyuw1JHwb+CCziT+3XXyXphyiJ95R0HEkHZhnJH1/3RMQ/SzqIEnnHZmkT05cj4vxSej9Jo0hqDZA0wd8dETeV0jvuDicIMzPLy01MZmaWlxOEmZnl5QRhZmZ5OUGYmVleThBmZpaXE4R1G5K+I+nvco7nSroj5/jbkq5r4/47JV2afv+opF02oZdULunmdNXOxenqrOek11ZKOngP4n7vua1cvzVdOfQFSfXp989JulTS7Oa5FR1J0pDm1Vhbud5H0uOSvNpCD+YEYd3Jk8CHACT1Ipk3cnTO9Q8BT+zlM74FDAGOiYhjgAuA/nv5mW2KiL+JiPHAucDL6Sqi4yNiWkScm87K7mjXAT9uI6btJKuWfiKDZ1s34QRh3ckTpAmCJDEsBjZLOkDSPsBRwAJJ/yhpfloDuF0FLvgkqR/wBeCaiNgGEBGvR8Q9ecpel37+4ha1ms9KWpjuCfGzPPd9K61RFPTfXnOtRdIISS9KuiN95i8kfUTSE2lt54S0/L5K9heZr2TPhota+eiPAXPSe45Oa0rPpbGPTsvMBP6skDitNLn6aN1GuoDaDkmHkSSKeSRLhn8Q2AQsjIjtkn4QEf8MkP6SPh/4TQGPOAJYFRFvt1VI0kTgCuBEQMDTkh4DtgP/QLLY2wZJB7a47z+A/YErYs9mqB4BXAZcBcwHLgc+DFxIMiv94vT5D0fElWnT1DOSfh8RW3LiGAm81ZwEgauB70XELyT1IZkJDkkC/sAexGklwjUI626aaxHNCWJezvGTaZkzJD0taREwiZ2boTrCh4H7ImJLuv/DDOCU9FnTImIDQIulGL4OVEbEX+5hcgB4JSIWRUQTsIRkA5sgWb5kRFrmbOAGJUuOPwr
0JVkeItcQYH3O8Tzgq5K+AhweEfVp/I3A9ua1iazncYKw7qa5H+JYkr9wnyKpQXwIeEJSX+CHwKURcSxJO3vfAj97OXBYAb8QW2uyEsleAvnMBya2rFXspm053zflHDfxp9YAAR/L6cc4LCJyd38DqCfnf5OIuJukFlIPzJU0KafsPsDWvYjZujEnCOtuniBpMnozIhrTv9IrSZLEPP70i2+Dkr0nWh091FJEvEuy4uz306aW5tE+n25R9HHgYkn90hU/P0qyEOEfgI+nC7vRIhnMAW4GHsj4L/K5wDXN/S6SJuQp8xJ/qnE0L1C3IiK+T7Jq6XHp+YOA9RHRkGG81oU5QVh3s4hk9NJTLc5tiogN6YifH6fnZpL85b47vkbS/PKCpMXpZ+Q2x5BunXon8AzJarR3RMSCiFgC3AQ8Jul54JYW992bxjYrXQ48C98i2ep0YRr/t1oWSPsjXpZ0RHrqE8DitFnqSOCu9PwZwOyM4rRuwKu5mvVAkj4KTIyIr7VRZgZwY0Qs67zIrCvxKCazHigi7mtuCssnbWKb6eTQs7kGYWZmebkPwszM8nKCMDOzvJwgzMwsLycIMzPLywnCzMzy+v9OnZKRbllNjQAAAABJRU5ErkJggg==\n" + "image/svg+xml": "\r\n\r\n\r\n\r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n 
\r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n\r\n", + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYgAAAEWCAYAAAB8LwAVAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8GearUAAAgAElEQVR4nO3deXxV1bn/8c9DCBDmMcyTMglOYASsI6gFWwdsbSs41AnqbW1t7aVXe29va3u9HfjdttZqLeA8VlvEoQq1ZbBVZhkEJRiQIQRMAoQhJGR6fn/sHTyEk3CAnJzk5Pt+vfJK9trrnP1shvNkrbXXWubuiIiIVNUk0QGIiEj9pAQhIiJRKUGIiEhUShAiIhKVEoSIiESlBCEiIlEpQYicADO70MwyEx2HSDwpQUiDY2abzeyyRMbg7v9098Hxen8zG2dm75jZfjPLM7OFZnZ1vK4nEo0ShEgUZpaSwGtfB7wMPA30AroC/w1cdQLvZWam/+dyQvQPR5KGmTUxs3vNbKOZ7TKzl8ysY8T5l81sp5ntDX87HxZx7kkz+4OZvWlmhcCYsKXy72a2JnzNn8ysRVj/EjPLjnh9tXXD8z8wsx1mlmNmd5iZm9mAKPdgwK+Bn7n7THff6+4V7r7Q3SeHdX5iZs9GvKZf+H5Nw+MFZvaAmb0LHAR+aGbLq1zne2b2WvhzczP7f2a21cw+NbNHzSztJP86JAkoQUgy+Q4wAbgY6AHsAR6OOP8WMBBIB94Hnqvy+knAA0Ab4F9h2VeB8UB/4EzglhquH7WumY0H7gEuAwaE8VVnMNAb+HMNdWJxEzCF4F4eAgab2cCI85OA58OffwkMAs4O4+tJ0GKRRk4JQpLJN4D/dPdsdz8E/AS4rvI3a3d/3N33R5w7y8zaRbz+VXd/N/yNvTgs+52757j7buB1gg/R6lRX96vAE+6+zt0PAvfX8B6dwu87Yr7r6J4Mr1fm7nuBV4GJAGGiGAK8FrZYJgPfc/fd7r4f+F/g+pO8viQBJQhJJn2BV8yswMwKgI+AcqCrmaWY2S/C7qd9wObwNZ0jXr8tynvujPj5INC6hutXV7dHlfeOdp1Ku8Lv3WuoE4uq13ieMEEQtB5mh8mqC9ASWBHx5zYnLJdGTglCksk24Ap3bx/x1cLdtxN8KF5D0M3TDugXvsYiXh+vpY13EAw2V+pdQ91Mgvv4cg11Cgk+1Ct1i1Kn6r38DehsZmcTJIrK7qV8oAgYFvFn1s7da0qE0kgoQUhDlWpmLSK+mgKPAg+YWV8AM+tiZteE9dsAhwh+Q29J0I1SV14CbjWz08ysJTX073uw/v49wI/M7FYzaxsOvl9gZtPDaquAi8ysT9hFdt+xAnD3MoJxjWlAR+DtsLwCmAH8xszSAcysp5mNO+G7laShBCEN1ZsEv/lWfv0EeBB4Dfibme0HFgOjwvpPA1uA7cCH4bk64e5vAb8D5gNZwKLw1KFq6v8Z+BpwG5ADfAr8D8E4Au7+NvAnYA2wAngjxlCeJ2hBvRwmjEr/Eca1OOx++zvBYLk0cqYNg0Tql
pmdBqwFmlf5oBapV9SCEKkDZnatmTUzsw4Ej5W+ruQg9Z0ShEjd+AaQB2wkeLLq3xIbjsixqYtJRESiUgtCRESiaproAGpT586dvV+/fokOQ0SkwVixYkW+u0edGJlUCaJfv34sX7782BVFRAQAM9tS3bm4djGZ2XgzyzSzLDO7t5o6l5jZKjNbZ2YLw7LBYVnl1z4z+248YxURkSPFrQURrqf/MHA5kA0sM7PX3P3DiDrtgUeA8e6+tXImp7tnEi50Fr7PduCVeMUqIiJHi2cLYiSQ5e6b3L0EeJFgLZxIk4BZ7r4VwN1zo7zPpcBGd6+2GSQiIrUvngmiJ0euKJkdlkUaBHQINzhZYWY3R3mf64EX4hSjiIhUI56D1BalrOqki6bAOQSthDRgkZktdvcNAGbWDLiaGhYjM7MpBBuj0KdPn1oIW0REIL4JIpsjlzXuRbDwWNU6+e5eCBSa2TvAWcCG8PwVwPvu/ml1F3H36cB0gIyMDM36E5FGY/bK7Uybm0lOQRE92qcxddxgJgyv2lFz4uLZxbQMGGhm/cOWwPUEK21GehW40MyahssgjyLY5KXSRNS9JCJylNkrt3PfrA/YXlCEA9sLirhv1gfMXrm91q4RtwQRLkR2FzCX4EP/JXdfZ2Z3mtmdYZ2PCHavWgMsBWa6+1qAMGFcDsyKV4wiIg3VtLmZFJWWH1FWVFrOtLmZtXaNuE6Uc/c3Cdbtjyx7tMrxNIJNTKq+9iCf7c8rIiKhXQcOsb2gKOq5nGrKT0RSzaQWEUlW7s6qbQU8s2gLb6zZUW29Hu3Tau2aShAiIvVYUUk5r6/O4enFm1m7fR+tmqVw/cjedG/Xgt/9I+uIbqa01BSmjqu9zQCVIERE6qHN+YU8u3gLL6/IZm9RKQPTW/Oza4Zx7YhetG4efHR3b5cW16eYlCBEROqJ8gpnQWYuTy/awsINeTRtYowb1o2bzuvLqP4dMTtyetmE4T1rNSFUpQQhIpJguwtL+NOybTy3ZAvZe4pIb9Oc7142kIkj+9C1bYuExaUEISKSAIcHnRcHg84lZRWM6t+R+644jc8P60pqSuL3c1OCEBGpQ8Wl5by2OodnFm3hg+17adUsha9l9Oam8/oyqGubRId3BCUIEZE6sGVXMOj80vLPBp1/es0wrh3ekzYtUhMdXlRKECIicVJe4Szc8NmgcxMzxg3ryk2j+zH6lKMHnesbJQgRkVq2u7CEl5YHg87bdgeDzt8ZGww6d2uXuEHn46UEISJSS1ZtK+DpRZuPGHT+j/FDGDesW70YdD5eShAiIiehuDSY6fzM4i2syQ4Gnb+a0YubRvdjcLf6Neh8vJQgREROwNZdB3l2yRZeWr6NgoOlDGgAg87HSwlCRCRGlYPOzyzawoIGOOh8vJQgRESOYU846PxsOOjcpU1zvj12IJMa2KDz8VKCEBGpxuptBTy9aAuvr8mhpKyCkQ180Pl4KUGIiESoHHR+dvEWVmfvpWU46Hzj6L4M6dY20eHVKSUIERGCQefnlmzhTxGDzvdfPYwvjUieQefjpQQhIo1WRYWzcEMeTy/afHjQ+fNDu3LTeX0575ROSTfofLyUIESk0alu0HniyN50b1d7W3Y2dEoQItJorA6X1359dQ6HwkHnH4wLBp2bNU3+QefjpQQhIkmtuLScN9bs4JlFmw8POl93Ti9uOq/xDTofLyUIEUlK23YfDJfX3saeg6Wc2qUV9189jGtH9KRtIx10Pl5KECLSIM1euZ1pczPJKSiiR/s0po4bzNVn9WDhx3k8s2gL8zNzPxt0Ht2X807VoPPxMndPdAy1JiMjw5cvX57oMEQkzmav3M59sz6gqLT8cFnTJka7tFR2FZbQuXVzJo3szcRRfTTofAxmtsLdM6KdUwtCRBqcaXMzj0gOAGUVzv7iMh6aOFyDzrVECUJEGpztBUVRy0vLK7jqrB51HE3yUoIQkQZjyaZd/H5+VrXne7RXd1JtUoIQkXrN3
flXVj4P/SOLpZt307l1M64+qwd/W7eT4rKKw/XSUlOYOm5wAiNNPkoQIlIvuTvz1ufy0LwsVm0roFvbFvz4qqFMHNmHFqkpUZ9imjC8Z6LDTipKECJSr1RUOHPX7eSheVl8uGMfvTqk8cC1p3PdOb1o3jTlcL0Jw3sqIcSZEoSI1AvlFc4ba3J4eH4WGz49QP/OrZh23ZlMGN6zUey9UB8pQYhIQpWWVzB75XYeWbCRT/ILGZjemgevP5srz+xBShNNbEskJQgRSYhDZeX8eUU2f1iwkew9RQzt3pY/3DCCccO60USJoV6Ia4Iws/HAg0AKMNPdfxGlziXAb4FUIN/dLw7L2wMzgdMBB25z90XxjFdE4q+4tJwXlm7ljws3sXNfMWf1bs/9Vw9j7JB0LYVRz8QtQZhZCvAwcDmQDSwzs9fc/cOIOu2BR4Dx7r7VzNIj3uJBYI67X2dmzYCW8YpVROKv8FAZzy7ewox/fkL+gUOM7NeRaV85kwsGdFZiqKfi2YIYCWS5+yYAM3sRuAb4MKLOJGCWu28FcPfcsG5b4CLglrC8BCiJY6wiEif7ikt5+r3NPPavT9hzsJQLBnTm22OHM+qUTokOTY4hngmiJ7At4jgbGFWlziAg1cwWAG2AB939aeAUIA94wszOAlYAd7t7YRzjFZFatKewhCfe/YQn3tvM/uIyxg5J566xAxjRp0OiQ5MYxTNBRGszVl06tilwDnApkAYsMrPFYfkI4NvuvsTMHgTuBX501EXMpgBTAPr06VN70YvICck/cIgZ/9zEs4u2UFhSzrhhXfn22IGc3rNdokOT4xTPBJEN9I447gXkRKmTH7YMCs3sHeAs4J9AtrsvCev9mSBBHMXdpwPTIVjuu/bCF5HjsXNvMX98ZyMvLN3KobIKrjyzB3eNGcDgbm0SHZqcoHgmiGXAQDPrD2wHricYc4j0KvB7M2sKNCPogvqNu+80s21mNtjdMwlaGB8iIglXdYmL2y/oz8a8A7y8PJtydyac3ZNvjjmVU7u0TnSocpLiliDcvczM7gLmEjzm+ri7rzOzO8Pzj7r7R2Y2B1gDVBA8Crs2fItvA8+FTzBtAm6NV6wiEpuqG/VsLyjip298SEoT46sZvfm3i0+lTyc9cJgstKOciMRs1AN/59P9h44q79q2OUt+eFkCIpKTpR3lROSEHCorZ8mm3czPzGX++tyoyQEgd1/0cmnYlCBE5Ag5BUVhQsjj3ax8ikrLada0CZ87tRN7Dpayt6j0qNdoo57kpAQh0siVlVfw/taCw62E9Tv3A9CzfRpfPqcnY4ekc94pnUlrlnLUGARoo55kpgQh0gjtOnCIhRvymLc+l3c25LGvuIyUJkZG3w7cd8UQxgxJZ2B666OWwKjcf0Eb9TQOShAijUBFhbMuZx/z1ucyPzOX1dkFuEPn1s34/LBujBmczgUDO9MuLfWY76WNehoPJQiRJLWvuJR/fZzP/PW5LNiQR97+Q5jBmb3ac/elAxk7JJ3Te7TT0tpSLSUIkSTh7mTlHmB+Zi7z1ueyfPMeyiqcti2actGgLowZnM7Fg7vQuXXzRIcqDcQxE4SZdXT33XURjIgcn6KSchZv2nW46yh7TxEAQ7q14Y4LT2HskHRG9GlPU23ZKScglhbEEjNbBTwBvOXJNLNOpAHatvvg4SeO3tu4i0NlFaSlpnD+gE782yWncsngdHrqsVOpBbEkiEHAZcBtwENm9ifgSXffENfIRASAkrIKlm/ZzYLM4KmjrNwDAPTt1JKJI/swZkg6o/p3pEVqSoIjlWRzzAQRthjeBt42szHAs8A3zWw1cK+2ARWpfbn7ilmQmcf8zFz++XE+Bw6VkZpijOrfievP7c3YIen079xKO7FJXMUyBtEJuBG4CfiUYBG914CzgZeB/vEMUKQxKK9wVmcXsGB9LvMyc1m7fR8QrHF05ZndGTMknfMHdKZ1cz1XInUnln9ti4BngAnunh1RvtzMHo1PW
CLJr+BgCe+Ej6Eu3JDH7sISmhiM6NOBqeMGM2ZwOqd1b6NWgiRMLAlicHUD0+7+y1qORyRpuTvrd+5n3vpcFmTmsmLLHiocOrRM5eJBXRgzJJ2LBnahQ6tmiQ5VBIgtQfzNzL7i7gUAZtYBeNHdx8U3NJGGr/BQGe9m5TM/M48Fmbns2FsMwLAebfnWmAFcMjids3u3J0WT1aQeiiVBdKlMDgDuvsfM0uMYk0iD9kl+IfPDeQlLNu2mpLyC1s2bcsGAznz3si5cMjidrm1bJDpMkWOKJUGUm1kfd98KYGZ9Ac2FEAkdKitn6Se7w66jPD7JLwTg1C6tuPm8vowdkk5Gv440a6rJatKwxJIg/hP4l5ktDI8vAqbELySR+m/H3iLmrw8eQ303K5+DJZ/tmXDL5/oxZnC6tt6UBi+WeRBzzGwEMBow4Hvunh/3yETqkbLyClZuKwiWtKiyZ8KXRhy5Z4JIsoj1oepyIBdoAQw1M9z9nfiFJZJ4lXsmzM/M450NeewtKo1pzwSRZBHLRLk7gLuBXsAqgpbEImBsfEMTqVuVeyZUroYauWfC5UO7HteeCSLJIJYWxN3AucBidx9jZkOA++Mblkjd2Fdcyrsf5wcDzNozQeQIsSSIYncvNjPMrLm7rzczbUArDcLslduP2B7z3z8/iDN6tQvHEvJYtnm39kwQqUYsCSLbzNoDswkW7NsD5MQ3LJGTN3vldu6b9QFFpeUAbC8o4p6XVh9+Rlt7JojULJanmK4Nf/yJmc0H2gFz4hqVSC341Zz1h5NDJQfap6Xy17sv1J4JIsdQY4IwsybAGnc/HcDdF9ZUX6Q+KC2v4C8rsskJl7Woam9RqZKDSAxqTBDuXmFmqyNnUovUV6XlFcx6P5uH5mWRvaeI1BSjtPzoSf89lBxEYhLLGER3YJ2ZLQUKKwvd/eq4RSVyHMrKK5i1cju/n5fF1t0HObNXO356zTD2Hizlh6+sPaKbKS01hanj9IyFSCxiSRB6pFXqpbLyCmavyuGheR+zZddBTu/Zlpk3Z3DpaemHJ6+Z2RFPMU0dN5gJw3smOHKRhiGWQWqNO0i9Ul7hvLpqOw/Ny+KT/EKGdm/L9JvO4fKhXY+a1TxheE8lBJETFMtM6v18tnprMyAVKHT3tvEMTKSq8grn9dU5/O4fH7Mpv5Ah3drw6I3nMG7Y0YlBRE5eLC2INpHHZjYBGBm3iKTRqzq57fuXDyIlxfjdPz5mY14hg7u24Q83jGDcsG6a4SwSR8e9A7q7zzaze+MRjEi0yW3ffzmY3Daoa2seuWEE45UYROpELF1MX4o4bAJkoA2DJE6mzc2MOrmtQ8tU5tx9kRKDSB2KpQVxVcTPZcBm4JpY3tzMxgMPAinATHf/RZQ6lwC/JRjbyHf3i8PyzcB+gqXGy9w9I5ZrSsOWU1AUtbzgYKmSg0gdi2UM4tYTeWMzSwEeBi4HsoFlZvaau38YUac98Agw3t23Rtnreow2J2pcerRPY3uUJKHJbSJ175irk5nZU+EHeeVxBzN7PIb3Hglkufsmdy8BXuTolsckYFblLG13z409dElGU8cNpnmVvZs1uU0kMWJZvvJMdy+oPHD3PcDwGF7XE9gWcZwdlkUaBHQwswVmtsLMbo4458DfwvJq98A2sylmttzMlufl5cUQltRnE4b3ZMyQzxqSPdun8fMvnaG5DCIJEMsYRBMz6xAmBsysY4yvi9ZhXHVwuylwDnApkAYsMrPF7r4BON/dc8Jup7fNbH20bU7dfTowHSAjI0OD50lgY+4BRvbvyEvfOC/RoYg0arG0IP4PeM/MfmZmPwXeA34Vw+uygd4Rx704eh+JbGCOuxeGYw3vAGcBuHtO+D0XeAXNvWgUMnfu5+PcA1x1ZvdEhyLS6B0zQbj708CXgU+BPOBL7v5MDO+9DBhoZv3NrBlwPfBalTqvAheaWVMzawmMAj4ys1Zm1gbAzFoBnwfWxnpT0nC9sSaHJgbjT1eCE
Em0WOZBjAbWufvvw+M2ZjbK3ZfU9Dp3LzOzu4C5BI+5Pu7u68zszvD8o+7+kZnNAdYAFQSPwq41s1OAV8LlE5oCz7u7NilKcu7OG2t2cN6pnejSRlt+iiRaLGMJfwBGRBwXRimLyt3fBN6sUvZoleNpwLQqZZsIu5qk8ViXs49P8guZctEpiQ5FRIhtDMLc/fDgr7tXcAJLdIgcyxtrdpDSxBg/rFuiQxERYksQm8zsO2aWGn7dDWyKd2DSuATdSzmcP6AzHVo1S3Q4IkJsCeJO4HPAdoKnjkYBk+MZlDQ+q7P3kr2niCv19JJIvRHLUhu5BE8gAWBmacCVwMtxjEsamTdW55CaYowbqu4lkfoilhYEZpZiZleY2dPAJ8DX4huWNCYVFc6bH+zgooFdaNcyNdHhiEioxhaEmV1EsF7SF4GlwPnAKe5+sA5ik0Zi5bY95OwtZup4rbckUp9UmyDMLBvYSvBI61R3329mnyg5SG17ffUOmjVtwmWndU10KCISoaYupr8QLK73NeCqcEaz1jqSWlUedi+NGdyFNi3UvSRSn1SbINz9bqAf8GtgDLAB6GJmXzWz1nUTniS7ZZt3k7v/EF88s0eiQxGRKmocpPbAPHefTJAsJgETCHaVEzlpb6zJoUVqEy4dUnWvKBFJtJhnRLt7KfA68Hr4qKvISSkrr+CtD3Zy6ZCutGquyfki9U1Mj7lW5e7RNw4WOQ6LN+1mV2GJJseJ1FMnlCBEasMba3Jo1SzliB3kRKT+UIKQhCgtr2DOup1cNrQrLVJTEh2OiEQRy34Qg4CpQN/I+u4+No5xSZJ7NyufgoOlXKmnl0TqrVhGBl8GHgVmAOXxDUcaizfW7KBN86ZcNKhzokMRkWrEkiDK3P0PcY9EGoXZK7fzqznrydlbTFpqCm99sJMJw3smOiwRiSKWBPG6mX0TeAU4VFno7rvjFpUkpdkrt3PfrA8oKg0aokWl5dw36wMAJQmReiiWBPH18PvUiDIHtC+kHJdpczMPJ4dKRaXlTJubqQQhUg/Fsh9E/7oIRJJfTkH06TPVlYtIYsXyFFMq8G/ARWHRAuCP4cxqkZh1bNWMXYUlR5X3aK+J+SL1USxdTH8AUoFHwuObwrI74hWUJJ9/fZzP3qJSjCOXBE5LTWHqOO0DIVIfxZIgznX3syKO55nZ6ngFJMnnvY353P7UMgakt+am0X15ZMFGcgqK6NE+janjBmv8QaSeiiVBlJvZqe6+EcDMTkHzISRGSzbt4vYnl9O3U0ueu2MUnVo354bRfRMdlojEIJYEMRWYb2abACOYUX1rXKOSpLB8825ufXIZPdq34Lk7RtOpdfNEhyQixyGWp5j+YWYDgcEECWK9ux86xsukkXt/6x5ueWIZ3dq24IXJo+nSRslBpKGpaU/qse4+z8y+VOXUqWaGu8+Kc2zSQK3eVsDXH1tKp9bNeH7yaNLbtkh0SCJyAmpqQVwMzAOuinLOASUIOcra7Xu56bEltGuZyvOTR9OtnZKDSENVbYJw9x+HP/7U3T+JPGdmmjwnR/kwZx83PraENi1SeWHyaHpqfoNIgxbLfhB/iVL259oORBq2zJ37ufGxJaSlpvDC5NH07tgy0SGJyEmqaQxiCDAMaFdlHKItoH4DOSwrdz83zFxMaorx/OTR9Omk5CCSDGoagxgMXAm058hxiP3A5HgGJQ3HxrwDTJyxBLMgOfTv3CrRIYlILalpDOJV4FUzO8/dF9VhTNJAbM4vZNKMxVRUOC9OGc2pXVonOiQRqUWxTJRbaWbfIuhuOty15O63xS0qqfe27jrIxBmLKS13Xpg8moFd2yQ6JBGpZbEMUj8DdAPGAQuBXgTdTMdkZuPNLNPMsszs3mrqXGJmq8xsnZktrHIuxcxWmtkbsVxP6sa23UFyKCot59nbRzG4m5KDSDKKJUEMcPcfAYXu/hTwReCMY73IzFKAh4ErgKHARDMbWqVOe4JVYq9292HAV6q8zd3AR
zHEKHUkp6CISTMXs7+4lGdvH8XQHm0THZKIxEksCaJy34cCMzsdaAf0i+F1I4Esd9/k7iXAi8A1VepMAma5+1YAd8+tPGFmvQiS0cwYriV1YOfeYibOWExBYSnP3D6K03u2S3RIIhJHsSSI6WbWAfgR8BrwIfCrGF7XE9gWcZwdlkUaBHQwswVmtsLMbo4491vgB0BFTRcxsylmttzMlufl5cUQlpyI3H3FTJqxmF0HSnjq9pGc1bt9okMSkTiLZbG+yt/gF3J8+1BbtLeLcv1zgEuBNGCRmS0mSBy57r7CzC45RnzTgekAGRkZVd9fakHe/kNMnLGYnfuKefq2kYzo0yHRIYlIHahpotw9Nb3Q3X99jPfOBnpHHPcCcqLUyXf3QqDQzN4BzgJGAFeb2RcInpxqa2bPuvuNx7im1LJdBw5xw8zF5BQU8+St55LRr2OiQxKROlJTF1Ob8CuDYE/qnuHXnQSDzseyDBhoZv3NrBlwPUEXVaRXgQvNrKmZtQRGAR+5+33u3svd+4Wvm6fkUPf2FJZww8wlbNl1kMe+nsGoUzolOiQRqUM1TZS7H8DM/gaMcPf94fFPgJeP9cbuXmZmdwFzgRTgcXdfZ2Z3hucfdfePzGwOsIZgrGGmu689yXuSWrD3YCk3PraETfmFPPb1DD43oHOiQxKROmbuNXfbm9l64KzKTYLMrDmw2t2H1EF8xyUjI8OXL1+e6DAavL1Fpdz02BLW79jP9JvP4ZLB6YkOSUTixMxWuHtGtHOxzKR+BlhqZq8QDDJfCzxdi/FJPbK/uJSvP76Uj3bs49EblRxEGrNYnmJ6wMzeAi4Mi25195XxDUsS4cChMm55Yhlrt+/l4RtGcOlpXRMdkogkUE1PMbV1931m1hHYHH5Vnuvo7rvjH57UlYMlZdz2xDJWbSvg9xOHM25Yt0SHJCIJVlML4nmC5b5XcOT8BQuPj2dOhNRjRSXl3PbkMpZv2c2D1w/nijO6JzokEakHanqK6crwu7YXTWLFpeVMfno5Sz7ZzW++ejZXndUj0SGJSD1RUxfTiJpe6O7v1344UpeKS8uZ8swK3t2Yz7TrzmLC8KoroYhIY1ZTF9P/1XDOgbG1HIvUoUNl5Xzzufd5Z0Mev/zyGVx3Tq9EhyQi9UxNXUxj6jIQqTslZRXc9fxK5q3P5YFrT+dr5/ZJdEgiUg/FMg+CcJnvoRy5o5zmQjRApeUVfOeFlbz94af89Jph3DCqb6JDEpF66pgJwsx+DFxCkCDeJNgA6F9oslyDU1ZewXf/tIo563byoyuHcvN5/RIdkojUY7HsB3EdwXLcO939VoLVVpvHNSqpdeUVzvdfXs1f1+zgh18Ywu0X6OE0EalZLAmiyN0rgDIzawvkojkQDUp5hTP1z6t5dVUOU8cNZspFpyY6JBFpAGIZg1ge7h09g2DS3AFgaVyjklpTUeHcN2sNs97fzj2XD+JbYwYkOiQRaSBqmgfxe+B5d/9mWPRouDR3W3dfUyfRyUmpqHD+c/ZaXlqezXfGDuA7lw5MdEgi0n8j1W4AAA9FSURBVIDU1IL4GPg/M+sO/Al4wd1X1U1YcrLcnR+/to4Xlm7lm5ecyvcuH5TokESkgal2DMLdH3T384CLgd3AE2b2kZn9t5np06Yec3d++saHPLN4C1MuOoWp4wZjFm2LcBGR6h1zkNrdt7j7L919ODCJYD+Ij+IemZwQd+d/3/yIJ97dzK3n9+O+K4YoOYjICTlmgjCzVDO7ysyeA94CNgBfjntkctzcnV/OyWTGPz/h5vP68t9XDlVyEJETVtMg9eXAROCLBE8tvQhMcffCOopNjtOv397Aows3MmlUH+6/epiSg4iclJoGqX9IsCfEv2tzoPrvwb9/zEPzsvhaRm/+55rTlRxE5KRpsb4k8PD8LH7z9w18eUQvfv6lM2jSRMlBRE5eLDOppR7748KNTJubyYSze/Cr685UchCRWqME0YDN/Ocmfv7Weq48s
zv/7ytnkaLkICK1SAmigXrqvc38z18/4orTu/Hbr51N0xT9VYpI7dKnSgP07OIt/Pi1dVw+tCu/mzhcyUFE4kKfLA3Mi0u38l+z13LpkHQenjSCVCUHEYkTfbo0IC8v38Z9r3zAxYO68MiNI2jWVH99IhI/+oRpIF5Zmc0P/rKG80/tzB9vOofmTVMSHZKIJDkliAbgtdU5fP+l1Yzu34kZN2fQIlXJQUTiTwminvvrmh1870+ryOjXkcduySCtmZKDiNQNJYh6bO66ndz94kqG927P47ecS8tmsWwAKCJSO5Qg6qm/f/gpdz3/Pmf0ascTt55L6+ZKDiJSt5Qg6qH5mbl887n3Oa17W566bSRtWqQmOiQRaYSUIOqZdzbk8Y1nVjCwa2ueuW0UbZUcRCRB4pogzGy8mWWaWZaZ3VtNnUvMbJWZrTOzhWFZCzNbamarw/L74xlnffFuVj6Tn17OqV1a8+zto2jXUslBRBInbh3bZpYCPAxcDmQDy8zsNXf/MKJOe+ARYLy7bzWz9PDUIWCsux8ws1TgX2b2lrsvjle8ibZ40y5uf2oZfTu15NnbR9KhVbNEhyQijVw8WxAjgSx33+TuJQQ70l1Tpc4kYJa7bwVw99zwu7v7gbBOavjlcYw1oZZt3s1tTy6jV4eWPHfHaDq1bp7okERE4pogegLbIo6zw7JIg4AOZrbAzFaY2c2VJ8wsxcxWAbnA2+6+JNpFzGyKmS03s+V5eXm1fAvxt2LLHm55fCnd2rbg+TtG0aWNkoOI1A/xTBDRNieo2gpoCpxDsO/1OOBHZjYIwN3L3f1soBcw0sxOj3YRd5/u7hnuntGlS5fai74OrNpWwC2PL6Vzm+Y8P3k06W1bJDokEZHD4pkgsoHeEce9gJwodea4e6G75wPvAGdFVnD3AmABMD5+oda9D7L3cvNjS2jfKpUXJo+mWzslBxGpX+KZIJYBA82sv5k1A64HXqtS51XgQjNramYtgVHAR2bWJRzAxszSgMuA9XGMtU6ty9nLjY8toU2LIDn0aJ+W6JBERI4St6eY3L3MzO4C5gIpwOPuvs7M7gzPP+ruH5nZHGANUAHMdPe1ZnYm8FT4JFQT4CV3fyNesdal9Tv3cePMJbRqlsILk0fTq0PLRIckIhKVuSfPw0EZGRm+fPnyRIdRrY8/3c/10xfTNMX405Tz6Ne5VaJDEpFGzsxWuHtGtHOaSV1HsnIPMHHGEpo0MV6YPFrJQUTqPSWIOvBJfiGTZiwGnBcmj+KULq0THZKIyDFpidA427KrkInTF1NW4bw4ZTQD0tskOiQRkZioBRFH23YfZNKMJRSXlfPs7aMY1FXJQUQaDiWIONleUMTEGYvZX1zKs7ePYmiPtokOSUTkuKiLKQ527C1i4vTF7C0q5bk7RnF6z3aJDklE5LipBVHLPt1XzKQZS9hdWMLTt43kzF7tEx2SiMgJUYKoRbn7i5k4YzG5+4p56rZzGd6nQ6JDEhE5YepiqiX5Bw5xw4wl7Cgo5qnbRnJO346JDklE5KSoBVELdheWcOPMJWzbc5DHbzmXkf2VHESk4VOCOEkFB4Pk8El+ITNvPpfzTu2U6JBERGqFEsRJ2FtUyk2PLSUr9wDTb87ggoGdEx2SiEitUYI4QfuKS7n58aWs37mPR28awcWDGtZmRSIix6JB6uMwe+V2ps3NZHtBEc1SmlBWUcEfb8pg7JCuiQ5NRKTWKUHEaPbK7dw36wOKSssBKCmvoFlKEwoPlSU4MhGR+FAXU4ymzc08nBwqlZRXMG1uZoIiEhGJLyWIGG0vKIpanlNNuYhIQ6cEEYP563Oxas5pP2kRSVZKEDVwdx5ZkMVtTy2jR/s0mjc98o8rLTWFqeMGJyg6EZH4UoKoxsGSMu56YSW/mpPJF8/ozt/vuZhffvlMerZPw4Ce7dP4+ZfOYMLwnokOVUQkLvQUUxTbdh9kyjMrWL9zH/8xfgh3XnwKZsaE4T2VEESk0VCCq
OK9rHy+9fz7lFU4T9xyLpcMTk90SCIiCdHoE0Tk5Ld2aansLy7llC6tmXFzBv07t0p0eCIiCdOoE0TVyW97i0ppYnDHBf2VHESk0WvUg9TRJr9VODw0LytBEYmI1B+NOkFUN8lNk99ERBp5gqhukpsmv4mINPIEMXXcYNJSU44o0+Q3EZFAox6krpzTMG1uJjkFRfRon8bUcYM110FEhEaeIABNfhMRqUaj7mISEZHqKUGIiEhUShAiIhKVEoSIiESlBCEiIlGZuyc6hlpjZnnAlmNU6wzk10E49Ulju2fdb3LT/dauvu7eJdqJpEoQsTCz5e6ekeg46lJju2fdb3LT/dYddTGJiEhUShAiIhJVY0wQ0xMdQAI0tnvW/SY33W8daXRjECIiEpvG2IIQEZEYKEGIiEhUjSpBmNl4M8s0sywzuzfR8dQ2M3vczHLNbG1EWUcze9vMPg6/d0hkjLXJzHqb2Xwz+8jM1pnZ3WF5Ut6zmbUws6Vmtjq83/vD8qS830pmlmJmK83sjfA42e93s5l9YGarzGx5WJaQe240CcLMUoCHgSuAocBEMxua2Khq3ZPA+Cpl9wL/cPeBwD/C42RRBnzf3U8DRgPfCv9Ok/WeDwFj3f0s4GxgvJmNJnnvt9LdwEcRx8l+vwBj3P3siPkPCbnnRpMggJFAlrtvcvcS4EXgmgTHVKvc/R1gd5Xia4Cnwp+fAibUaVBx5O473P398Of9BB8iPUnSe/bAgfAwNfxykvR+AcysF/BFYGZEcdLebw0Scs+NKUH0BLZFHGeHZcmuq7vvgOADFUhPcDxxYWb9gOHAEpL4nsPullVALvC2uyf1/QK/BX4AVESUJfP9QpD0/2ZmK8xsSliWkHtuTDvKWZQyPeObBMysNfAX4Lvuvs8s2l91cnD3cuBsM2sPvGJmpyc6pngxsyuBXHdfYWaXJDqeOnS+u+eYWTrwtpmtT1QgjakFkQ30jjjuBeQkKJa69KmZdQcIv+cmOJ5aZWapBMnhOXefFRYn9T0DuHsBsIBgzClZ7/d84Goz20zQJTzWzJ4lee8XAHfPCb/nAq8QdI8n5J4bU4JYBgw0s/5m1gy4HngtwTHVhdeAr4c/fx14NYGx1CoLmgqPAR+5+68jTiXlPZtZl7DlgJmlAZcB60nS+3X3+9y9l7v3I/j/Os/dbyRJ7xfAzFqZWZvKn4HPA2tJ0D03qpnUZvYFgj7NFOBxd38gwSHVKjN7AbiEYHngT4EfA7OBl4A+wFbgK+5edSC7QTKzC4B/Ah/wWR/1DwnGIZLuns3sTIIByhSCX+5ecvefmlknkvB+I4VdTP/u7lcm8/2a2SkErQYIhgCed/cHEnXPjSpBiIhI7BpTF5OIiBwHJQgREYlKCUJERKJSghARkaiUIEREJColCGkwzOw3ZvbdiOO5ZjYz4vj/zOyeGl7/pJldF/68wMyO2gjezFLN7Bfhqplrw9VTrwjPbTazzicQ9+HrVnP+4XDlzg/NrCj8eZWZXWdmb1bOfahNZta9cnXUas43M7N3zKwxrbYgVShBSEPyHvA5ADNrQjDfY1jE+c8B757kNX4GdAdOd/fTgauANif5njVy92+5+9nAF4CN4SqeZ7v7n939C+Gs6dp2DzCjhphKCFYN/Vocri0NhBKENCTvEiYIgsSwFthvZh3MrDlwGrDSzP7bzJaFLYDpFuPiTGbWEpgMfNvdDwG4+6fu/lKUuveE77+2SqvmZjNbE+7Z8EyU1/0sbFHE9H+vstViZv3MbL2ZzQyv+ZyZXWZm74atnZFh/VYW7AuyzII9FKpbsfjLwJzwNcPCltKqMPaBYZ3ZwA2xxCnJSc1HaTDCBczKzKwPQaJYRLAi73nAXmCNu5eY2e/d/acA4Yf0lcDrMVxiALDV3ffVVMnMzgFuBUYRLAK5xMwWAiXAfxIstpZvZh2rvO5XQDvgVj+xGaoDgK8AUwiWjpkEXABcTTCDfEJ4/XnuflvYNbXUzP7u7
oURcfQH9lQmQeBO4EF3fy5chiYlLF8LnHsCcUqSUAtCGprKVkRlglgUcfxeWGeMmS0xsw+AsRzZDVUbLgBecffCcH+GWcCF4bX+7O75AFWWQvgR0N7dv3GCyQHgE3f/wN0rgHUEG8g4wVIj/cI6nwfutWBJ8AVAC4LlGSJ1B/IijhcBPzSz/wD6untRGH85UFK5NpA0PkoQ0tBUjkOcQfAb7mKCFsTngHfNrAXwCHCdu59B0M/eIsb3zgL6xPCBWF2XlVH9EvLLgHOqtiqO06GInysijiv4rDfAgC9HjGP0cffI3dgAioj4M3H35wlaIUXAXDMbG1G3OVB8EjFLA6YEIQ3NuwRdRrvdvTz8Lb09QZJYxGcffPkW7BNR7dNDVbn7QYLVYX8XdrVUPu1zY5Wq7wATzKxluOLmtQSLBv4D+Gq4sBpVksEc4BfAX+P8G/lc4NuV4y5mNjxKnQ181uKoXCBuk7v/jmDV0DPD8k5AnruXxjFeqceUIKSh+YDg6aXFVcr2unt++MTPjLBsNsFv7sfjvwi6Xz40s7Xhe0R2xxBuc/oksJRg5diZ7r7S3dcBDwALzWw18Osqr3s5jO21cLnuePgZwVaka8L4f1a1QjgesdHMBoRFXwPWht1SQ4Cnw/IxwJtxilMaAK3mKtIImdm1wDnu/l811JkF3OfumXUXmdQneopJpBFy91cqu8KiCbvYZis5NG5qQYiISFQagxARkaiUIEREJColCBERiUoJQkREolKCEBGRqP4/CJ4U6RCkPy8AAAAASUVORK5CYII=\n" }, "metadata": { "needs_background": "light" @@ -435,7 +450,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 15, "metadata": { "slideshow": { "slide_type": "slide" @@ -443,46 +458,86 @@ }, "outputs": [], "source": [ - "''' BaseEstimator is the parent class for a customized learner '''\n", - "from flaml.model import BaseEstimator\n", - "from flaml.space import ConfigSearchInfo\n", - "''' import the RGF implementation from rgf.sklearn module'''\n", + "''' SKLearnEstimator is the super class for a sklearn learner '''\n", + "from flaml.model import SKLearnEstimator\n", + "from flaml import tune\n", "from rgf.sklearn import RGFClassifier, RGFRegressor\n", "\n", "\n", - "class MyRegularizedGreedyForest(BaseEstimator):\n", + "class MyRegularizedGreedyForest(SKLearnEstimator):\n", "\n", - " # search space\n", - " params_configsearch_info = {\n", - " 'max_leaf': ConfigSearchInfo(name = 'max_leaf', type = int, lower = 4, init = 4, upper = 10000),\n", - " 'n_iter': ConfigSearchInfo(name = 'n_iter', type = int, lower = 1, init = 1, upper = 32768),\n", - " 'n_tree_search': ConfigSearchInfo(name = 'n_tree_search', type = int, lower = 1, init = 1, upper = 32768),\n", - 
" 'opt_interval': ConfigSearchInfo(name = 'opt_interval', type = int, lower = 1, init = 100, upper = 10000),\n", - " 'learning_rate': ConfigSearchInfo(name = 'learning_rate', type = float, lower = 0.01, init = 1.0, upper = 20.0),\n", - " 'min_samples_leaf': ConfigSearchInfo(name = 'min_samples_leaf', type = int, lower = 1, init = 20, upper = 20)\n", - " }\n", - " \n", - " def __init__(self, objective_name = 'binary:logistic', n_jobs = 1, max_leaf = 1000, \n", - " n_iter = 1, n_tree_search = 1, opt_interval = 1, learning_rate = 1.0, min_samples_leaf = 1):\n", "\n", - " '''regression for RGFRegressor; binary:logistic and multiclass for RGFClassifier'''\n", - " self.objective_name = objective_name\n", + " def __init__(self, task = 'binary:logistic', n_jobs = 1, **params):\n", + " '''Constructor\n", + " \n", + " Args:\n", + " task: A string of the task type, one of\n", + " 'binary:logistic', 'multi:softmax', 'regression'\n", + " n_jobs: An integer of the number of parallel threads\n", + " params: A dictionary of the hyperparameter names and values\n", + " '''\n", "\n", - " if 'regression' in objective_name:\n", + " super().__init__(task, **params)\n", + "\n", + " '''task=regression for RGFRegressor; \n", + " binary:logistic and multiclass:softmax for RGFClassifier'''\n", + " if 'regression' in task:\n", " self.estimator_class = RGFRegressor\n", " else:\n", " self.estimator_class = RGFClassifier\n", "\n", - " # round integer hyperparameters\n", + " # convert to int for integer hyperparameters\n", " self.params = {\n", " \"n_jobs\": n_jobs,\n", - " 'max_leaf': int(round(max_leaf)),\n", - " 'n_iter': int(round(n_iter)),\n", - " 'n_tree_search': int(round(n_tree_search)),\n", - " 'opt_interval': int(round(opt_interval)),\n", - " 'learning_rate': learning_rate,\n", - " 'min_samples_leaf':int(round(min_samples_leaf))\n", - " } \n" + " 'max_leaf': int(params['max_leaf']),\n", + " 'n_iter': int(params['n_iter']),\n", + " 'n_tree_search': int(params['n_tree_search']),\n", + " 
'opt_interval': int(params['opt_interval']),\n", + " 'learning_rate': params['learning_rate'],\n", + " 'min_samples_leaf':int(params['min_samples_leaf'])\n", + " } \n", + "\n", + " @classmethod\n", + " def search_space(cls, data_size, task):\n", + " '''[required method] search space\n", + "\n", + " Returns:\n", + " A dictionary of the search space. \n", + " Each key is the name of a hyperparameter, and value is a dict with\n", + " its domain and init_value (optional), cat_hp_cost (optional) \n", + " e.g., \n", + " {'domain': tune.randint(lower=1, upper=10), 'init_value': 1}\n", + " '''\n", + " space = { \n", + " 'max_leaf': {'domain': tune.qloguniform(lower = 4, upper = data_size, q = 1), 'init_value': 4},\n", + " 'n_iter': {'domain': tune.qloguniform(lower = 1, upper = data_size, q = 1), 'init_value': 1},\n", + " 'n_tree_search': {'domain': tune.qloguniform(lower = 1, upper = 32768, q = 1), 'init_value': 1},\n", + " 'opt_interval': {'domain': tune.qloguniform(lower = 1, upper = 10000, q = 1), 'init_value': 100},\n", + " 'learning_rate': {'domain': tune.loguniform(lower = 0.01, upper = 20.0)},\n", + " 'min_samples_leaf': {'domain': tune.qloguniform(lower = 1, upper = 20, q = 1), 'init_value': 20},\n", + " }\n", + " return space\n", + "\n", + " @classmethod\n", + " def size(cls, config):\n", + " '''[optional method] memory size of the estimator in bytes\n", + " \n", + " Args:\n", + " config - the dict of the hyperparameter config\n", + "\n", + " Returns:\n", + " A float of the memory size required by the estimator to train the\n", + " given config\n", + " '''\n", + " max_leaves = int(round(config['max_leaf']))\n", + " n_estimators = int(round(config['n_iter']))\n", + " return (max_leaves*3 + (max_leaves-1)*4 + 1.0)*n_estimators*8\n", + "\n", + " @classmethod\n", + " def cost_relative2lgbm(cls):\n", + " '''[optional method] relative cost compared to lightgbm\n", + " '''\n", + " return 1.0\n" ] }, { @@ -500,7 +555,7 @@ }, { "cell_type": "code", - "execution_count": 
13, + "execution_count": 16, "metadata": { "slideshow": { "slide_type": "slide" @@ -508,14 +563,13 @@ }, "outputs": [], "source": [ - "''' add a new learner RGF'''\n", "automl = AutoML()\n", "automl.add_learner(learner_name = 'RGF', learner_class = MyRegularizedGreedyForest)" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 17, "metadata": { "slideshow": { "slide_type": "slide" @@ -527,68 +581,121 @@ "output_type": "stream", "name": "stderr", "text": [ - "[flaml.automl: 12-15 07:42:43] {660} INFO - List of ML learners in AutoML Run: ['RGF', 'lgbm', 'rf', 'xgboost']\n", - "[flaml.automl: 12-15 07:42:43] {665} INFO - Evaluation method: holdout\n", - "[flaml.automl: 12-15 07:42:43] {683} INFO - Minimizing error metric: 1-accuracy\n", - "[flaml.automl: 12-15 07:42:45] {327} INFO - Using StratifiedKFold\n", - "[flaml.automl: 12-15 07:42:45] {728} INFO - iteration 0 current learner RGF\n", - "[flaml.automl: 12-15 07:42:47] {793} INFO - at 4.0s,\tbest RGF's error=0.3764,\tbest RGF's error=0.3764\n", - "[flaml.automl: 12-15 07:42:47] {728} INFO - iteration 1 current learner RGF\n", - "[flaml.automl: 12-15 07:42:52] {793} INFO - at 8.7s,\tbest RGF's error=0.3764,\tbest RGF's error=0.3764\n", - "[flaml.automl: 12-15 07:42:52] {728} INFO - iteration 2 current learner lgbm\n", - "[flaml.automl: 12-15 07:42:52] {793} INFO - at 8.9s,\tbest lgbm's error=0.3790,\tbest RGF's error=0.3764\n", - "[flaml.automl: 12-15 07:42:52] {728} INFO - iteration 3 current learner lgbm\n", - "[flaml.automl: 12-15 07:42:53] {793} INFO - at 9.3s,\tbest lgbm's error=0.3790,\tbest RGF's error=0.3764\n", - "[flaml.automl: 12-15 07:42:53] {728} INFO - iteration 4 current learner lgbm\n", - "[flaml.automl: 12-15 07:42:53] {793} INFO - at 9.8s,\tbest lgbm's error=0.3718,\tbest lgbm's error=0.3718\n", - "[flaml.automl: 12-15 07:42:53] {728} INFO - iteration 5 current learner lgbm\n", - "[flaml.automl: 12-15 07:42:53] {793} INFO - at 10.0s,\tbest lgbm's error=0.3652,\tbest lgbm's 
error=0.3652\n", - "[flaml.automl: 12-15 07:42:53] {728} INFO - iteration 6 current learner lgbm\n", - "[flaml.automl: 12-15 07:42:54] {793} INFO - at 10.5s,\tbest lgbm's error=0.3652,\tbest lgbm's error=0.3652\n", - "[flaml.automl: 12-15 07:42:54] {728} INFO - iteration 7 current learner lgbm\n", - "[flaml.automl: 12-15 07:42:55] {793} INFO - at 11.8s,\tbest lgbm's error=0.3652,\tbest lgbm's error=0.3652\n", - "[flaml.automl: 12-15 07:42:55] {728} INFO - iteration 8 current learner lgbm\n", - "[flaml.automl: 12-15 07:42:57] {793} INFO - at 14.0s,\tbest lgbm's error=0.3568,\tbest lgbm's error=0.3568\n", - "[flaml.automl: 12-15 07:42:57] {728} INFO - iteration 9 current learner lgbm\n", - "[flaml.automl: 12-15 07:43:02] {793} INFO - at 18.1s,\tbest lgbm's error=0.3547,\tbest lgbm's error=0.3547\n", - "[flaml.automl: 12-15 07:43:02] {728} INFO - iteration 10 current learner lgbm\n", - "[flaml.automl: 12-15 07:43:07] {793} INFO - at 23.2s,\tbest lgbm's error=0.3522,\tbest lgbm's error=0.3522\n", - "[flaml.automl: 12-15 07:43:07] {728} INFO - iteration 11 current learner xgboost\n", - "[flaml.automl: 12-15 07:43:07] {793} INFO - at 23.9s,\tbest xgboost's error=0.3764,\tbest lgbm's error=0.3522\n", - "[flaml.automl: 12-15 07:43:07] {728} INFO - iteration 12 current learner xgboost\n", - "[flaml.automl: 12-15 07:43:08] {793} INFO - at 24.7s,\tbest xgboost's error=0.3671,\tbest lgbm's error=0.3522\n", - "[flaml.automl: 12-15 07:43:08] {728} INFO - iteration 13 current learner xgboost\n", - "[flaml.automl: 12-15 07:43:09] {793} INFO - at 26.0s,\tbest xgboost's error=0.3671,\tbest lgbm's error=0.3522\n", - "[flaml.automl: 12-15 07:43:09] {728} INFO - iteration 14 current learner lgbm\n", - "[flaml.automl: 12-15 07:43:18] {793} INFO - at 34.7s,\tbest lgbm's error=0.3522,\tbest lgbm's error=0.3522\n", - "[flaml.automl: 12-15 07:43:18] {728} INFO - iteration 15 current learner rf\n", - "[flaml.automl: 12-15 07:43:19] {793} INFO - at 35.3s,\tbest rf's error=0.4323,\tbest lgbm's 
error=0.3522\n", - "[flaml.automl: 12-15 07:43:19] {728} INFO - iteration 16 current learner rf\n", - "[flaml.automl: 12-15 07:43:19] {793} INFO - at 36.0s,\tbest rf's error=0.4033,\tbest lgbm's error=0.3522\n", - "[flaml.automl: 12-15 07:43:19] {728} INFO - iteration 17 current learner RGF\n", - "[flaml.automl: 12-15 07:43:28] {793} INFO - at 44.7s,\tbest RGF's error=0.3764,\tbest lgbm's error=0.3522\n", - "[flaml.automl: 12-15 07:43:28] {728} INFO - iteration 18 current learner xgboost\n", - "[flaml.automl: 12-15 07:43:29] {793} INFO - at 45.4s,\tbest xgboost's error=0.3602,\tbest lgbm's error=0.3522\n", - "[flaml.automl: 12-15 07:43:29] {728} INFO - iteration 19 current learner xgboost\n", - "[flaml.automl: 12-15 07:43:31] {793} INFO - at 47.3s,\tbest xgboost's error=0.3544,\tbest lgbm's error=0.3522\n", - "[flaml.automl: 12-15 07:43:31] {728} INFO - iteration 20 current learner xgboost\n", - "[flaml.automl: 12-15 07:43:32] {793} INFO - at 48.9s,\tbest xgboost's error=0.3525,\tbest lgbm's error=0.3522\n", - "[flaml.automl: 12-15 07:43:32] {728} INFO - iteration 21 current learner xgboost\n", - "[flaml.automl: 12-15 07:43:37] {793} INFO - at 53.5s,\tbest xgboost's error=0.3525,\tbest lgbm's error=0.3522\n", - "[flaml.automl: 12-15 07:43:37] {728} INFO - iteration 22 current learner lgbm\n", - "[flaml.automl: 12-15 07:43:42] {793} INFO - at 59.0s,\tbest lgbm's error=0.3522,\tbest lgbm's error=0.3522\n", - "[flaml.automl: 12-15 07:43:42] {728} INFO - iteration 23 current learner xgboost\n", - "[flaml.automl: 12-15 07:43:43] {793} INFO - at 59.9s,\tbest xgboost's error=0.3525,\tbest lgbm's error=0.3522\n", - "[flaml.automl: 12-15 07:43:43] {728} INFO - iteration 24 current learner rf\n", - "[flaml.automl: 12-15 07:43:43] {793} INFO - at 59.9s,\tbest rf's error=0.4033,\tbest lgbm's error=0.3522\n", - "[flaml.automl: 12-15 07:43:43] {728} INFO - iteration 25 current learner RGF\n", - "[flaml.automl: 12-15 07:43:47] {793} INFO - at 63.9s,\tbest RGF's 
error=0.3764,\tbest lgbm's error=0.3522\n", - "[flaml.automl: 12-15 07:43:47] {814} INFO - LGBMClassifier(colsample_bytree=0.7, learning_rate=0.06177098582210786,\n", - " max_bin=127, min_child_weight=5.058775453728698, n_estimators=80,\n", - " num_leaves=17, objective='binary',\n", - " reg_alpha=3.690867311882246e-10, reg_lambda=1.0,\n", - " subsample=0.7382230019481447)\n", - "[flaml.automl: 12-15 07:43:47] {702} INFO - fit succeeded\n" + "[flaml.automl: 01-31 05:28:44] {816} INFO - Evaluation method: holdout\n", + "[flaml.automl: 01-31 05:28:45] {541} INFO - Using StratifiedKFold\n", + "[flaml.automl: 01-31 05:28:45] {837} INFO - Minimizing error metric: 1-accuracy\n", + "[flaml.automl: 01-31 05:28:45] {857} INFO - List of ML learners in AutoML Run: ['RGF', 'lgbm', 'rf', 'xgboost']\n", + "[flaml.automl: 01-31 05:28:45] {916} INFO - iteration 0 current learner RGF\n", + "[flaml.automl: 01-31 05:28:46] {1046} INFO - at 1.2s,\tbest RGF's error=0.3787,\tbest RGF's error=0.3787\n", + "[flaml.automl: 01-31 05:28:46] {916} INFO - iteration 1 current learner RGF\n", + "[flaml.automl: 01-31 05:28:46] {1046} INFO - at 2.1s,\tbest RGF's error=0.3787,\tbest RGF's error=0.3787\n", + "[flaml.automl: 01-31 05:28:46] {916} INFO - iteration 2 current learner lgbm\n", + "[flaml.automl: 01-31 05:28:47] {1046} INFO - at 2.2s,\tbest lgbm's error=0.3777,\tbest lgbm's error=0.3777\n", + "[flaml.automl: 01-31 05:28:47] {916} INFO - iteration 3 current learner lgbm\n", + "[flaml.automl: 01-31 05:28:47] {1046} INFO - at 2.3s,\tbest lgbm's error=0.3777,\tbest lgbm's error=0.3777\n", + "[flaml.automl: 01-31 05:28:47] {916} INFO - iteration 4 current learner RGF\n", + "[flaml.automl: 01-31 05:28:47] {1046} INFO - at 3.1s,\tbest RGF's error=0.3787,\tbest lgbm's error=0.3777\n", + "[flaml.automl: 01-31 05:28:47] {916} INFO - iteration 5 current learner lgbm\n", + "[flaml.automl: 01-31 05:28:47] {1046} INFO - at 3.1s,\tbest lgbm's error=0.3669,\tbest lgbm's error=0.3669\n", + "[flaml.automl: 
01-31 05:28:47] {916} INFO - iteration 6 current learner lgbm\n", + "[flaml.automl: 01-31 05:28:48] {1046} INFO - at 3.2s,\tbest lgbm's error=0.3669,\tbest lgbm's error=0.3669\n", + "[flaml.automl: 01-31 05:28:48] {916} INFO - iteration 7 current learner lgbm\n", + "[flaml.automl: 01-31 05:28:48] {1046} INFO - at 3.3s,\tbest lgbm's error=0.3662,\tbest lgbm's error=0.3662\n", + "[flaml.automl: 01-31 05:28:48] {916} INFO - iteration 8 current learner lgbm\n", + "[flaml.automl: 01-31 05:28:48] {1046} INFO - at 3.3s,\tbest lgbm's error=0.3636,\tbest lgbm's error=0.3636\n", + "[flaml.automl: 01-31 05:28:48] {916} INFO - iteration 9 current learner lgbm\n", + "[flaml.automl: 01-31 05:28:48] {1046} INFO - at 3.4s,\tbest lgbm's error=0.3621,\tbest lgbm's error=0.3621\n", + "[flaml.automl: 01-31 05:28:48] {916} INFO - iteration 10 current learner lgbm\n", + "[flaml.automl: 01-31 05:28:48] {1046} INFO - at 3.5s,\tbest lgbm's error=0.3621,\tbest lgbm's error=0.3621\n", + "[flaml.automl: 01-31 05:28:48] {916} INFO - iteration 11 current learner lgbm\n", + "[flaml.automl: 01-31 05:28:48] {1046} INFO - at 3.5s,\tbest lgbm's error=0.3621,\tbest lgbm's error=0.3621\n", + "[flaml.automl: 01-31 05:28:48] {916} INFO - iteration 12 current learner lgbm\n", + "[flaml.automl: 01-31 05:28:48] {1046} INFO - at 3.6s,\tbest lgbm's error=0.3621,\tbest lgbm's error=0.3621\n", + "[flaml.automl: 01-31 05:28:48] {916} INFO - iteration 13 current learner lgbm\n", + "[flaml.automl: 01-31 05:28:48] {1046} INFO - at 3.7s,\tbest lgbm's error=0.3621,\tbest lgbm's error=0.3621\n", + "[flaml.automl: 01-31 05:28:48] {916} INFO - iteration 14 current learner lgbm\n", + "[flaml.automl: 01-31 05:28:48] {1046} INFO - at 3.7s,\tbest lgbm's error=0.3621,\tbest lgbm's error=0.3621\n", + "[flaml.automl: 01-31 05:28:48] {916} INFO - iteration 15 current learner lgbm\n", + "[flaml.automl: 01-31 05:28:48] {1046} INFO - at 3.9s,\tbest lgbm's error=0.3621,\tbest lgbm's error=0.3621\n", + "[flaml.automl: 01-31 
05:28:48] {916} INFO - iteration 16 current learner RGF\n", + "[flaml.automl: 01-31 05:28:49] {1046} INFO - at 4.8s,\tbest RGF's error=0.3719,\tbest lgbm's error=0.3621\n", + "[flaml.automl: 01-31 05:28:49] {916} INFO - iteration 17 current learner lgbm\n", + "[flaml.automl: 01-31 05:28:50] {1046} INFO - at 5.9s,\tbest lgbm's error=0.3621,\tbest lgbm's error=0.3621\n", + "[flaml.automl: 01-31 05:28:50] {916} INFO - iteration 18 current learner xgboost\n", + "[flaml.automl: 01-31 05:28:50] {1046} INFO - at 5.9s,\tbest xgboost's error=0.3768,\tbest lgbm's error=0.3621\n", + "[flaml.automl: 01-31 05:28:50] {916} INFO - iteration 19 current learner xgboost\n", + "[flaml.automl: 01-31 05:28:50] {1046} INFO - at 6.0s,\tbest xgboost's error=0.3768,\tbest lgbm's error=0.3621\n", + "[flaml.automl: 01-31 05:28:50] {916} INFO - iteration 20 current learner xgboost\n", + "[flaml.automl: 01-31 05:28:50] {1046} INFO - at 6.0s,\tbest xgboost's error=0.3768,\tbest lgbm's error=0.3621\n", + "[flaml.automl: 01-31 05:28:50] {916} INFO - iteration 21 current learner xgboost\n", + "[flaml.automl: 01-31 05:28:50] {1046} INFO - at 6.1s,\tbest xgboost's error=0.3768,\tbest lgbm's error=0.3621\n", + "[flaml.automl: 01-31 05:28:50] {916} INFO - iteration 22 current learner xgboost\n", + "[flaml.automl: 01-31 05:28:51] {1046} INFO - at 6.2s,\tbest xgboost's error=0.3621,\tbest lgbm's error=0.3621\n", + "[flaml.automl: 01-31 05:28:51] {916} INFO - iteration 23 current learner xgboost\n", + "[flaml.automl: 01-31 05:28:51] {1046} INFO - at 6.3s,\tbest xgboost's error=0.3621,\tbest lgbm's error=0.3621\n", + "[flaml.automl: 01-31 05:28:51] {916} INFO - iteration 24 current learner xgboost\n", + "[flaml.automl: 01-31 05:28:51] {1046} INFO - at 6.4s,\tbest xgboost's error=0.3621,\tbest lgbm's error=0.3621\n", + "[flaml.automl: 01-31 05:28:51] {916} INFO - iteration 25 current learner xgboost\n", + "[flaml.automl: 01-31 05:28:51] {1046} INFO - at 6.5s,\tbest xgboost's error=0.3621,\tbest lgbm's 
error=0.3621\n", + "[flaml.automl: 01-31 05:28:51] {916} INFO - iteration 26 current learner xgboost\n", + "[flaml.automl: 01-31 05:28:51] {1046} INFO - at 6.6s,\tbest xgboost's error=0.3621,\tbest lgbm's error=0.3621\n", + "[flaml.automl: 01-31 05:28:51] {916} INFO - iteration 27 current learner xgboost\n", + "[flaml.automl: 01-31 05:28:51] {1046} INFO - at 6.8s,\tbest xgboost's error=0.3611,\tbest xgboost's error=0.3611\n", + "[flaml.automl: 01-31 05:28:51] {916} INFO - iteration 28 current learner xgboost\n", + "[flaml.automl: 01-31 05:28:51] {1046} INFO - at 7.1s,\tbest xgboost's error=0.3611,\tbest xgboost's error=0.3611\n", + "[flaml.automl: 01-31 05:28:51] {916} INFO - iteration 29 current learner lgbm\n", + "[flaml.automl: 01-31 05:28:52] {1046} INFO - at 7.9s,\tbest lgbm's error=0.3618,\tbest xgboost's error=0.3611\n", + "[flaml.automl: 01-31 05:28:52] {916} INFO - iteration 30 current learner RGF\n", + "[flaml.automl: 01-31 05:28:53] {1046} INFO - at 8.9s,\tbest RGF's error=0.3719,\tbest xgboost's error=0.3611\n", + "[flaml.automl: 01-31 05:28:53] {916} INFO - iteration 31 current learner xgboost\n", + "[flaml.automl: 01-31 05:28:54] {1046} INFO - at 9.3s,\tbest xgboost's error=0.3611,\tbest xgboost's error=0.3611\n", + "[flaml.automl: 01-31 05:28:54] {916} INFO - iteration 32 current learner xgboost\n", + "[flaml.automl: 01-31 05:28:54] {1046} INFO - at 10.0s,\tbest xgboost's error=0.3523,\tbest xgboost's error=0.3523\n", + "[flaml.automl: 01-31 05:28:54] {916} INFO - iteration 33 current learner xgboost\n", + "[flaml.automl: 01-31 05:28:55] {1046} INFO - at 10.6s,\tbest xgboost's error=0.3523,\tbest xgboost's error=0.3523\n", + "[flaml.automl: 01-31 05:28:55] {916} INFO - iteration 34 current learner xgboost\n", + "[flaml.automl: 01-31 05:28:56] {1046} INFO - at 11.5s,\tbest xgboost's error=0.3523,\tbest xgboost's error=0.3523\n", + "[flaml.automl: 01-31 05:28:56] {916} INFO - iteration 35 current learner xgboost\n", + "[flaml.automl: 01-31 05:28:58] 
{1046} INFO - at 13.2s,\tbest xgboost's error=0.3503,\tbest xgboost's error=0.3503\n", + "[flaml.automl: 01-31 05:28:58] {916} INFO - iteration 36 current learner rf\n", + "[flaml.automl: 01-31 05:28:58] {1046} INFO - at 13.8s,\tbest rf's error=0.4023,\tbest xgboost's error=0.3503\n", + "[flaml.automl: 01-31 05:28:58] {916} INFO - iteration 37 current learner rf\n", + "[flaml.automl: 01-31 05:28:59] {1046} INFO - at 14.2s,\tbest rf's error=0.4011,\tbest xgboost's error=0.3503\n", + "[flaml.automl: 01-31 05:28:59] {916} INFO - iteration 38 current learner xgboost\n", + "[flaml.automl: 01-31 05:28:59] {1046} INFO - at 15.0s,\tbest xgboost's error=0.3503,\tbest xgboost's error=0.3503\n", + "[flaml.automl: 01-31 05:28:59] {916} INFO - iteration 39 current learner xgboost\n", + "[flaml.automl: 01-31 05:29:03] {1046} INFO - at 18.2s,\tbest xgboost's error=0.3503,\tbest xgboost's error=0.3503\n", + "[flaml.automl: 01-31 05:29:03] {916} INFO - iteration 40 current learner xgboost\n", + "[flaml.automl: 01-31 05:29:06] {1046} INFO - at 21.2s,\tbest xgboost's error=0.3503,\tbest xgboost's error=0.3503\n", + "[flaml.automl: 01-31 05:29:06] {916} INFO - iteration 41 current learner xgboost\n", + "[flaml.automl: 01-31 05:29:08] {1046} INFO - at 23.4s,\tbest xgboost's error=0.3503,\tbest xgboost's error=0.3503\n", + "[flaml.automl: 01-31 05:29:08] {916} INFO - iteration 42 current learner RGF\n", + "[flaml.automl: 01-31 05:29:09] {1046} INFO - at 24.5s,\tbest RGF's error=0.3719,\tbest xgboost's error=0.3503\n", + "[flaml.automl: 01-31 05:29:09] {916} INFO - iteration 43 current learner xgboost\n", + "[flaml.automl: 01-31 05:29:18] {1046} INFO - at 33.6s,\tbest xgboost's error=0.3408,\tbest xgboost's error=0.3408\n", + "[flaml.automl: 01-31 05:29:18] {916} INFO - iteration 44 current learner xgboost\n", + "[flaml.automl: 01-31 05:29:32] {1046} INFO - at 47.3s,\tbest xgboost's error=0.3345,\tbest xgboost's error=0.3345\n", + "[flaml.automl: 01-31 05:29:32] {916} INFO - iteration 45 
current learner rf\n", + "[flaml.automl: 01-31 05:29:32] {1046} INFO - at 47.7s,\tbest rf's error=0.4011,\tbest xgboost's error=0.3345\n", + "[flaml.automl: 01-31 05:29:32] {916} INFO - iteration 46 current learner RGF\n", + "[flaml.automl: 01-31 05:29:32] {1048} INFO - no enough budget for learner RGF\n", + "[flaml.automl: 01-31 05:29:32] {916} INFO - iteration 47 current learner rf\n", + "[flaml.automl: 01-31 05:29:32] {1048} INFO - no enough budget for learner rf\n", + "[flaml.automl: 01-31 05:29:32] {916} INFO - iteration 48 current learner lgbm\n", + "[flaml.automl: 01-31 05:29:32] {1048} INFO - no enough budget for learner lgbm\n", + "[flaml.automl: 01-31 05:29:32] {1086} INFO - selected model: XGBClassifier(base_score=0.5, booster='gbtree',\n", + " colsample_bylevel=0.9421222097860765, colsample_bynode=1,\n", + " colsample_bytree=0.9986336418953021, gamma=0, gpu_id=-1,\n", + " grow_policy='lossguide', importance_type='gain',\n", + " interaction_constraints=None, learning_rate=0.16476442995703428,\n", + " max_delta_step=0, max_depth=0, max_leaves=85,\n", + " min_child_weight=2.8366848012228014, missing=nan,\n", + " monotone_constraints=None, n_estimators=84, n_jobs=-1,\n", + " num_parallel_tree=1, random_state=0,\n", + " reg_alpha=5.566263839755687e-07, reg_lambda=0.6128658162970646,\n", + " scale_pos_weight=1, subsample=0.978338719375802,\n", + " tree_method='hist', validate_parameters=False, verbosity=0)\n", + "[flaml.automl: 01-31 05:29:32] {871} INFO - fit succeeded\n" ] } ], @@ -611,10 +718,10 @@ "metadata": { "kernelspec": { "name": "python3", - "display_name": "Python 3.7.9 64-bit ('test': conda)", + "display_name": "Python 3.7.7 64-bit ('flaml': conda)", "metadata": { "interpreter": { - "hash": "d432c3c2bcf16c697a4c55907b7ae9cb502fbbf6a7955e813637a3b18956f9d0" + "hash": "bfcd9a6a9254a5e160761a1fd7a9e444f011592c6770d9f4180dde058a9df5dd" } } }, @@ -628,7 +735,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - 
"version": "3.7.9-final" + "version": "3.7.7-final" } }, "nbformat": 4, diff --git a/setup.py b/setup.py index b036ee9b8..10bc2d478 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,8 @@ install_requires = [ "xgboost>=0.90", "scipy>=1.4.1", "catboost>=0.23", - "scikit-learn>=0.23", + "scikit-learn>=0.23.2", + "optuna==2.3.0" ], @@ -32,7 +33,7 @@ setuptools.setup( long_description=long_description, long_description_content_type="text/markdown", url="https://github.com/microsoft/FLAML", - packages=["flaml"], + packages=setuptools.find_packages(), install_requires=install_requires, extras_require={ "notebook": [ @@ -45,7 +46,17 @@ setuptools.setup( "flake8>=3.8.4", "pytest>=6.1.1", "coverage>=5.3", + "xgboost<1.3", "rgf-python", + # "hpbandster", + # "torchvision" + ], + "ray": [ + "ray[tune]==1.1.0", + "pyyaml<5.3.1", + ], + "azureml": [ + "azureml-mlflow" ], }, classifiers=[ diff --git a/test/test_automl.py b/test/test_automl.py index fa68b38f2..d502cb056 100644 --- a/test/test_automl.py +++ b/test/test_automl.py @@ -7,58 +7,74 @@ from sklearn.datasets import load_boston, load_iris, load_wine from flaml import AutoML from flaml.data import get_output_from_log -from flaml.model import BaseEstimator -from flaml.space import ConfigSearchInfo +from flaml.model import SKLearnEstimator from rgf.sklearn import RGFClassifier, RGFRegressor +from flaml import tune -class MyRegularizedGreedyForest(BaseEstimator): +class MyRegularizedGreedyForest(SKLearnEstimator): - # search space - params_configsearch_info = { - 'max_leaf': ConfigSearchInfo(name = 'max_leaf', - type = int, lower = 4, init = 4, upper = 10000), - 'n_iter': ConfigSearchInfo(name = 'n_iter', type = int, lower = 1, - init = 1, upper = 32768), - 'n_tree_search': ConfigSearchInfo(name = 'n_tree_search', type = int, - lower = 1, init = 1, upper = 32768), - 'opt_interval': ConfigSearchInfo(name = 'opt_interval', type = int, - lower = 1, init = 100, upper = 10000), - 'learning_rate': ConfigSearchInfo(name = 
'learning_rate', type = float, - lower = 0.01, init = 1.0, upper = 20.0), - 'min_samples_leaf': ConfigSearchInfo(name = 'min_samples_leaf', - type = int, lower = 1, init = 20, upper = 20) - } - - def __init__(self, objective_name = 'binary:logistic', n_jobs = 1, - max_leaf = 1000, n_iter = 1, n_tree_search = 1, opt_interval = 1, - learning_rate = 1.0, min_samples_leaf = 1): - self.objective_name = objective_name + def __init__(self, task = 'binary:logistic', n_jobs = 1, max_leaf = 4, + n_iter = 1, n_tree_search = 1, opt_interval = 1, learning_rate = 1.0, + min_samples_leaf = 1, **params): - if 'regression' in objective_name: + super().__init__(task, **params) + + if 'regression' in task: self.estimator_class = RGFRegressor else: self.estimator_class = RGFClassifier # round integer hyperparameters self.params = { + "n_jobs": n_jobs, 'max_leaf': int(round(max_leaf)), 'n_iter': int(round(n_iter)), 'n_tree_search': int(round(n_tree_search)), 'opt_interval': int(round(opt_interval)), 'learning_rate': learning_rate, - 'min_samples_leaf':int(round(min_samples_leaf)), - "n_jobs": n_jobs, - } + 'min_samples_leaf':int(round(min_samples_leaf)) + } + + @classmethod + def search_space(cls, data_size, task): + space = { + 'max_leaf': {'domain': tune.qloguniform( + lower = 4, upper = data_size, q = 1), 'init_value': 4}, + 'n_iter': {'domain': tune.qloguniform( + lower = 1, upper = data_size, q = 1), 'init_value': 1}, + 'n_tree_search': {'domain': tune.qloguniform( + lower = 1, upper = 32768, q = 1), 'init_value': 1}, + 'opt_interval': {'domain': tune.qloguniform( + lower = 1, upper = 10000, q = 1), 'init_value': 100}, + 'learning_rate': {'domain': tune.loguniform( + lower = 0.01, upper = 20.0)}, + 'min_samples_leaf': {'domain': tune.qloguniform( + lower = 1, upper = 20, q = 1), 'init_value': 20}, + } + return space + + @classmethod + def size(cls, config): + max_leaves = int(round(config['max_leaf'])) + n_estimators = int(round(config['n_iter'])) + return (max_leaves*3 + 
(max_leaves-1)*4 + 1.0)*n_estimators*8 + + @classmethod + def cost_relative2lgbm(cls): + return 1.0 -def custom_metric(X_test, y_test, estimator, labels, X_train, y_train): +def custom_metric(X_test, y_test, estimator, labels, X_train, y_train, + weight_test=None, weight_train=None): from sklearn.metrics import log_loss y_pred = estimator.predict_proba(X_test) - test_loss = log_loss(y_test, y_pred, labels=labels) + test_loss = log_loss(y_test, y_pred, labels=labels, + sample_weight=weight_test) y_pred = estimator.predict_proba(X_train) - train_loss = log_loss(y_train, y_pred, labels=labels) + train_loss = log_loss(y_train, y_pred, labels=labels, + sample_weight=weight_train) alpha = 0.5 return test_loss * (1 + alpha) - alpha * train_loss, [test_loss, train_loss] @@ -77,6 +93,27 @@ class TestAutoML(unittest.TestCase): "sample": True, # whether to subsample training data "log_file_name": "test/wine.log", "log_training_metric": True, # whether to log training metric + "n_jobs": 1, + } + + '''The main flaml automl API''' + automl.fit(X_train = X_train, y_train = y_train, **settings) + + def test_ensemble(self): + automl = AutoML() + automl.add_learner(learner_name = 'RGF', + learner_class = MyRegularizedGreedyForest) + X_train, y_train = load_wine(return_X_y=True) + settings = { + "time_budget": 10, # total running time in seconds + # "estimator_list": ['lgbm', 'xgboost'], + "estimator_list": ['RGF', 'lgbm', 'rf', 'xgboost'], + "task": 'classification', # task type + "sample": True, # whether to subsample training data + "log_file_name": "test/wine.log", + "log_training_metric": True, # whether to log training metric + "ensemble": True, + "n_jobs": 1, } '''The main flaml automl API''' @@ -87,6 +124,7 @@ class TestAutoML(unittest.TestCase): def test_custom_metric(self): + X_train, y_train = load_iris(return_X_y=True) automl_experiment = AutoML() automl_settings = { "time_budget": 10, @@ -96,9 +134,10 @@ class TestAutoML(unittest.TestCase): "log_file_name": 
"test/iris_custom.log", "log_training_metric": True, 'log_type': 'all', - "model_history": True + "n_jobs": 1, + "model_history": True, + "sample_weight": np.ones(len(y_train)), } - X_train, y_train = load_iris(return_X_y=True) automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) print(automl_experiment.classes_) @@ -111,7 +150,7 @@ class TestAutoML(unittest.TestCase): automl_experiment = AutoML() estimator = automl_experiment.get_estimator_from_log( automl_settings["log_file_name"], record_id=0, - objective='multi') + task='multi') print(estimator) time_history, best_valid_loss_history, valid_loss_history, \ config_history, train_loss_history = get_output_from_log( @@ -127,6 +166,7 @@ class TestAutoML(unittest.TestCase): "task": 'classification', "log_file_name": "test/iris.log", "log_training_metric": True, + "n_jobs": 1, "model_history": True } X_train, y_train = load_iris(return_X_y=True, as_frame=as_frame) @@ -160,6 +200,7 @@ class TestAutoML(unittest.TestCase): "task": 'regression', "log_file_name": "test/boston.log", "log_training_metric": True, + "n_jobs": 1, "model_history": True } X_train, y_train = load_boston(return_X_y=True) @@ -167,7 +208,7 @@ class TestAutoML(unittest.TestCase): automl_experiment.fit(X_train=X_train[:n], y_train=y_train[:n], X_val=X_train[n:], y_val=y_train[n:], **automl_settings) - assert automl_experiment.eval_method == 'holdout' + assert automl_experiment._state.eval_method == 'holdout' print(automl_experiment.predict(X_train)) print(automl_experiment.model) print(automl_experiment.config_history) @@ -185,6 +226,7 @@ class TestAutoML(unittest.TestCase): "task": 'classification', "log_file_name": "test/sparse_classification.log", "split_type": "uniform", + "n_jobs": 1, "model_history": True } X_train = scipy.sparse.random(1554, 21, dtype=int) @@ -207,6 +249,7 @@ class TestAutoML(unittest.TestCase): "metric": 'mae', "task": 'regression', "log_file_name": "test/sparse_regression.log", + "n_jobs": 1, 
"model_history": True } X_train = scipy.sparse.random(300, 900, density=0.0001) @@ -216,7 +259,7 @@ class TestAutoML(unittest.TestCase): automl_experiment.fit(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings) - assert automl_experiment.X_val.shape == X_val.shape + assert automl_experiment._state.X_val.shape == X_val.shape print(automl_experiment.predict(X_train)) print(automl_experiment.model) print(automl_experiment.config_history) @@ -237,6 +280,7 @@ class TestAutoML(unittest.TestCase): "log_file_name": "test/sparse_classification.log", "estimator_list": ["xgboost"], "log_type": "all", + "n_jobs": 1, } X_train = scipy.sparse.eye(900000) y_train = np.random.randint(2, size=900000) @@ -259,6 +303,7 @@ class TestAutoML(unittest.TestCase): "log_file_name": "test/sparse_classification.log", "estimator_list": ["lrl1", "lrl2"], "log_type": "all", + "n_jobs": 1, } X_train = scipy.sparse.random(3000, 900, density=0.1) y_train = np.random.randint(2, size=3000) @@ -279,6 +324,7 @@ class TestAutoML(unittest.TestCase): 'eval_method': 'cv', "task": 'regression', "log_file_name": "test/sparse_regression.log", + "n_jobs": 1, "model_history": True } X_train = scipy.sparse.random(100, 100) diff --git a/test/test_python_log.py b/test/test_python_log.py index d1cb2d347..30a1b6d54 100644 --- a/test/test_python_log.py +++ b/test/test_python_log.py @@ -28,11 +28,12 @@ class TestLogging(unittest.TestCase): # Run a simple job. 
automl_experiment = AutoML() automl_settings = { - "time_budget": 2, + "time_budget": 1, "metric": 'mse', "task": 'regression', "log_file_name": training_log, "log_training_metric": True, + "n_jobs": 1, "model_history": True } X_train, y_train = load_boston(return_X_y=True) diff --git a/test/test_pytorch_cifar10.py b/test/test_pytorch_cifar10.py new file mode 100644 index 000000000..a7460594e --- /dev/null +++ b/test/test_pytorch_cifar10.py @@ -0,0 +1,351 @@ +import unittest +import os +import time + +import logging +logger = logging.getLogger(__name__) +logger.addHandler(logging.FileHandler('test/tune_pytorch_cifar10.log')) + + +# __load_data_begin__ +def load_data(data_dir="./data"): + transform = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) + ]) + + trainset = torchvision.datasets.CIFAR10( + root=data_dir, train=True, download=True, transform=transform) + + testset = torchvision.datasets.CIFAR10( + root=data_dir, train=False, download=True, transform=transform) + + return trainset, testset +# __load_data_end__ + + +import numpy as np +try: + import torch + import torch.nn as nn + import torch.nn.functional as F + import torch.optim as optim + from torch.utils.data import random_split + import torchvision + import torchvision.transforms as transforms + + + # __net_begin__ + class Net(nn.Module): + def __init__(self, l1=120, l2=84): + super(Net, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, l1) + self.fc2 = nn.Linear(l1, l2) + self.fc3 = nn.Linear(l2, 10) + + def forward(self, x): + x = self.pool(F.relu(self.conv1(x))) + x = self.pool(F.relu(self.conv2(x))) + x = x.view(-1, 16 * 5 * 5) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + # __net_end__ +except ImportError: + print("skip test_pytorch because torchvision cannot be imported.") + + +# __load_data_begin__ +def 
load_data(data_dir="test/data"): + transform = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) + ]) + + trainset = torchvision.datasets.CIFAR10( + root=data_dir, train=True, download=True, transform=transform) + + testset = torchvision.datasets.CIFAR10( + root=data_dir, train=False, download=True, transform=transform) + + return trainset, testset +# __load_data_end__ + + +# __train_begin__ +def train_cifar(config, checkpoint_dir=None, data_dir=None): + if not "l1" in config: + logger.warning(config) + net = Net(2 ** config["l1"], 2 ** config["l2"]) + + device = "cpu" + if torch.cuda.is_available(): + device = "cuda:0" + if torch.cuda.device_count() > 1: + net = nn.DataParallel(net) + net.to(device) + + criterion = nn.CrossEntropyLoss() + optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9) + + # The `checkpoint_dir` parameter gets passed by Ray Tune when a checkpoint + # should be restored. + if checkpoint_dir: + checkpoint = os.path.join(checkpoint_dir, "checkpoint") + model_state, optimizer_state = torch.load(checkpoint) + net.load_state_dict(model_state) + optimizer.load_state_dict(optimizer_state) + + trainset, testset = load_data(data_dir) + + test_abs = int(len(trainset) * 0.8) + train_subset, val_subset = random_split( + trainset, [test_abs, len(trainset) - test_abs]) + + trainloader = torch.utils.data.DataLoader( + train_subset, + batch_size=int(2**config["batch_size"]), + shuffle=True, + num_workers=4) + valloader = torch.utils.data.DataLoader( + val_subset, + batch_size=int(2**config["batch_size"]), + shuffle=True, + num_workers=4) + + from ray import tune + + for epoch in range(int(round(config["num_epochs"]))): # loop over the dataset multiple times + running_loss = 0.0 + epoch_steps = 0 + for i, data in enumerate(trainloader, 0): + # get the inputs; data is a list of [inputs, labels] + inputs, labels = data + inputs, labels = inputs.to(device), labels.to(device) + + # zero the 
parameter gradients + optimizer.zero_grad() + + # forward + backward + optimize + outputs = net(inputs) + loss = criterion(outputs, labels) + loss.backward() + optimizer.step() + + # print statistics + running_loss += loss.item() + epoch_steps += 1 + if i % 2000 == 1999: # print every 2000 mini-batches + print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1, + running_loss / epoch_steps)) + running_loss = 0.0 + + # Validation loss + val_loss = 0.0 + val_steps = 0 + total = 0 + correct = 0 + for i, data in enumerate(valloader, 0): + with torch.no_grad(): + inputs, labels = data + inputs, labels = inputs.to(device), labels.to(device) + + outputs = net(inputs) + _, predicted = torch.max(outputs.data, 1) + total += labels.size(0) + correct += (predicted == labels).sum().item() + + loss = criterion(outputs, labels) + val_loss += loss.cpu().numpy() + val_steps += 1 + + # Here we save a checkpoint. It is automatically registered with + # Ray Tune and will potentially be passed as the `checkpoint_dir` + # parameter in future iterations. 
+ with tune.checkpoint_dir(step=epoch) as checkpoint_dir: + path = os.path.join(checkpoint_dir, "checkpoint") + torch.save( + (net.state_dict(), optimizer.state_dict()), path) + + tune.report(loss=(val_loss / val_steps), accuracy=correct / total) + print("Finished Training") +# __train_end__ + + +# __test_acc_begin__ +def _test_accuracy(net, device="cpu"): + trainset, testset = load_data() + + testloader = torch.utils.data.DataLoader( + testset, batch_size=4, shuffle=False, num_workers=2) + + correct = 0 + total = 0 + with torch.no_grad(): + for data in testloader: + images, labels = data + images, labels = images.to(device), labels.to(device) + outputs = net(images) + _, predicted = torch.max(outputs.data, 1) + total += labels.size(0) + correct += (predicted == labels).sum().item() + + return correct / total +# __test_acc_end__ + + +# __main_begin__ +def cifar10_main(method='BlendSearch', num_samples=10, max_num_epochs=100, + gpus_per_trial=2): + data_dir = os.path.abspath("test/data") + load_data(data_dir) # Download data for all trials before starting the run + if method == 'BlendSearch': + from flaml import tune + else: + from ray import tune + if method in ['BlendSearch', 'BOHB', 'Optuna']: + config = { + "l1": tune.randint(2, 8), + "l2": tune.randint(2, 8), + "lr": tune.loguniform(1e-4, 1e-1), + "num_epochs": tune.qloguniform(1, max_num_epochs, q=1), + "batch_size": tune.randint(1, 4)#tune.choice([2, 4, 8, 16]) + } + else: + config = { + "l1": tune.randint(2, 9), + "l2": tune.randint(2, 9), + "lr": tune.loguniform(1e-4, 1e-1), + "num_epochs": tune.qloguniform(1, max_num_epochs+1, q=1), + "batch_size": tune.randint(1, 5)#tune.choice([2, 4, 8, 16]) + } + import ray + time_budget_s = 3600 + start_time = time.time() + if method == 'BlendSearch': + result = tune.run( + ray.tune.with_parameters(train_cifar, data_dir=data_dir), + init_config={ + "l1": 2, + "l2": 2, + "num_epochs": 1, + "batch_size": 4, + }, + metric="loss", + mode="min", + 
max_resource=max_num_epochs, + min_resource=1, + report_intermediate_result=True, + resources_per_trial={"cpu": 2, "gpu": gpus_per_trial}, + config=config, + local_dir='logs/', + num_samples=num_samples, + time_budget_s=time_budget_s, + use_ray=True) + else: + if 'ASHA' == method: + algo = None + elif 'BOHB' == method: + from ray.tune.schedulers import HyperBandForBOHB + from ray.tune.suggest.bohb import TuneBOHB + algo = TuneBOHB() + scheduler = HyperBandForBOHB(max_t=max_num_epochs) + elif 'Optuna' == method: + from ray.tune.suggest.optuna import OptunaSearch + algo = OptunaSearch() + elif 'CFO' == method: + from flaml import CFO + algo = CFO(points_to_evaluate=[{ + "l1": 2, + "l2": 2, + "num_epochs": 1, + "batch_size": 4, + }]) + elif 'Nevergrad' == method: + from ray.tune.suggest.nevergrad import NevergradSearch + import nevergrad as ng + algo = NevergradSearch(optimizer=ng.optimizers.OnePlusOne) + if method != 'BOHB': + from ray.tune.schedulers import ASHAScheduler + scheduler = ASHAScheduler( + max_t=max_num_epochs, + grace_period=1) + result = tune.run( + tune.with_parameters(train_cifar, data_dir=data_dir), + resources_per_trial={"cpu": 2, "gpu": gpus_per_trial}, + config=config, + metric="loss", + mode="min", + num_samples=num_samples, time_budget_s=time_budget_s, + scheduler=scheduler, search_alg=algo + ) + ray.shutdown() + + logger.info(f"method={method}") + logger.info(f"n_samples={num_samples}") + logger.info(f"time={time.time()-start_time}") + best_trial = result.get_best_trial("loss", "min", "all") + logger.info("Best trial config: {}".format(best_trial.config)) + logger.info("Best trial final validation loss: {}".format( + best_trial.metric_analysis["loss"]["min"])) + logger.info("Best trial final validation accuracy: {}".format( + best_trial.metric_analysis["accuracy"]["max"])) + + best_trained_model = Net(2**best_trial.config["l1"], + 2**best_trial.config["l2"]) + device = "cpu" + if torch.cuda.is_available(): + device = "cuda:0" + if 
gpus_per_trial > 1: + best_trained_model = nn.DataParallel(best_trained_model) + best_trained_model.to(device) + + checkpoint_path = os.path.join(best_trial.checkpoint.value, "checkpoint") + + model_state, optimizer_state = torch.load(checkpoint_path) + best_trained_model.load_state_dict(model_state) + + test_acc = _test_accuracy(best_trained_model, device) + logger.info("Best trial test set accuracy: {}".format(test_acc)) +# __main_end__ + + +gpus_per_trial=0#.5 +num_samples=500 + + +def _test_cifar10_bs(): + cifar10_main(num_samples=num_samples, gpus_per_trial=gpus_per_trial) + + +def _test_cifar10_cfo(): + cifar10_main('CFO', + num_samples=num_samples, gpus_per_trial=gpus_per_trial) + + +def _test_cifar10_optuna(): + cifar10_main('Optuna', + num_samples=num_samples, gpus_per_trial=gpus_per_trial) + + +def _test_cifar10_asha(): + cifar10_main('ASHA', + num_samples=num_samples, gpus_per_trial=gpus_per_trial) + + +def _test_cifar10_bohb(): + cifar10_main('BOHB', + num_samples=num_samples, gpus_per_trial=gpus_per_trial) + + +def _test_cifar10_nevergrad(): + cifar10_main('Nevergrad', + num_samples=num_samples, gpus_per_trial=gpus_per_trial) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/test_training_log.py b/test/test_training_log.py index 2b72ecd82..18f897b6a 100644 --- a/test/test_training_log.py +++ b/test/test_training_log.py @@ -23,6 +23,8 @@ class TestTrainingLog(unittest.TestCase): "task": 'regression', "log_file_name": filename, "log_training_metric": True, + "mem_thres": 1024*1024, + "n_jobs": 1, "model_history": True } X_train, y_train = load_boston(return_X_y=True) diff --git a/test/test_tune.py b/test/test_tune.py new file mode 100644 index 000000000..e151b7dae --- /dev/null +++ b/test/test_tune.py @@ -0,0 +1,200 @@ +import unittest +import os +import time +from sklearn.model_selection import train_test_split +import sklearn.metrics +import sklearn.datasets +try: + from ray.tune.integration.xgboost import TuneReportCheckpointCallback 
+except ImportError: + print("skip test_tune because ray tune cannot be imported.") +import xgboost as xgb + +import logging +logger = logging.getLogger(__name__) +logger.addHandler(logging.FileHandler('test/tune_xgboost.log')) + + +def train_breast_cancer(config: dict): + # This is a simple training function to be passed into Tune + # Load dataset + data, labels = sklearn.datasets.load_breast_cancer(return_X_y=True) + # Split into train and test set + train_x, test_x, train_y, test_y = train_test_split( + data, labels, test_size=0.25) + # Build input matrices for XGBoost + train_set = xgb.DMatrix(train_x, label=train_y) + test_set = xgb.DMatrix(test_x, label=test_y) + # HyperOpt returns a tuple + config = config.copy() + config["eval_metric"] = ["logloss", "error"] + config["objective"] = "binary:logistic" + # Train the classifier, using the Tune callback + xgb.train( + config, + train_set, + evals=[(test_set, "eval")], + verbose_eval=False, + callbacks=[TuneReportCheckpointCallback(filename="model.xgb")]) + + +def _test_xgboost(method='BlendSearch'): + try: + import ray + except ImportError: + return + if method == 'BlendSearch': + from flaml import tune + else: + from ray import tune + search_space = { + # You can mix constants with search space objects. 
+ "max_depth": tune.randint(1, 8) if method in [ + "BlendSearch", "BOHB", "Optuna"] else tune.randint(1, 9), + "min_child_weight": tune.choice([1, 2, 3]), + "subsample": tune.uniform(0.5, 1.0), + "eta": tune.loguniform(1e-4, 1e-1) + } + max_iter = 10 + for num_samples in [256]: + time_budget_s = None + for n_cpu in [8]: + start_time = time.time() + ray.init(num_cpus=n_cpu, num_gpus=0) + if method == 'BlendSearch': + analysis = tune.run( + train_breast_cancer, + init_config={ + "max_depth": 1, + "min_child_weight": 3, + }, + cat_hp_cost={ + "min_child_weight": [6, 3, 2], + }, + metric="eval-logloss", + mode="min", + max_resource=max_iter, + min_resource=1, + report_intermediate_result=True, + # You can add "gpu": 0.1 to allocate GPUs + resources_per_trial={"cpu": 1}, + config=search_space, + local_dir='logs/', + num_samples=num_samples*n_cpu, + time_budget_s=time_budget_s, + use_ray=True) + else: + if 'ASHA' == method: + algo = None + elif 'BOHB' == method: + from ray.tune.schedulers import HyperBandForBOHB + from ray.tune.suggest.bohb import TuneBOHB + algo = TuneBOHB(max_concurrent=n_cpu) + scheduler = HyperBandForBOHB(max_t=max_iter) + elif 'Optuna' == method: + from ray.tune.suggest.optuna import OptunaSearch + algo = OptunaSearch() + elif 'CFO' == method: + from flaml import CFO + algo = CFO(points_to_evaluate=[{ + "max_depth": 1, + "min_child_weight": 3, + }], cat_hp_cost={ + "min_child_weight": [6, 3, 2], + }) + elif 'Dragonfly' == method: + from ray.tune.suggest.dragonfly import DragonflySearch + algo = DragonflySearch() + elif 'SkOpt' == method: + from ray.tune.suggest.skopt import SkOptSearch + algo = SkOptSearch() + elif 'Nevergrad' == method: + from ray.tune.suggest.nevergrad import NevergradSearch + import nevergrad as ng + algo = NevergradSearch(optimizer=ng.optimizers.OnePlusOne) + elif 'ZOOpt' == method: + from ray.tune.suggest.zoopt import ZOOptSearch + algo = ZOOptSearch(budget=num_samples*n_cpu) + elif 'Ax' == method: + from ray.tune.suggest.ax 
import AxSearch + algo = AxSearch() + elif 'HyperOpt' == method: + from ray.tune.suggest.hyperopt import HyperOptSearch + algo = HyperOptSearch() + scheduler = None + if method != 'BOHB': + from ray.tune.schedulers import ASHAScheduler + scheduler = ASHAScheduler( + max_t=max_iter, + grace_period=1) + analysis = tune.run( + train_breast_cancer, + metric="eval-logloss", + mode="min", + # You can add "gpu": 0.1 to allocate GPUs + resources_per_trial={"cpu": 1}, + config=search_space, local_dir='logs/', + num_samples=num_samples*n_cpu, time_budget_s=time_budget_s, + scheduler=scheduler, search_alg=algo) + ray.shutdown() + # # Load the best model checkpoint + # best_bst = xgb.Booster() + # best_bst.load_model(os.path.join(analysis.best_checkpoint, + # "model.xgb")) + best_trial = analysis.get_best_trial("eval-logloss","min","all") + accuracy = 1. - best_trial.metric_analysis["eval-error"]["min"] + logloss = best_trial.metric_analysis["eval-logloss"]["min"] + logger.info(f"method={method}") + logger.info(f"n_samples={num_samples*n_cpu}") + logger.info(f"time={time.time()-start_time}") + logger.info(f"Best model eval loss: {logloss:.4f}") + logger.info(f"Best model total accuracy: {accuracy:.4f}") + logger.info(f"Best model parameters: {best_trial.config}") + + +def test_xgboost_bs(): + _test_xgboost() + + +def test_xgboost_cfo(): + _test_xgboost('CFO') + + +def _test_xgboost_dragonfly(): + _test_xgboost('Dragonfly') + + +def _test_xgboost_skopt(): + _test_xgboost('SkOpt') + + +def _test_xgboost_nevergrad(): + _test_xgboost('Nevergrad') + + +def _test_xgboost_zoopt(): + _test_xgboost('ZOOpt') + + +def _test_xgboost_ax(): + _test_xgboost('Ax') + + +def __test_xgboost_hyperopt(): + _test_xgboost('HyperOpt') + + +def _test_xgboost_optuna(): + _test_xgboost('Optuna') + + +def _test_xgboost_asha(): + _test_xgboost('ASHA') + + +def _test_xgboost_bohb(): + _test_xgboost('BOHB') + + +if __name__ == "__main__": + unittest.main() diff --git a/test/test_xgboost2d.py 
b/test/test_xgboost2d.py new file mode 100644 index 000000000..d85a3663d --- /dev/null +++ b/test/test_xgboost2d.py @@ -0,0 +1,69 @@ +import unittest + +from sklearn.datasets import fetch_openml +from sklearn.model_selection import train_test_split +import numpy as np +from flaml.automl import AutoML +from flaml.model import XGBoostSklearnEstimator +from flaml import tune + + +# dataset = "blood-transfusion-service-center" +# dataset = "Australian" +dataset = "credit-g" +# dataset = "phoneme" +# dataset = "kc1" + + +class XGBoost2D(XGBoostSklearnEstimator): + + @classmethod + def search_space(cls, data_size, task): + upper = min(32768,int(data_size)) + return { + 'n_estimators': { + 'domain': tune.qloguniform(lower=4, upper=upper, q=1), + 'init_value': 4, + }, + 'max_leaves': { + 'domain': tune.qloguniform(lower=4, upper=upper, q=1), + 'init_value': 4, + }, + } + + +def test_simple(method=None): + automl = AutoML() + automl.add_learner(learner_name = 'XGBoost2D', + learner_class = XGBoost2D) + + automl_settings = { + "estimator_list": ['XGBoost2D'], + # "metric": 'accuracy', + "task": 'classification', + "log_file_name": f"test/xgboost2d_{dataset}_{method}.log", + # "model_history": True, + # "log_training_metric": True, + # "split_type": split_type, + "n_jobs": 1, + "hpo_method": method, + "log_type": "all", + "time_budget": 3#6000, + } + + X, y = fetch_openml(name=dataset, return_X_y=True) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, + random_state=42) + automl.fit(X_train=X_train, y_train=y_train, **automl_settings) + + +def _test_optuna(): + test_simple(method="optuna") + + +def test_grid(): + test_simple(method="grid") + + +if __name__ == "__main__": + unittest.main()