mirror of
https://github.com/microsoft/autogen.git
synced 2026-04-20 03:02:16 -04:00
Add ChaCha (#92)
* pickle the AutoML object * get best model per estimator * test deberta * stateless API * pickle the AutoML object * get best model per estimator * test deberta * stateless API * prevent divide by zero * test roberta * BlendSearchTuner * sync * version number * update gitignore * delta time * reindex columns when dropping int-indexed columns * add seed * add seed in Args * merge * init upload of ChaCha * remove redundancy * add back catboost * improve AutoVW API * set min_resource_lease in VWOnlineTrial * docstr * rename * docstr * add docstr * improve API and documentation * fix name * docstr * naming * remove max_resource in scheduler * add TODO in flow2 * remove redundancy in rearcher * add input type * adapt code from ray.tune * move files * naming * documentation * fix import error * fix format issues * remove cb in worse than test * improve _generate_all_comb * remove ray tune * naming * VowpalWabbitTrial * import error * import error * merge test code * scheduler import * fix import * remove * import, minor bug and version * Float or Categorical * fix default * add test_autovw.py * add vowpalwabbit and openml * lint * reorg * lint * indent * add autovw notebook * update notebook * update log msg and autovw notebook * update autovw notebook * update autovw notebook * add available strings for model_select_policy * string for metric * Update vw format in flaml/onlineml/trial.py Co-authored-by: olgavrou <olgavrou@gmail.com> * make init_config optional * add _setup_trial_runner and update notebook * space Co-authored-by: Chi Wang (MSR) <chiw@microsoft.com> Co-authored-by: Chi Wang <wang.chi@microsoft.com> Co-authored-by: Qingyun Wu <qiw@microsoft.com> Co-authored-by: olgavrou <olgavrou@gmail.com>
This commit is contained in:
372
test/test_autovw.py
Normal file
372
test/test_autovw.py
Normal file
@@ -0,0 +1,372 @@
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
import scipy.sparse
|
||||
|
||||
import pandas as pd
|
||||
from sklearn.metrics import mean_squared_error, mean_absolute_error
|
||||
import time
|
||||
import logging
|
||||
from flaml.tune import loguniform, polynomial_expansion_set
|
||||
from vowpalwabbit import pyvw
|
||||
from flaml import AutoVW
|
||||
import string
|
||||
import os
|
||||
import openml
|
||||
# Default directory for the generated ``.vw`` dataset files.
VW_DS_DIR = 'test/data/'
# Single-letter namespace names: 'a'..'z' followed by 'A'..'Z' (52 in total).
NS_LIST = list(string.ascii_letters)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def oml_to_vw_w_grouping(X, y, ds_dir, fname, orginal_dim, group_num,
                         grouping_method='sequential'):
    """Write a dense dataset to a VW-format text file, one namespace per feature group.

    The feature indexes ``0 .. orginal_dim - 1`` are split into at most
    ``group_num`` consecutive groups of equal maximum size.  Every row of ``X``
    becomes one line of the form::

        <label> |<ns> <idx>:<value> ...|<ns> <idx>:<value> ...

    where ``<ns>`` is taken from ``NS_LIST`` in group order and values are
    formatted with six decimals.

    Args:
        X: The feature matrix.  Only ``np.ndarray`` rows are written.
        y: The labels, indexed by row position.
        ds_dir (str): Output directory; created when it does not exist.
        fname (str): Output file name inside ``ds_dir``.
        orginal_dim (int): Number of feature columns to group.
        group_num (int): Maximum number of namespaces (groups).
        grouping_method (str): Only ``'sequential'`` is implemented.

    Raises:
        NotImplementedError: If ``grouping_method`` is not ``'sequential'``, or
            if ``X`` is a ``pd.DataFrame`` or a ``scipy.sparse.csr_matrix``.
    """
    if grouping_method != 'sequential':
        # Fail loudly instead of continuing with an undefined grouping.
        raise NotImplementedError(
            'unsupported grouping_method: {!r}'.format(grouping_method))

    # split all_indexes into # group_num of groups
    max_size_per_group = int(np.ceil(orginal_dim / float(group_num)))
    group_indexes = []  # lists of lists
    for g in range(group_num):
        start = g * max_size_per_group
        stop = min(start + max_size_per_group, orginal_dim)
        if start < stop:
            group_indexes.append(list(range(start, stop)))
    print(group_indexes)

    if not group_indexes:
        return

    # Reject unsupported containers before touching the file system so that a
    # failure does not leave an empty dataset file behind.
    if isinstance(X, pd.DataFrame):
        raise NotImplementedError
    if isinstance(X, scipy.sparse.csr_matrix):
        print('NotImplementedError for sparse data')
        raise NotImplementedError

    if not os.path.exists(ds_dir):
        os.makedirs(ds_dir)
    with open(os.path.join(ds_dir, fname), 'w') as f:
        if isinstance(X, np.ndarray):
            for i in range(len(X)):
                row = X[i]
                ns_content = [
                    ' '.join('{}:{:.6f}'.format(ind, row[ind]) for ind in group)
                    for group in group_indexes
                ]
                ns_line = '{} |{}'.format(str(y[i]), '|'.join(
                    '{} {}'.format(NS_LIST[j], content)
                    for j, content in enumerate(ns_content)))
                f.write(ns_line)
                f.write('\n')
|
||||
|
||||
|
||||
def save_vw_dataset_w_ns(X, y, did, ds_dir, max_ns_num, is_regression):
    """Convert an OpenML dataset to VW examples and save them to a file.

    The file is written to ``ds_dir`` under the name
    ``ds_<did>_<max_ns_num>_0.vw`` via ``oml_to_vw_w_grouping``, using every
    column of ``X`` and at most ``max_ns_num`` namespaces.

    Args:
        X: Feature matrix exposing a two-dimensional ``shape``.
        y: Labels, passed through unchanged.
        did: Dataset id, used only to build the file name.
        ds_dir (str): Output directory.
        max_ns_num (int): Maximum number of namespaces.
        is_regression (bool): Must be true; other tasks are not implemented.

    Raises:
        NotImplementedError: If ``is_regression`` is false.
    """
    print('is_regression', is_regression)
    if not is_regression:
        # Previously a silent no-op; make the unsupported case explicit.
        raise NotImplementedError('only regression datasets are supported')
    fname = 'ds_{}_{}_{}.vw'.format(did, max_ns_num, 0)
    print('dataset size', X.shape[0], X.shape[1])
    print('saving data', did, ds_dir, fname)
    dim = X.shape[1]
    oml_to_vw_w_grouping(X, y, ds_dir, fname, dim, group_num=max_ns_num)
|
||||
|
||||
|
||||
def shuffle_data(X, y, seed):
    """Return ``X`` and ``y`` with their rows permuted identically.

    Args:
        X: Two-dimensional array or SciPy sparse matrix supporting
            ``X[perm, :]`` row indexing.
        y: Array of labels supporting ``y[perm]`` indexing.
        seed: Seed for the ``np.random.RandomState`` that draws the permutation.

    Returns:
        tuple: ``(X_shuf, y_shuf)``.
    """
    # Use the row count from ``shape``: ``len()`` raises TypeError on SciPy
    # sparse matrices, and ``getnnz()`` counts stored values, not rows.
    n = X.shape[0]
    perm = np.random.RandomState(seed=seed).permutation(n)
    X_shuf = X[perm, :]
    y_shuf = y[perm]
    return X_shuf, y_shuf
|
||||
|
||||
|
||||
def get_oml_to_vw(did, max_ns_num, ds_dir=VW_DS_DIR):
    """Download an OpenML dataset and save it as a VW-format file.

    The dataset is fetched with its default target attribute, densified when
    it comes back sparse, and written once through ``save_vw_dataset_w_ns``
    as a regression dataset.

    Args:
        did: OpenML dataset id.
        max_ns_num (int): Maximum number of namespaces in the VW file.
        ds_dir (str): Output directory.

    Returns:
        bool: True when the dataset was converted and saved, False when the
        fetched features are not a NumPy array.
    """
    print('-----getting oml dataset-------', did)
    ds = openml.datasets.get_dataset(did)
    target_attribute = ds.default_target_attribute
    print('target=ds.default_target_attribute', target_attribute)
    data = ds.get_data(target=target_attribute, dataset_format='array')
    # With dataset_format='array' the features are expected to be a NumPy
    # array or a SciPy sparse matrix -- TODO confirm against the openml version.
    X, y = data[0], data[1]
    if scipy.sparse.issparse(X):
        # Call toarray() on the instance so any sparse format is handled.
        X = X.toarray()
        print('is sparse matrix')
    if not isinstance(X, np.ndarray):
        print('---failed to convert/save oml dataset to vw!!!----')
        return False
    # Convert and save exactly once.
    print('-----converting oml to vw and and saving oml dataset-------')
    save_vw_dataset_w_ns(X, y, did, ds_dir, max_ns_num, is_regression=True)
    return True
|
||||
|
||||
|
||||
def load_vw_dataset(did, ds_dir, is_regression, max_ns_num):
    """Load the VW examples of a dataset, generating the file when needed.

    The file ``ds_<did>_<max_ns_num>_0.vw`` is looked up in ``ds_dir``.  When
    it is missing or smaller than 1000 bytes it is (re)generated in the same
    directory through ``get_oml_to_vw``.

    Args:
        did: OpenML dataset id.
        ds_dir (str): Directory holding the ``.vw`` files.
        is_regression (bool): Must be true; other tasks are not implemented.
        max_ns_num (int): Maximum number of namespaces used in the file.

    Returns:
        list: The lines of the file, without line terminators.

    Raises:
        NotImplementedError: If ``is_regression`` is false.
    """
    if not is_regression:
        raise NotImplementedError('only regression datasets are supported')
    # the second field specifies the largest number of namespaces using.
    fname = 'ds_{}_{}_{}.vw'.format(did, max_ns_num, 0)
    vw_dataset_file = os.path.join(ds_dir, fname)
    # if file does not exist, generate and save the datasets
    if not os.path.exists(vw_dataset_file) or os.stat(vw_dataset_file).st_size < 1000:
        # Generate into the directory we read from, not the default one.
        get_oml_to_vw(did, max_ns_num, ds_dir=ds_dir)
    print(ds_dir, vw_dataset_file)
    with open(vw_dataset_file, 'r') as f:
        vw_content = f.read().splitlines()
    print(type(vw_content), len(vw_content))
    return vw_content
|
||||
|
||||
|
||||
def get_data(iter_num=None, dataset_id=None, vw_format=True,
             max_ns_num=10, shuffle=False, use_log=True, dataset_type='regression'):
    """Load an OpenML regression dataset as VW examples together with its labels.

    Args:
        iter_num: Unused.
        dataset_id: OpenML dataset id; converted with ``int()``.
        vw_format (bool): Must be true; no other format is implemented.
        max_ns_num (int): Maximum number of namespaces in the VW file.
        shuffle (bool): Shuffle the examples with the fixed seed 54321.
        use_log (bool): Apply a log transformation to the labels when
            ``max(max_y - min_y, max_y)`` reaches 100.
        dataset_type (str): Unused.

    Returns:
        tuple: ``(vw_examples, Y)`` where ``Y[i]`` is the label of
        ``vw_examples[i]``.  When the log transformation is applied both the
        examples and ``Y`` carry the transformed labels.

    Raises:
        NotImplementedError: If ``vw_format`` is false.
        ValueError: If the dataset file contains no examples.
    """
    import random

    logging.info('generating data')
    LOG_TRANSFORMATION_THRESHOLD = 100
    if not vw_format:
        raise NotImplementedError('only vw_format=True is supported')
    data_id = int(dataset_id)
    # loading oml dataset
    vw_examples = load_vw_dataset(did=data_id, ds_dir=VW_DS_DIR, is_regression=True,
                                  max_ns_num=max_ns_num)
    if not vw_examples:
        raise ValueError('no vw examples loaded for dataset {}'.format(data_id))
    logger.debug('first data %s', vw_examples[0])
    # do data shuffling or log transformation for oml data when needed
    if shuffle:
        random.seed(54321)
        random.shuffle(vw_examples)
    # Parse the labels after shuffling so Y stays aligned with vw_examples.
    Y = [float(e.split('|')[0]) for e in vw_examples]

    # do log transformation
    min_y = min(Y)
    max_y = max(Y)
    if use_log and max((max_y - min_y), max_y) >= LOG_TRANSFORMATION_THRESHOLD:
        log_vw_examples = []
        log_Y = []
        for v in vw_examples:
            org_y = v.split('|')[0]
            y = float(org_y)
            # shift y to ensure all y are positive
            if min_y <= 0:
                y = y + abs(min_y) + 1
            log_y = np.log(y)
            # Replace only the leading label; the same text may also occur at
            # the end of a namespace's feature list.
            log_vw_examples.append(v.replace(org_y + '|', str(log_y) + ' |', 1))
            log_Y.append(float(log_y))
        logger.info('log_vw_examples %s', log_vw_examples[0:2])
        # Same (examples, labels) shape as the untransformed path, so callers
        # can always unpack two values.
        return log_vw_examples, log_Y
    return vw_examples, Y
|
||||
|
||||
|
||||
class VowpalWabbitNamesspaceTuningProblem:
    """Tuning problem over the namespace interactions of a VW dataset.

    The VW examples and labels come from ``get_data``.  The search space holds
    the fixed hyperparameters plus an ``'interactions'`` entry built with
    ``polynomial_expansion_set`` over the namespaces of the first example.
    """

    def __init__(self, max_iter_num, dataset_id, ns_num, **kwargs):
        """Load the data and build the search space.

        Args:
            max_iter_num (int): Upper bound on the number of iterations; the
                stored value is capped at the number of labels.
            dataset_id: OpenML dataset id.
            ns_num (int): Maximum number of namespaces.
            **kwargs: Optional ``use_log`` (default True), ``shuffle``
                (default False), ``vw_format`` (default True) and
                ``fixed_hp_config`` (default ``{}``).  All of them are also
                recorded in the problem info.
        """
        # No trailing comma here: it would wrap the flag in a one-element
        # tuple, which is always truthy, and make use_log=False ineffective.
        use_log = kwargs.get('use_log', True)
        shuffle = kwargs.get('shuffle', False)
        vw_format = kwargs.get('vw_format', True)
        print('dataset_id', dataset_id)
        self.vw_examples, self.Y = get_data(max_iter_num, dataset_id=dataset_id,
                                            vw_format=vw_format, max_ns_num=ns_num,
                                            shuffle=shuffle, use_log=use_log
                                            )
        self.max_iter_num = min(max_iter_num, len(self.Y))
        self._problem_info = {'max_iter_num': self.max_iter_num,
                              'dataset_id': dataset_id,
                              'ns_num': ns_num,
                              }
        self._problem_info.update(kwargs)
        self._fixed_hp_config = kwargs.get('fixed_hp_config', {})
        # Presumably a mapping from namespace to feature dimension; only its
        # keys are used here -- TODO confirm against AutoVW.
        self.namespace_feature_dim = AutoVW.get_ns_feature_dim_from_vw_example(self.vw_examples[0])
        self._raw_namespaces = list(self.namespace_feature_dim.keys())
        self._setup_search()

    def _setup_search(self):
        """Build ``_search_space`` and ``_init_config`` from the fixed config."""
        self._search_space = self._fixed_hp_config.copy()
        self._init_config = self._fixed_hp_config.copy()
        search_space = {'interactions':
                        polynomial_expansion_set(
                            init_monomials=set(self._raw_namespaces),
                            highest_poly_order=len(self._raw_namespaces),
                            allow_self_inter=False),
                        }
        # The initial configuration starts without any interaction.
        init_config = {'interactions': set()}
        self._search_space.update(search_space)
        self._init_config.update(init_config)
        logger.info('search space %s %s %s', self._search_space, self._init_config, self._fixed_hp_config)

    @property
    def init_config(self):
        """The initial configuration dictionary."""
        return self._init_config

    @property
    def search_space(self):
        """The search space dictionary."""
        return self._search_space
|
||||
|
||||
|
||||
class VowpalWabbitNamesspaceLRTuningProblem(VowpalWabbitNamesspaceTuningProblem):
    """Tuning problem over namespace interactions and the learning rate.

    Same as the parent problem, with an additional ``'learning_rate'`` entry
    drawn from ``loguniform(lower=2e-10, upper=1.0)`` and initialised to 0.5.
    """

    def __init__(self, max_iter_num, dataset_id, ns_num, **kwargs):
        """Load the data and build the search space (see the parent class)."""
        # The parent constructor ends with self._setup_search(), which already
        # resolves to the override below; a second call would only rebuild the
        # same dictionaries.
        super().__init__(max_iter_num, dataset_id, ns_num, **kwargs)

    def _setup_search(self):
        """Build ``_search_space`` and ``_init_config`` from the fixed config."""
        self._search_space = self._fixed_hp_config.copy()
        self._init_config = self._fixed_hp_config.copy()
        search_space = {'interactions':
                        polynomial_expansion_set(
                            init_monomials=set(self._raw_namespaces),
                            highest_poly_order=len(self._raw_namespaces),
                            allow_self_inter=False),
                        'learning_rate': loguniform(lower=2e-10, upper=1.0)
                        }
        init_config = {'interactions': set(), 'learning_rate': 0.5}
        self._search_space.update(search_space)
        self._init_config.update(init_config)
        logger.info('search space %s %s %s', self._search_space, self._init_config, self._fixed_hp_config)
|
||||
|
||||
|
||||
def get_y_from_vw_example(vw_example):
    """Return the label of a VW example as a float (regression datasets).

    The label is the text in front of the first ``'|'`` of the example.
    """
    label_text, _, _ = vw_example.partition('|')
    return float(label_text)
|
||||
|
||||
|
||||
def get_loss(y_pred, y_true, loss_func='squared'):
    """Return the loss of a single scalar prediction.

    Args:
        y_pred (float): The predicted value.
        y_true (float): The true value.
        loss_func (str): A name containing ``'squared'`` selects the squared
            error, one containing ``'absolute'`` the absolute error.

    Returns:
        float: The loss of the prediction.

    Raises:
        NotImplementedError: If ``loss_func`` names neither loss.
    """
    # Plain arithmetic: for one sample the mean squared / absolute error is
    # the error of that sample, and this runs once per online iteration.
    if 'squared' in loss_func:
        return float((y_pred - y_true) ** 2)
    if 'absolute' in loss_func:
        return float(abs(y_pred - y_true))
    raise NotImplementedError('unsupported loss_func: {!r}'.format(loss_func))
|
||||
|
||||
|
||||
def online_learning_loop(iter_num, vw_examples, vw_alg, loss_func, method_name=''):
    """Implements the online learning loop.

    Each example is first predicted and then learned from; the loss of the
    prediction is recorded.

    Args:
        iter_num (int): The maximum number of iterations.  At most
            ``len(vw_examples)`` iterations are run.
        vw_examples (list): A list of vw examples
        vw_alg (alg instance): An algorithm instance has the following functions:
            - vw_alg.learn(example)
            - vw_alg.predict(example)
        loss_func (str): loss function
        method_name (str): Unused.
    Outputs:
        loss_list (list): the list of per-iteration losses (not accumulated).
            It is returned for the convenience of visualization.
    """
    print('rerunning exp....', len(vw_examples), iter_num)
    loss_list = []
    # Slicing clamps the loop to the available examples; max() keeps a
    # negative iter_num meaning "no iterations" rather than "all but the last".
    for vw_x in vw_examples[:max(iter_num, 0)]:
        y_true = get_y_from_vw_example(vw_x)
        # predict step
        y_pred = vw_alg.predict(vw_x)
        # learn step
        vw_alg.learn(vw_x)
        # calculate one step loss
        loss_list.append(get_loss(y_pred, y_true, loss_func))
    return loss_list
|
||||
|
||||
|
||||
def get_vw_tuning_problem(tuning_hp='NamesapceInteraction'):
    """Build the tuning problem used by the tests on OpenML dataset 42183.

    Args:
        tuning_hp (str): ``'NamesapceInteraction'`` to tune the namespace
            interactions only, or ``'NamesapceInteraction+LearningRate'`` to
            tune the learning rate as well.

    Returns:
        tuple: ``(vw_oml_problem_args, vw_online_aml_problem)`` -- the keyword
        arguments used to build the problem and the problem instance.

    Raises:
        NotImplementedError: If ``tuning_hp`` is not one of the two supported
            values.
    """
    supported = ('NamesapceInteraction', 'NamesapceInteraction+LearningRate')
    if tuning_hp not in supported:
        # Validate before loading any data.
        raise NotImplementedError('unsupported tuning_hp: {!r}'.format(tuning_hp))

    online_vw_exp_setting = {"max_live_model_num": 5,
                             "fixed_hp_config": {'alg': 'supervised', 'loss_function': 'squared'},
                             "ns_num": 10,
                             "max_iter_num": 10000,
                             }

    # construct openml problem setting based on basic experiment setting
    vw_oml_problem_args = {"max_iter_num": online_vw_exp_setting['max_iter_num'],
                           "dataset_id": '42183',
                           "ns_num": online_vw_exp_setting['ns_num'],
                           "fixed_hp_config": online_vw_exp_setting['fixed_hp_config'],
                           }
    if tuning_hp == 'NamesapceInteraction':
        problem_cls = VowpalWabbitNamesspaceTuningProblem
    else:
        problem_cls = VowpalWabbitNamesspaceLRTuningProblem
    vw_online_aml_problem = problem_cls(**vw_oml_problem_args)

    return vw_oml_problem_args, vw_online_aml_problem
|
||||
|
||||
|
||||
class TestAutoVW(unittest.TestCase):
    """Runs the online learning loop with vanilla VW and with AutoVW."""

    @staticmethod
    def _make_autovw(problem):
        """Create the AutoVW learner for ``problem`` with the shared test settings."""
        return AutoVW(max_live_model_num=5,
                      search_space=problem.search_space,
                      init_config=problem.init_config,
                      min_resource_lease='auto',
                      random_seed=2345)

    @staticmethod
    def _run_and_report(learner, problem_args, problem):
        """Run the online loop with ``learner`` and print the average loss."""
        loss_name = problem_args["fixed_hp_config"].get("loss_function", "squared")
        losses = online_learning_loop(problem.max_iter_num,
                                      problem.vw_examples,
                                      learner,
                                      loss_func=loss_name,
                                      )
        print('final average loss:', sum(losses) / len(losses))

    def test_vw_oml_problem_and_vanilla_vw(self):
        problem_args, problem = get_vw_tuning_problem()
        learner = pyvw.vw(**problem_args["fixed_hp_config"])
        self._run_and_report(learner, problem_args, problem)

    def test_supervised_vw_tune_namespace(self):
        # basic experiment setting
        problem_args, problem = get_vw_tuning_problem()
        self._run_and_report(self._make_autovw(problem), problem_args, problem)

    def test_supervised_vw_tune_namespace_learningrate(self):
        # basic experiment setting
        problem_args, problem = get_vw_tuning_problem(tuning_hp='NamesapceInteraction+LearningRate')
        self._run_and_report(self._make_autovw(problem), problem_args, problem)

    def test_bandit_vw_tune_namespace(self):
        pass

    def test_bandit_vw_tune_namespace_learningrate(self):
        pass
|
||||
|
||||
|
||||
# Run the unittest test runner when this file is executed as a script.
if __name__ == "__main__":
    unittest.main()
|
||||
Reference in New Issue
Block a user