This commit is contained in:
Anonymous-submission-repo
2022-10-09 11:39:29 -04:00
parent c01e65bb48
commit 9bc32acafb
24 changed files with 511 additions and 71 deletions

View File

@@ -0,0 +1,40 @@
from flaml import AutoML
from flaml.data import load_openml_dataset
def _test_lexiflow():
    """Smoke test for lexicographic multi-objective AutoML on OpenML dataset 179.

    Runs a budgeted xgboost-only search where ``val_loss`` is the primary
    objective and ``pred_time`` is the secondary one, then prints the
    resulting model artifacts.
    """
    X_train, X_test, y_train, y_test = load_openml_dataset(
        dataset_id=179, data_dir="test/data"
    )
    # Lexicographic ordering: val_loss has priority; pred_time is only
    # optimized among configs within the 0.01 val_loss tolerance.
    lexico_objectives = {
        "metrics": ["val_loss", "pred_time"],
        "tolerances": {"val_loss": 0.01, "pred_time": 0.0},
        "targets": {"val_loss": 0.0, "pred_time": 0.0},
        "modes": ["min", "min"],
    }
    automl = AutoML()
    automl.fit(
        X_train=X_train,
        y_train=y_train,
        X_val=X_test,
        y_val=y_test,
        time_budget=100,
        lexico_objectives=lexico_objectives,
        estimator_list=["xgboost"],
        use_ray=True,
        task="classification",
        max_iter=10000000,
        train_time_limit=60,
        verbose=0,
        eval_method="holdout",
        mem_thres=128 * (1024**3),
        seed=1,
    )
    print(automl.predict(X_train))
    print(automl.model)
    print(automl.config_history)
    print(automl.best_iteration)
    print(automl.best_estimator)


if __name__ == "__main__":
    _test_lexiflow()

View File

@@ -20,7 +20,7 @@ def main():
logger.info(" ".join(f"{k}={v}" for k, v in vars(args).items()))
data_path = os.path.join(args.data, 'data.csv')
data_path = os.path.join(args.data, "data.csv")
df = pd.read_csv(data_path)
train_df, test_df = train_test_split(

View File

@@ -19,7 +19,7 @@ environment:
os: Linux
command: >-
python data_prep.py
python data_prep.py
--data {inputs.data}
--test_train_ratio {inputs.test_train_ratio}
--train_data {outputs.train_data}

View File

@@ -83,10 +83,10 @@ def build_and_submit_aml_pipeline(config):
################################################
# load component functions
################################################
data_prep_component = Component.from_yaml(ws, yaml_file=LOCAL_DIR
/ "data_prep/data_prep.yaml")
train_component = Component.from_yaml(ws, yaml_file=LOCAL_DIR
/ "train/train.yaml")
data_prep_component = Component.from_yaml(
ws, yaml_file=LOCAL_DIR / "data_prep/data_prep.yaml"
)
train_component = Component.from_yaml(ws, yaml_file=LOCAL_DIR / "train/train.yaml")
################################################
# build pipeline

View File

@@ -14,16 +14,19 @@ def remote_run():
################################################
# connect to your Azure ML workspace
################################################
ws = Workspace(subscription_id=args.subscription_id,
resource_group=args.resource_group,
workspace_name=args.workspace)
ws = Workspace(
subscription_id=args.subscription_id,
resource_group=args.resource_group,
workspace_name=args.workspace,
)
################################################
# load component functions
################################################
pipeline_tuning_func = Component.from_yaml(ws, yaml_file=LOCAL_DIR
/ "tuner/component_spec.yaml")
pipeline_tuning_func = Component.from_yaml(
ws, yaml_file=LOCAL_DIR / "tuner/component_spec.yaml"
)
################################################
# build pipeline
@@ -44,6 +47,7 @@ def remote_run():
def local_run():
logger.info("Run tuner locally.")
from tuner import tuner_func
tuner_func.tune_pipeline(concurrent_run=2)
@@ -52,15 +56,18 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_mutually_exclusive_group(required=False)
parser.add_argument(
"--subscription_id", type=str, help="your_subscription_id", required=False,
"--subscription_id",
type=str,
help="your_subscription_id",
required=False,
)
parser.add_argument(
"--resource_group", type=str, help="your_resource_group", required=False)
parser.add_argument(
"--workspace", type=str, help="your_workspace", required=False)
"--resource_group", type=str, help="your_resource_group", required=False
)
parser.add_argument("--workspace", type=str, help="your_workspace", required=False)
parser.add_argument('--remote', dest='remote', action='store_true')
parser.add_argument('--local', dest='remote', action='store_false')
parser.add_argument("--remote", dest="remote", action="store_true")
parser.add_argument("--local", dest="remote", action="store_false")
parser.set_defaults(remote=True)
args = parser.parse_args()

View File

@@ -5,7 +5,7 @@ import pandas as pd
from azureml.core import Run
class LightGBMCallbackHandler():
class LightGBMCallbackHandler:
def __init__(self):
pass
@@ -24,16 +24,22 @@ class LightGBMCallbackHandler():
def main(args):
"""Main function of the script."""
train_path = os.path.join(args.train_data, 'data.csv')
train_path = os.path.join(args.train_data, "data.csv")
print("traning_path:", train_path)
test_path = os.path.join(args.test_data, 'data.csv')
test_path = os.path.join(args.test_data, "data.csv")
train_set = lgb.Dataset(train_path)
test_set = lgb.Dataset(test_path)
callbacks_handler = LightGBMCallbackHandler()
config = {"header": True, "objective": "binary", "label_column": 30, "metric": "binary_error",
"n_estimators": args.n_estimators, "learning_rate": args.learning_rate}
config = {
"header": True,
"objective": "binary",
"label_column": 30,
"metric": "binary_error",
"n_estimators": args.n_estimators,
"learning_rate": args.learning_rate,
}
gbm = lgb.train(
config,
train_set,
@@ -44,9 +50,9 @@ def main(args):
],
)
print('Saving model...')
print("Saving model...")
# save model to file
gbm.save_model(os.path.join(args.model, 'model.txt'))
gbm.save_model(os.path.join(args.model, "model.txt"))
if __name__ == "__main__":

View File

@@ -4,9 +4,9 @@ name: classifier
version: 0.0.1
display_name: Train lgbm classifier
inputs:
train_data:
train_data:
type: path
test_data:
test_data:
type: path
learning_rate:
type: float
@@ -20,8 +20,8 @@ environment:
conda_dependencies_file: env.yaml
os: Linux
command: >-
python train.py
--train_data {inputs.train_data}
python train.py
--train_data {inputs.train_data}
--test_data {inputs.test_data}
--learning_rate {inputs.learning_rate}
--n_estimators {inputs.n_estimators}

View File

@@ -9,4 +9,4 @@ environment:
conda_dependencies_file: env.yaml
os: Linux
command: >-
python tuner/tuner_func.py
python tuner/tuner_func.py

View File

@@ -8,8 +8,7 @@ logger = logging.getLogger(__name__)
def run_with_config(config: dict):
"""Run the pipeline with a given config dict
"""
"""Run the pipeline with a given config dict"""
# pass the hyperparameters to AzureML jobs by overwriting the config file.
overrides = [f"{key}={value}" for key, value in config.items()]
@@ -24,25 +23,25 @@ def run_with_config(config: dict):
while not stop:
# get status
status = run._core_run.get_status()
print(f'status: {status}')
print(f"status: {status}")
# get metrics
metrics = run._core_run.get_metrics(recursive=True)
if metrics:
run_metrics = list(metrics.values())
new_metric = run_metrics[0]['eval_binary_error']
new_metric = run_metrics[0]["eval_binary_error"]
if type(new_metric) == list:
new_metric = new_metric[-1]
print(f'eval_binary_error: {new_metric}')
print(f"eval_binary_error: {new_metric}")
tune.report(eval_binary_error=new_metric)
time.sleep(5)
if status == 'FAILED' or status == 'Completed':
if status == "FAILED" or status == "Completed":
stop = True
print("The run is terminated.")

128
test/tune/test_lexiflow.py Normal file
View File

@@ -0,0 +1,128 @@
import torch
import thop
import torch.nn as nn
from flaml import tune
import torch.nn.functional as F
import torchvision
import numpy as np
import time
from ray import tune as raytune
# Run everything on CPU so the test works on machines without a GPU.
DEVICE = torch.device("cpu")
BATCHSIZE = 128
# Cap the train/validation subsets so each tuning trial stays fast.
N_TRAIN_EXAMPLES = BATCHSIZE * 30
N_VALID_EXAMPLES = BATCHSIZE * 10
def _test_lexiflow():
    """Smoke test for lexicographic tuning of a small MLP on FashionMNIST.

    Tunes architecture and optimizer hyperparameters with ``flaml.tune``,
    treating ``error_rate`` as the primary objective and ``flops`` as the
    secondary one, under a fixed time budget.
    """
    train_dataset = torchvision.datasets.FashionMNIST(
        "test/data",
        train=True,
        download=True,
        transform=torchvision.transforms.ToTensor(),
    )
    # Restrict both splits to small fixed-size subsets for speed.
    train_loader = torch.utils.data.DataLoader(
        torch.utils.data.Subset(train_dataset, list(range(N_TRAIN_EXAMPLES))),
        batch_size=BATCHSIZE,
        shuffle=True,
    )
    val_dataset = torchvision.datasets.FashionMNIST(
        "test/data", train=False, transform=torchvision.transforms.ToTensor()
    )
    val_loader = torch.utils.data.DataLoader(
        torch.utils.data.Subset(val_dataset, list(range(N_VALID_EXAMPLES))),
        batch_size=BATCHSIZE,
        shuffle=True,
    )

    def define_model(configuration):
        # Build an MLP with a tunable number of hidden layers, widths and
        # dropout rates; ends in log-softmax for use with nll_loss.
        depth = configuration["n_layers"]
        modules = []
        width_in = 28 * 28
        for layer_idx in range(depth):
            width_out = configuration["n_units_l{}".format(layer_idx)]
            drop_rate = configuration["dropout_{}".format(layer_idx)]
            modules += [
                nn.Linear(width_in, width_out),
                nn.ReLU(),
                nn.Dropout(drop_rate),
            ]
            width_in = width_out
        modules += [nn.Linear(width_in, 10), nn.LogSoftmax(dim=1)]
        return nn.Sequential(*modules)

    def train_model(model, optimizer, train_loader):
        # One pass over the training subset.
        model.train()
        for data, target in train_loader:
            data = data.view(-1, 28 * 28).to(DEVICE)
            target = target.to(DEVICE)
            optimizer.zero_grad()
            F.nll_loss(model(data), target).backward()
            optimizer.step()

    def eval_model(model, valid_loader):
        # Returns (log2(flops), error rate, parameter count).
        model.eval()
        correct = 0
        with torch.no_grad():
            for data, target in valid_loader:
                data = data.view(-1, 28 * 28).to(DEVICE)
                target = target.to(DEVICE)
                pred = model(data).argmax(dim=1, keepdim=True)
                correct += pred.eq(target.view_as(pred)).sum().item()
        accuracy = correct / N_VALID_EXAMPLES
        flops, params = thop.profile(
            model, inputs=(torch.randn(1, 28 * 28).to(DEVICE),), verbose=False
        )
        return np.log2(flops), 1 - accuracy, params

    def evaluate_function(configuration):
        # Train for the configured number of epochs, then report the
        # metrics the lexicographic objectives are defined over.
        model = define_model(configuration).to(DEVICE)
        optimizer = torch.optim.Adam(model.parameters(), configuration["lr"])
        n_epoch = configuration["n_epoch"]
        for _ in range(n_epoch):
            train_model(model, optimizer, train_loader)
        flops, error_rate, params = eval_model(model, val_loader)
        return {"error_rate": error_rate, "flops": flops, "params": params}

    # Lexicographic ordering: error_rate has priority; flops is only
    # optimized among configs within the 0.02 error_rate tolerance.
    lexico_objectives = {
        "metrics": ["error_rate", "flops"],
        "tolerances": {"error_rate": 0.02, "flops": 0.0},
        "targets": {"error_rate": 0.0, "flops": 0.0},
        "modes": ["min", "min"],
    }
    # Key insertion order mirrors the per-layer layout used by define_model.
    search_space = {"n_layers": raytune.randint(lower=1, upper=3)}
    search_space.update(
        {f"n_units_l{i}": raytune.randint(lower=4, upper=128) for i in range(3)}
    )
    search_space.update(
        {f"dropout_{i}": raytune.uniform(lower=0.2, upper=0.5) for i in range(3)}
    )
    search_space["lr"] = raytune.loguniform(lower=1e-5, upper=1e-1)
    search_space["n_epoch"] = raytune.randint(lower=1, upper=20)
    # Cheapest starting point: shallowest/narrowest net trained for one epoch.
    low_cost_partial_config = {"n_layers": 1}
    low_cost_partial_config.update({f"n_units_l{i}": 4 for i in range(3)})
    low_cost_partial_config["n_epoch"] = 1
    analysis = tune.run(
        evaluate_function,
        num_samples=100000000,
        time_budget_s=100,
        config=search_space,
        use_ray=False,
        lexico_objectives=lexico_objectives,
        low_cost_partial_config=low_cost_partial_config,
    )
    result = analysis.best_result
    print(result)


if __name__ == "__main__":
    _test_lexiflow()