Add pipeline tuner component and dependencies. (#671)

* add pipeline tuner component and dependencies.

* clean code.

* do not need force rerun.

* replace the resources.

* update metrics retrieving.

* Update test/pipeline_tuning_example/requirements.txt

* Update test/pipeline_tuning_example/train/env.yaml

* Update test/pipeline_tuning_example/tuner/env.yaml

* Update test/pipeline_tuning_example/tuner/tuner_func.py

* Update test/pipeline_tuning_example/data_prep/env.yaml

* fix issues found by lint with flake8.

* add documentation

* add data.

* do not need AML resource for local run.

* AML -> AzureML

* clean code.

* Update website/docs/Examples/Tune-AzureML pipeline.md

* rename and add pip install.

* update figure name.

* align docs with code.

* remove extra line.
Rui Zhuang committed 2022-08-10 20:20:21 -07:00 (committed by GitHub)
parent 816a82a115
commit b6e8b9ccca
16 changed files with 1304 additions and 0 deletions


@@ -0,0 +1,38 @@
import argparse
import logging
import os

import pandas as pd
from sklearn.model_selection import train_test_split

logger = logging.getLogger(__name__)


def main():
    """Main function of the script."""
    # input and output arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--data", type=str, help="path to input data")
    parser.add_argument("--test_train_ratio", type=float, required=False, default=0.25)
    parser.add_argument("--train_data", type=str, help="path to train data")
    parser.add_argument("--test_data", type=str, help="path to test data")
    args = parser.parse_args()

    logger.info(" ".join(f"{k}={v}" for k, v in vars(args).items()))

    data_path = os.path.join(args.data, "data.csv")
    df = pd.read_csv(data_path)

    # split into train and test sets according to the requested ratio
    train_df, test_df = train_test_split(
        df,
        test_size=args.test_train_ratio,
    )

    # output paths are mounted as folders, so we add a file name to each path
    train_df.to_csv(os.path.join(args.train_data, "data.csv"), index=False)
    test_df.to_csv(os.path.join(args.test_data, "data.csv"), index=False)


if __name__ == "__main__":
    main()
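
A quick local smoke test for this script might look like the following (not part of the commit; folder names such as local_input are illustrative). Run from the directory containing data_prep.py: it writes a tiny data.csv, invokes the script as a subprocess, and prints the shapes of the resulting splits.

import os
import subprocess

import pandas as pd

# hypothetical local folders standing in for the mounted AzureML paths
os.makedirs("local_input", exist_ok=True)
os.makedirs("local_train", exist_ok=True)
os.makedirs("local_test", exist_ok=True)

# write a tiny dummy dataset with the file name the script expects
pd.DataFrame({"x": range(10), "y": [i % 2 for i in range(10)]}).to_csv(
    "local_input/data.csv", index=False
)

subprocess.run(
    [
        "python", "data_prep.py",
        "--data", "local_input",
        "--test_train_ratio", "0.25",
        "--train_data", "local_train",
        "--test_data", "local_test",
    ],
    check=True,
)

print(pd.read_csv("local_train/data.csv").shape, pd.read_csv("local_test/data.csv").shape)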


@@ -0,0 +1,26 @@
$schema: https://componentsdk.azureedge.net/jsonschema/CommandComponent.json
name: data_prep
version: 0.0.1
display_name: Data preparation for training
type: CommandComponent
inputs:
  data:
    type: path
  test_train_ratio:
    type: float
outputs:
  train_data:
    type: path
  test_data:
    type: path
environment:
  conda:
    conda_dependencies_file: env.yaml
  os: Linux
command: >-
  python data_prep.py
  --data {inputs.data}
  --test_train_ratio {inputs.test_train_ratio}
  --train_data {outputs.train_data}
  --test_data {outputs.test_data}
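
As a rough sketch of how a CommandComponent spec like this is typically loaded into an AzureML pipeline with the azure-ml-component SDK (the YAML file name, compute target, dataset name, and experiment name below are assumptions, not code from this commit):

# minimal sketch, assuming the azure-ml-component SDK and a workspace config.json
from azureml.core import Workspace, Dataset
from azure.ml.component import Component, dsl

ws = Workspace.from_config()

# load the component definition from the YAML spec (file name assumed)
data_prep_func = Component.from_yaml(ws, yaml_file="data_prep/data_prep.yaml")

@dsl.pipeline(name="data_prep_only", default_compute_target="cpu-cluster")
def data_prep_pipeline(input_data):
    # each call produces a pipeline step; outputs are referenced by name
    step = data_prep_func(data=input_data, test_train_ratio=0.25)
    return {"train_data": step.outputs.train_data, "test_data": step.outputs.test_data}

pipeline = data_prep_pipeline(Dataset.get_by_name(ws, name="my_dataset"))
run = pipeline.submit(experiment_name="pipeline_tuning_example")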


@@ -0,0 +1,15 @@
name: data-prep-env
channels:
  - conda-forge
dependencies:
  - python=3.8
  - numpy=1.21.2
  - pip=21.2.4
  - scikit-learn=0.24.2
  - scipy=1.7.1
  - pandas>=1.1,<1.2
  - pip:
      # - inference-schema[numpy-support]==1.3.0
      # - xlrd==2.0.1
      - mlflow==1.26.1
      - azureml-mlflow==1.42.0
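
The pinned mlflow and azureml-mlflow packages suggest that pipeline steps log metrics via MLflow, which the tuner can later retrieve (the commit history above mentions updating metrics retrieval). A generic, illustrative MLflow logging snippet, not taken from this commit:

# illustrative only: standard MLflow tracking calls
import mlflow

with mlflow.start_run():
    mlflow.log_param("test_train_ratio", 0.25)
    mlflow.log_metric("accuracy", 0.9)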