[CI] Reenable torchinductor workflow (#2527)

2026-04-05 03:01:17 -04:00 · 2023-10-26 02:44:02 -04:00
parent 4c816c2f59
commit bc72294507
9 changed files with 91 additions and 172 deletions
--- a/.github/workflows/torch-inductor-tests.yml
+++ b/.github/workflows/torch-inductor-tests.yml
@@ -1,7 +1,9 @@
 name: Torchinductor

 on:
-  workflow_dispatch:
+  workflow_run:
+    workflows: ["Wheel"]
+    types: [completed]

 jobs:
  Runner-Preparation:
@@ -23,17 +25,17 @@ jobs:
    steps:
      - name: Checkout
        uses: actions/checkout@v2
-      #- name: Packages
-      #  run: |
-      #    ./.github/workflows/torchinductor/scripts/install_torchinductor.sh
+      - name: Packages
+        run: |
+          ./.github/workflows/torch-inductor/scripts/install_torchinductor.sh torchbench
      - name: Environment
        run: |
          source /opt/torchinductor_venv/bin/activate
-          ./.github/workflows/torchinductor/scripts/install_triton.sh
+          ./.github/workflows/torch-inductor/scripts/install_triton.sh
      - name: Performance
        run: |
-          ./.github/workflows/torchinductor/scripts/run_torchinductor_perf.sh
+          ./.github/workflows/torch-inductor/scripts/run_torchinductor_perf.sh torchbench
      # Runs too long time
      #- name: Accuracy
      #  run: |
-      #    ./.github/workflows/torchinductor/scripts/run_torchinductor_acc.sh
+      #    ./.github/workflows/torch-inductor/scripts/run_torchinductor_acc.sh
--- a/.github/workflows/torch-inductor/data/huggingface.csv
+++ b/.github/workflows/torch-inductor/data/huggingface.csv
@@ -1,37 +0,0 @@
-dev,name,batch_size,speedup,abs_latency,compilation_latency,compression_ratio
-cuda,AlbertForMaskedLM,4,1.5511,164.3373,26.8523,1.2647
-cuda,AlbertForQuestionAnswering,4,1.5501,163.5580,25.7983,1.3145
-cuda,BartForCausalLM,4,1.5080,71.7230,32.8907,0.9749
-cuda,BertForMaskedLM,16,1.5350,67.9451,35.3286,1.0494
-cuda,BertForQuestionAnswering,16,1.6735,53.2963,34.3754,1.1710
-cuda,BlenderbotSmallForCausalLM,64,1.2106,46.6466,23.8058,0.9120
-cuda,BlenderbotSmallForConditionalGeneration,64,1.3616,77.3013,55.3546,0.9803
-cuda,CamemBert,16,1.4779,76.1809,35.3883,1.0469
-cuda,DebertaForMaskedLM,4,0.8415,62.3395,35.9657,1.0418
-cuda,DebertaForQuestionAnswering,8,1.0609,67.5151,35.7728,1.1528
-cuda,DebertaV2ForMaskedLM,1,0.6026,134.6517,66.1783,0.9773
-cuda,DistilBertForMaskedLM,128,1.2460,66.9382,18.3089,0.9624
-cuda,DistilBertForQuestionAnswering,256,1.3997,72.4126,18.1956,1.1486
-cuda,DistillGPT2,16,1.6656,60.5455,17.2280,1.0641
-cuda,ElectraForCausalLM,32,1.8299,45.4841,37.0944,0.9717
-cuda,ElectraForQuestionAnswering,64,2.0289,52.6890,35.9632,1.1928
-cuda,GPT2ForSequenceClassification,4,2.2567,38.2969,30.0527,1.2323
-cuda,LayoutLMForMaskedLM,16,1.5423,68.8018,36.5562,1.0495
-cuda,LayoutLMForSequenceClassification,16,1.7058,53.9355,35.2225,1.1659
-cuda,MBartForCausalLM,4,1.4945,71.4649,32.8653,0.9830
-cuda,MegatronBertForCausalLM,4,1.4328,58.4404,70.6226,1.0951
-cuda,MegatronBertForQuestionAnswering,8,1.5886,85.2533,69.1219,1.1152
-cuda,MobileBertForMaskedLM,64,0.9007,131.7379,107.5275,1.0136
-cuda,MobileBertForQuestionAnswering,128,0.8435,167.9066,106.7049,0.8579
-cuda,PLBartForCausalLM,8,1.5261,68.9224,19.5826,0.9887
-cuda,PLBartForConditionalGeneration,4,1.5298,71.2811,45.6902,1.0495
-cuda,PegasusForCausalLM,32,1.2212,57.5436,33.3863,0.9736
-cuda,PegasusForConditionalGeneration,32,1.2822,106.4678,69.8825,1.0689
-cuda,RobertaForCausalLM,16,1.6128,67.5706,34.7355,1.0496
-cuda,RobertaForQuestionAnswering,16,1.6800,53.6267,33.8527,1.1704
-cuda,Speech2Text2ForCausalLM,256,1.8230,32.9145,18.7201,0.8760
-cuda,T5ForConditionalGeneration,4,1.6592,59.5324,39.4406,1.1814
-cuda,T5Small,4,1.6581,59.5930,37.0471,1.1814
-cuda,TrOCRForCausalLM,32,1.2586,106.2633,32.5330,0.9583
-cuda,XLNetLMHeadModel,8,1.8108,142.8795,84.8197,1.1240
-cuda,YituTechConvBert,16,1.5207,81.4595,53.1565,1.0362
--- a/.github/workflows/torch-inductor/data/timm_models.csv
+++ b/.github/workflows/torch-inductor/data/timm_models.csv
@@ -1,54 +0,0 @@
-dev,name,batch_size,speedup,abs_latency,compilation_latency,compression_ratio
-cuda,adv_inception_v3,128,1.5923,102.5292,51.6032,1.0472
-cuda,beit_base_patch16_224,64,1.3390,75.3027,29.7471,1.0156
-cuda,coat_lite_mini,128,2.0579,53.3689,37.1856,1.0437
-cuda,convmixer_768_32,32,1.0470,275.5328,23.8037,0.9999
-cuda,convnext_base,64,1.5084,80.1811,42.5659,1.0373
-cuda,crossvit_9_240,128,1.5392,37.1806,44.9986,0.9193
-cuda,cspdarknet53,64,1.4721,75.0403,35.2882,1.0547
-cuda,deit_base_distilled_patch16_224,64,1.1432,55.9737,23.4038,0.9816
-cuda,dla102,128,1.5282,123.7284,49.3612,1.0430
-cuda,dm_nfnet_f0,128,1.4354,79.7518,34.8994,1.1038
-cuda,dpn107,32,1.2412,83.8921,58.9111,0.9952
-cuda,eca_botnext26ts_256,128,1.5425,71.2406,28.8920,1.0270
-cuda,ese_vovnet19b_dw,128,1.4647,42.4837,18.0285,1.0135
-cuda,fbnetc_100,128,1.5795,53.8033,33.0222,1.0082
-cuda,gernet_l,128,1.1684,63.4230,26.8687,1.0053
-cuda,ghostnet_100,128,1.7812,54.4211,47.6168,1.0484
-cuda,gluon_inception_v3,128,1.5952,102.5018,50.0857,1.0469
-cuda,gmixer_24_224,128,1.6749,69.2430,42.0841,1.1921
-cuda,gmlp_s16_224,128,1.5886,79.2132,43.0142,1.2343
-cuda,hrnet_w18,128,1.3743,221.5304,134.2573,1.0100
-cuda,inception_v3,128,1.5847,102.8333,49.7648,1.0472
-cuda,jx_nest_base,32,1.3747,71.4190,61.4053,0.9905
-cuda,lcnet_050,128,1.8159,18.0047,18.8249,1.0005
-cuda,mixer_b16_224,128,1.2795,90.9229,21.0438,1.0133
-cuda,mixnet_l,128,1.2273,149.9722,47.7482,1.0129
-cuda,mnasnet_100,128,1.6594,40.0512,26.5165,1.0047
-cuda,mobilenetv2_100,128,1.6085,41.1217,27.4450,1.1731
-cuda,mobilenetv3_large_100,128,1.6610,37.9995,29.8185,1.0052
-cuda,mobilevit_s,64,1.5212,55.4152,53.6475,1.0258
-cuda,nfnet_l0,128,1.4927,65.7078,32.4067,0.9980
-cuda,pit_b_224,64,1.2286,57.9484,26.5321,0.9606
-cuda,pnasnet5large,16,1.0000,198.2494,93.4641,1.3184
-cuda,poolformer_m36,64,1.3486,103.9235,62.3196,1.1942
-cuda,regnety_002,128,1.3030,32.4968,27.2439,1.0014
-cuda,repvgg_a2,128,1.2485,59.7729,26.9209,1.0185
-cuda,res2net101_26w_4s,64,1.0813,94.1773,86.6520,0.9655
-cuda,res2net50_14w_8s,128,1.3251,109.5258,79.9578,0.9830
-cuda,res2next50,128,1.2518,125.5008,43.9754,0.9756
-cuda,resmlp_12_224,128,1.3060,45.2373,19.3709,1.1048
-cuda,resnest101e,64,1.4346,108.1945,78.1993,1.1037
-cuda,rexnet_100,128,1.4637,55.0121,41.2075,1.0862
-cuda,selecsls42b,128,1.4284,44.6645,23.3892,1.0139
-cuda,spnasnet_100,128,1.5908,45.3189,32.0148,1.0048
-cuda,swin_base_patch4_window7_224,64,1.6164,89.5854,75.5848,0.9299
-cuda,swsl_resnext101_32x16d,32,1.0175,110.0041,45.7853,1.0003
-cuda,tf_efficientnet_b0,128,1.5271,55.7361,34.5551,1.1079
-cuda,tf_mixnet_l,128,1.2369,155.9027,48.6695,1.0921
-cuda,tinynet_a,128,1.3792,53.0640,40.6346,1.1108
-cuda,tnt_s_patch16_224,128,3.1078,104.8486,59.6028,1.0660
-cuda,twins_pcpvt_base,64,1.5921,67.4600,84.4977,1.0909
-cuda,visformer_small,128,1.1952,72.8705,23.7303,1.0410
-cuda,vit_base_patch16_224,64,1.1309,56.4866,22.0208,0.9804
-cuda,volo_d1_224,64,1.6868,72.0957,65.3011,0.9729
--- a/.github/workflows/torch-inductor/data/torchbench.csv
+++ b/.github/workflows/torch-inductor/data/torchbench.csv
@@ -1,53 +0,0 @@
-dev,name,batch_size,speedup,abs_latency,compilation_latency,compression_ratio
-cuda,BERT_pytorch,16,1.7111,24.2741,35.7065,1.3212
-cuda,LearningToPaint,96,1.0513,10.7557,11.1879,0.9896
-cuda,Super_SloMo,6,1.3267,60.4328,28.2097,1.2392
-cuda,alexnet,128,1.1754,8.3246,5.3319,1.0003
-cuda,attention_is_all_you_need_pytorch,256,1.3416,36.4401,39.5927,1.1774
-cuda,dcgan,32,0.9151,2.6249,3.2964,1.0082
-cuda,densenet121,4,0.9225,51.3747,68.5841,0.9930
-cuda,doctr_det_predictor,0,0.0000
-cuda,doctr_reco_predictor,0,0.0000
-cuda,drq,1,0.9500,3.4884,4.8028,0.9687
-cuda,fastNLP_Bert,6,1.4328,34.7753,35.4863,1.2368
-cuda,functorch_dp_cifar10,64,1.2015,8.1625,12.9040,1.0609
-cuda,functorch_maml_omniglot,1,0.9322,2.5844,3.8640,1.0000
-cuda,hf_Albert,8,2.1228,30.3377,26.8282,1.2676
-cuda,hf_Bart,4,1.2899,39.1935,47.2373,1.0080
-cuda,hf_Bert,4,1.3262,26.1063,35.0281,1.0656
-cuda,hf_Bert_large,4,1.4163,55.1021,67.2825,1.0915
-cuda,hf_DistilBert,8,1.4051,21.7191,18.0399,1.0242
-cuda,hf_GPT2,4,1.6661,26.9039,29.9473,1.1555
-cuda,hf_Longformer,0,0.0000
-cuda,hf_Reformer,4,1.1709,64.6979,15.7035,0.9267
-cuda,hf_T5_large,2,1.7215,107.0798,148.8805,1.1684
-cuda,lennard_jones,1000,0.8428,1.8488,3.0609,1.0001
-cuda,maml_omniglot,32,0.9648,2.6869,3.9775,0.9999
-cuda,mnasnet1_0,32,1.0469,21.6251,25.8232,0.9996
-cuda,mobilenet_v2,96,1.5604,31.9572,27.0225,1.1734
-cuda,nvidia_deeprecommender,256,1.0605,9.2080,4.1318,0.9711
-cuda,phlippe_densenet,128,1.0237,27.5988,28.0400,1.0023
-cuda,phlippe_resnet,128,1.0493,10.9751,10.2485,1.0092
-cuda,pytorch_CycleGAN_and_pix2pix,1,1.3724,8.2225,11.9561,1.0219
-cuda,pytorch_stargan,16,1.1835,11.9178,10.0507,1.0868
-cuda,pytorch_unet,1,1.3787,29.7543,13.7711,1.0100
-cuda,resnet152,32,0.9834,63.2446,67.7935,0.9991
-cuda,resnet18,16,0.9451,9.4977,11.7663,0.9948
-cuda,resnet50,32,1.0513,24.5141,24.6629,1.0021
-cuda,resnext50_32x4d,8,0.9216,22.2460,24.3420,0.9984
-cuda,shufflenet_v2_x1_0,128,1.1943,25.4520,28.8611,1.0951
-cuda,soft_actor_critic,256,0.8691,1.9637,3.3716,0.9996
-cuda,speech_transformer,32,1.2718,35.2922,46.9957,1.0897
-cuda,squeezenet1_1,32,1.1302,8.4540,7.9625,1.0771
-cuda,timm_efficientdet,1,1.3370,80.0377,120.1814,1.2713
-cuda,timm_efficientnet,32,1.1874,27.6302,33.9059,1.0971
-cuda,timm_nfnet,128,1.4525,77.3461,34.3270,1.1056
-cuda,timm_regnet,32,1.0644,50.6953,35.7562,1.0000
-cuda,timm_resnest,32,1.6200,14.7763,17.2245,1.0906
-cuda,timm_vision_transformer,32,1.0800,19.4188,22.0255,0.9966
-cuda,timm_vision_transformer_large,32,1.0081,393.1742,127.8083,0.9735
-cuda,timm_vovnet,32,1.1472,22.4727,22.7328,1.0120
-cuda,torchrec_dlrm,0,0.0000
-cuda,tts_angular,64,0.8974,6.5057,2.5555,0.9973
-cuda,vgg16,64,1.2909,50.7405,6.1510,0.9828
-cuda,yolov3,16,1.2930,54.8069,41.9269,1.0563
--- a/.github/workflows/torch-inductor/scripts/check_perf.py
+++ b/.github/workflows/torch-inductor/scripts/check_perf.py
@@ -33,12 +33,22 @@ def compare(baseline: dict, new: dict, threshold: float,
            print(f"New benchmark {key} not found in baseline")
        baseline_latency = baseline[key].latency
        new_latency = new[key].latency
+        if baseline_latency == 0:
+            print(f"Baseline latency for {key} is 0")
+            continue
+        elif new_latency == 0:
+            print(f"New latency for {key} is 0")
+            continue
+
        if new_latency < baseline_latency * (1 - threshold):
            print(
                f"New benchmark {key} is faster than baseline: {new_latency} vs {baseline_latency}")
        elif new_latency > baseline_latency * (1 + threshold):
            print(
                f"New benchmark {key} is slower than baseline: {new_latency} vs {baseline_latency}")
+        else:
+            print(
+                f"New benchmark {key} is within threshold: {new_latency} vs {baseline_latency}")
        baseline_geomean *= baseline[key].speedup
        new_geomean *= new[key].speedup

@@ -46,7 +56,7 @@ def compare(baseline: dict, new: dict, threshold: float,
    new_geomean = new_geomean ** (1 / len(new))
    print(f"Baseline geomean: {baseline_geomean}")
    print(f"New geomean: {new_geomean}")
-    assert new_geomean > baseline_geomean * (1 - geomean_threshold), \
+    assert new_geomean >= baseline_geomean * (1 - geomean_threshold), \
        f"New geomean is slower than baseline: {new_geomean} vs {baseline_geomean}"


--- a/.github/workflows/torch-inductor/scripts/install_torchinductor.sh
+++ b/.github/workflows/torch-inductor/scripts/install_torchinductor.sh
@@ -2,19 +2,24 @@

 # remember where we started
 ROOT="$(pwd)"
+MODEL_SPEC=$1

 # torchinductor venv
 whoami
+# clean up old venv
+rm -rf /opt/torchinductor_venv
 python3 -m venv /opt/torchinductor_venv
 # shellcheck source=/dev/null
 source /opt/torchinductor_venv/bin/activate
 # shellcheck source=/dev/null
-source ./.github/workflows/torchinductor/scripts/common.sh
+source ./.github/workflows/torch-inductor/scripts/common.sh

 # pytorch nightly
-pip3 install --force-reinstall --pre torch torchtext torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cu118
+pip3 install --force-reinstall --pre torch torchtext torchvision torchaudio torchrec --extra-index-url https://download.pytorch.org/whl/nightly/cu121
 # pytorch source to get torchbench for dynamo
 cd /opt || exit
+# cleanup old pytorch
+rm -rf pytorch
 git clone --recursive https://github.com/pytorch/pytorch
 cd pytorch || exit
 # if you are updating an existing checkout
@@ -23,20 +28,31 @@ git submodule update --init --recursive
 cd ..

 # required packages
-pip3 install expecttest psutil
+# https://github.com/pytorch/benchmark/blob/main/docker/gcp-a100-runner-dind.dockerfile#L17
+sudo apt-get install --yes libpango-1.0-0 libpangoft2-1.0-0
+pip3 install --upgrade pip
+pip3 install expecttest psutil lightning-utilities pyre_extensions

 # torchbench
-pip3 install pyyaml
-git clone https://github.com/pytorch/benchmark.git
-cd benchmark || exit
-python3 install.py
-cd ..
+if [ "$MODEL_SPEC" == "torchbench" ] || [ "$MODEL_SPEC" == "all" ]; then
+	# runs for "torchbench" or "all"; drop any stale checkout so the clone below succeeds
+	rm -rf benchmark
+	pip3 install pyyaml
+	git clone https://github.com/pytorch/benchmark.git
+	cd benchmark || exit
+	python3 install.py
+	cd ..
+fi

 # timm
-git clone https://github.com/huggingface/pytorch-image-models.git
-cd pytorch-image-models || exit
-pip3 install -e .
-cd ..
+if [ "$MODEL_SPEC" == "timm_models" ] || [ "$MODEL_SPEC" == "all" ]; then
+	# runs for "timm_models" or "all"; drop any stale checkout so the clone below succeeds
+	rm -rf pytorch-image-models
+	git clone https://github.com/huggingface/pytorch-image-models.git
+	cd pytorch-image-models || exit
+	pip3 install -e .
+	cd ..
+fi

 # build our own triton
 cd "$ROOT" || exit
--- a/.github/workflows/torch-inductor/scripts/install_triton.sh
+++ b/.github/workflows/torch-inductor/scripts/install_triton.sh
@@ -6,7 +6,7 @@ ROOT="$(pwd)"
 # shellcheck source=/dev/null
 source /opt/torchinductor_venv/bin/activate
 # shellcheck source=/dev/null
-source ./.github/workflows/torchinductor/scripts/common.sh
+source ./.github/workflows/torch-inductor/scripts/common.sh

 # build our own triton
 cd python || exit
--- a/.github/workflows/torch-inductor/scripts/run_torchinductor_acc.sh
+++ b/.github/workflows/torch-inductor/scripts/run_torchinductor_acc.sh
@@ -2,7 +2,8 @@

 # remember where we started
 ROOT="$(pwd)"
-INDUCTOR="$ROOT"/.github/workflows/torchinductor
+INDUCTOR="$ROOT"/.github/workflows/torch-inductor
+MODEL_SPEC=$1

 # shellcheck source=/dev/null
 source /opt/torchinductor_venv/bin/activate
@@ -14,6 +15,9 @@ TEST_REPORTS_DIR=$TEST_REPORTS_DIR/acc
 mkdir -p "$TEST_REPORTS_DIR"

 for model in "${MODELS[@]}"; do
+  if [ "$model" != "$MODEL_SPEC" ] && [ "$MODEL_SPEC" != "all" ]; then
+    continue
+  fi
  echo "Running accuracy test for $model"
  python3 benchmarks/dynamo/"$model".py --ci --accuracy --timing --explain --inductor --device cuda \
    --output "$TEST_REPORTS_DIR"/inference_"$model".csv
@@ -25,6 +29,9 @@ done

 cd "$ROOT" || exit
 for model in "${MODELS[@]}"; do
+  if [ "$model" != "$MODEL_SPEC" ] && [ "$MODEL_SPEC" != "all" ]; then
+    continue
+  fi
  echo "Checking accuracy test for $model"
  python3 "$INDUCTOR"/scripts/check_acc.py "$TEST_REPORTS_DIR"/inference_"$model".csv
  python3 "$INDUCTOR"/scripts/check_acc.py "$TEST_REPORTS_DIR"/training_"$model".csv
--- a/.github/workflows/torch-inductor/scripts/run_torchinductor_perf.sh
+++ b/.github/workflows/torch-inductor/scripts/run_torchinductor_perf.sh
@@ -2,7 +2,8 @@

 # remember where we started
 ROOT="$(pwd)"
-INDUCTOR="$ROOT"/.github/workflows/torchinductor
+INDUCTOR="$ROOT"/.github/workflows/torch-inductor
+MODEL_SPEC=$1

 # shellcheck source=/dev/null
 source /opt/torchinductor_venv/bin/activate
@@ -14,19 +15,46 @@ sudo nvidia-smi -i 0 -pm 1
 sudo nvidia-smi -i 0 --lock-gpu-clocks=1350,1350

 cd "$PYTORCH_DIR" || exit
-TEST_REPORTS_DIR=$TEST_REPORTS_DIR/perf
-mkdir -p "$TEST_REPORTS_DIR"
+TRITON_TEST_REPORTS_DIR=$TEST_REPORTS_DIR/perf
+BASE_TEST_REPORTS_DIR=$TEST_REPORTS_DIR/acc
+mkdir -p "$TRITON_TEST_REPORTS_DIR"
+mkdir -p "$BASE_TEST_REPORTS_DIR"

+
+echo "Running with Triton Nightly"
 for model in "${MODELS[@]}"; do
+  if [ "$model" != "$MODEL_SPEC" ] && [ "$MODEL_SPEC" != "all" ]; then
+    continue
+  fi
  echo "Running performance test for $model"
-  python3 benchmarks/dynamo/"$model".py --ci --training --performance --disable-cudagraphs\
-    --device cuda --inductor --amp --output "$TEST_REPORTS_DIR"/"$model".csv
+  python3 benchmarks/dynamo/"$model".py --float32 -dcuda --training --inductor --performance \
+    --output "$TRITON_TEST_REPORTS_DIR"/"$model".csv
 done

+# install pytorch-triton
+pip3 uninstall triton -y
+pip3 install --pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cu121
+
+echo "Running with pytorch-triton"
+for model in "${MODELS[@]}"; do
+  if [ "$model" != "$MODEL_SPEC" ] && [ "$MODEL_SPEC" != "all" ]; then
+    continue
+  fi
+  echo "Running performance test for $model"
+  python3 benchmarks/dynamo/"$model".py --float32 -dcuda --training --inductor --performance \
+    --output "$BASE_TEST_REPORTS_DIR"/"$model".csv
+done
+
+# uninstall pytorch-triton
+pip3 uninstall pytorch-triton -y
+
 cd "$ROOT" || exit
 for model in "${MODELS[@]}"; do
+  if [ "$model" != "$MODEL_SPEC" ] && [ "$MODEL_SPEC" != "all" ]; then
+    continue
+  fi
  echo "Checking performance test for $model"
-  python3 "$INDUCTOR"/scripts/check_perf.py --new "$TEST_REPORTS_DIR"/"$model".csv --baseline "$INDUCTOR"/data/"$model".csv
+  python3 "$INDUCTOR"/scripts/check_perf.py --new "$TRITON_TEST_REPORTS_DIR"/"$model".csv --baseline "$BASE_TEST_REPORTS_DIR"/"$model".csv
  EXIT_STATUS=$?
  if [ "$EXIT_STATUS" -ne 0 ]; then
    echo "Performance test for $model failed"