Merge commit 'cb3d79a185e40c9d8a579bea07747a8a8d157d52' into ifu-231117

Conflicts: lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVM.cpp lib/Dialect/TritonGPU/IR/Dialect.cpp python/setup.py python/test/unit/language/assert_helper.py python/test/unit/operators/test_flash_attention.py python/test/unit/runtime/test_subproc.py python/triton/compiler/compiler.py python/triton/language/semantic.py python/triton/runtime/autotuner.py python/triton/runtime/jit.py python/tutorials/03-matrix-multiplication.py python/tutorials/05-layer-norm.py python/tutorials/06-fused-attention.py python/tutorials/11-grouped-gemm.py test/Conversion/tritongpu_to_llvm.mlir
2026-04-05 03:01:17 -04:00 · 2023-11-17 20:42:12 +00:00
parent e1513b34e1 cb3d79a185
commit 5c87f363e4
179 changed files with 10116 additions and 6835 deletions
--- a/.github/workflows/integration-tests.yml
+++ b/.github/workflows/integration-tests.yml
@@ -34,7 +34,7 @@ jobs:
          fi


-  Integration-Tests-Nvidia:
+  Integration-Tests:
    needs: Runner-Preparation

    runs-on: ${{ matrix.runner }}
@@ -73,10 +73,10 @@ jobs:
        run: |
          cd python
          python3 -m pip install --upgrade pip
-          python3 -m pip install cmake==3.24
-          python3 -m pip install ninja
-          python3 -m pip install --no-build-isolation -vvv '.[tests]'
-          python3 -m pip install pytest-xdist
+          python3 -m pip install cmake==3.24 ninja pytest-xdist
+          sudo apt-get update -y
+          sudo apt-get install -y ccache clang lld
+          TRITON_BUILD_WITH_CLANG_LLD=true TRITON_BUILD_WITH_CCACHE=true python3 -m pip install --no-build-isolation -vvv '.[tests]'

      - name: Run lit tests
        if: ${{ env.BACKEND == 'CUDA'}}
@@ -171,140 +171,8 @@ jobs:
          python3 -m pytest -vs . --reruns 10
          sudo nvidia-smi -i 0 -rgc

-  Integration-Tests-Shared-Middle-Layer:
-
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v2
-
-      - name: Clear cache
-        run: |
-          rm -rf ~/.triton
-
-      - name: Update PATH
-        run: |
-          echo "PATH=${HOME}/.local/bin:${PATH}" >> "${GITHUB_ENV}"
-
-      - name: Check pre-commit
-        run: |
-          python3 -m pip install --upgrade pre-commit
-          python3 -m pre_commit run --all-files --verbose
-
-      - name: Install Triton
-        run: |
-          export TRITON_CODEGEN_TRITON_SHARED=1
-          git submodule update --init --recursive
-          cd python
-          python3 -m pip install --upgrade pip
-          python3 -m pip install cmake==3.24
-          python3 -m pip install ninja
-          python3 -m pip uninstall -y triton
-          python3 setup.py build
-          python3 -m pip install --no-build-isolation -vvv '.[tests]'
-
-      - name: Run shared middle-layer lit tests
-        run: |
-          python3 -m pip install lit
-          cd python
-          LIT_TEST_DIR="build/$(ls build | grep -i cmake)/third_party/triton_shared/test"
-          if [ ! -d "${LIT_TEST_DIR}" ]; then
-            echo "Coult not find '${LIT_TEST_DIR}'" ; exit -1
-          fi
-          lit -v "${LIT_TEST_DIR}"
-
-
-  Integration-Tests-Third-Party:
-    needs: Runner-Preparation
-    if: false
-
-    runs-on: ${{ matrix.runner }}
-
-    strategy:
-      matrix:
-        runner: ${{fromJson(needs.Runner-Preparation.outputs.matrix-optional)}}
-
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v2
-
-      - name: Set ROCM ENV
-        if: ${{(matrix.runner[0] == 'self-hosted') && (matrix.runner[1] == 'gfx908')}}
-        run: |
-          echo "BACKEND=ROCM" >> "${GITHUB_ENV}"
-
-      - name: Set XPU ENV
-        if: ${{(matrix.runner[0] == 'self-hosted') && (matrix.runner[1] == 'arc770')}}
-        run: |
-          echo "BACKEND=XPU" >> "${GITHUB_ENV}"
-
-      - name: Clear cache
-        run: |
-          rm -rf ~/.triton
-
-      - name: Update PATH
-        run: |
-          echo "PATH=${HOME}/.local/bin:${PATH}" >> "${GITHUB_ENV}"
-
-      - name: Check pre-commit
-        if: ${{ matrix.runner != 'macos-10.15' && (matrix.runner[1] != 'arc770') }}
-        run: |
-          python3 -m pip install --upgrade pre-commit
-          python3 -m pre_commit run --all-files --verbose
-
-      - name: Check pre-commit arc770
-        if: ${{ matrix.runner != 'macos-10.15' && (matrix.runner[1] == 'arc770') }}
-        run: |
-          source ${HOME}/triton_vars.sh
-          source ${HOME}/miniconda3/bin/activate
-          conda activate triton-xpu-ci
-          python3 -m pip install --upgrade pre-commit
-          python3 -m pre_commit run --all-files
-
-      - name: Install Triton on ROCM
-        if: ${{ env.BACKEND == 'ROCM'}}
-        run: |
-          git submodule update --init --recursive
-          cd python
-          python3 -m pip install --upgrade pip
-          python3 -m pip install cmake==3.24
-          python3 -m pip install torch==1.13.1 --index-url https://download.pytorch.org/whl/rocm5.2
-          export TRITON_CODEGEN_AMD_HIP_BACKEND=1
-          python3 -m pip install --no-build-isolation -vvv '.[tests]'
-
-      - name: Install Triton on XPU
-        if: ${{ env.BACKEND == 'XPU'}}
-        run: |
-          source ${HOME}/triton_vars.sh
-          source ${HOME}/miniconda3/bin/activate
-          conda activate triton-xpu-ci
-          git submodule update --init --recursive
-          cd python
-          python3 -m pip install --upgrade pip
-          python3 -m pip install cmake==3.24
-          export TRITON_CODEGEN_INTEL_XPU_BACKEND=1
-          python3 -m pip uninstall -y triton
-          python3 setup.py build
-          python3 -m pip install --no-build-isolation -vvv '.[tests]'
-
-      - name: Run python tests on ROCM
-        if: ${{ env.BACKEND == 'ROCM'}}
-        run: |
-          cd python/test/unit/language
-          python3 -m pytest --capture=tee-sys -rfs --verbose "test_core.py"
-
-      - name: Run python tests on XPU
-        if: ${{ env.BACKEND == 'XPU'}}
-        run: |
-          source ${HOME}/triton_vars.sh
-          source ${HOME}/miniconda3/bin/activate
-          conda activate triton-xpu-ci
-          cd python/test/backend/third_party_backends
-          python3 -m pytest --capture=tee-sys -rfs --verbose --backend xpu
-
  Compare-artifacts:
-    needs: Integration-Tests-Nvidia
+    needs: Integration-Tests

    runs-on: ubuntu-latest

@@ -333,7 +201,7 @@ jobs:
      - name: Download latest main artifacts
        env:
          ARTIFACT_NAME: artifacts A100
-          ARTIFACT_JOB_NAME: Integration-Tests-Nvidia
+          ARTIFACT_JOB_NAME: Integration-Tests
          MAX_NUM_ACTIONS_PAGES: 30
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
--- a/.github/workflows/llvm-build.yml
+++ b/.github/workflows/llvm-build.yml
@@ -18,43 +18,17 @@ permissions:
 jobs:

  build:
+    name: Build on ${{ matrix.config.runner }}
+    runs-on: ${{ matrix.config.runs_on }}

    strategy:
      fail-fast: true
-
      matrix:
-        platform: [
-          ubuntu-20.04-x64,
-          ubuntu-22.04-x64,
-          centos-7-x64,
-          macos-x64,
-          macos-arm64
-        ]
-
-        include:
-          # Specify OS versions
-          - platform: ubuntu-20.04-x64
-            host-os: ubuntu-20.04
-            target-os: ubuntu
-            arch: x64
-          - platform: ubuntu-22.04-x64
-            host-os: ubuntu-22.04
-            target-os: ubuntu
-            arch: x64
-          - platform: centos-7-x64
-            host-os: ubuntu-22.04
-            target-os: centos
-            arch: x64
-          - platform: macos-x64
-            host-os: macos-12
-            target-os: macos
-            arch: x64
-          - platform: macos-arm64
-            host-os: macos-12
-            target-os: macos
-            arch: arm64
-
-    runs-on: ${{ matrix.host-os }}
+        config:
+        - {runner: 'Ubuntu 20.04', runs_on: 'ubuntu-20.04', target-os: 'ubuntu', arch: 'x64'}
+        - {runner: 'CentOS 7', runs_on: ['self-hosted', 'CPU'], target-os: 'centos', arch: 'x64'}
+        - {runner: 'MacOS X64', runs_on: 'macos-12', target-os: 'macos', arch: 'x64'}
+        - {runner: 'MacOS ARM64', runs_on: 'macos-12', target-os: 'macos', arch: 'arm64'}

    steps:

@@ -73,7 +47,7 @@ jobs:
        echo "Short LLVM commit hash: ${SHORT_LLVM_COMMIT_HASH}"
        echo "short_llvm_commit_hash=${SHORT_LLVM_COMMIT_HASH}" >> ${GITHUB_ENV}

-        INSTALL_DIR="llvm-${SHORT_LLVM_COMMIT_HASH}-${{ matrix.platform }}"
+        INSTALL_DIR="llvm-${SHORT_LLVM_COMMIT_HASH}-${{ matrix.config.target-os }}-${{ matrix.config.arch }}"
        echo "LLVM installation directory name: ${INSTALL_DIR}"
        echo "llvm_install_dir=${INSTALL_DIR}" >> ${GITHUB_ENV}

@@ -99,11 +73,11 @@ jobs:
      uses: actions/cache@v3
      with:
        path: ${{ env.SCCACHE_DIR }}
-        key: ${{ matrix.platform }}-${{ env.short_llvm_commit_hash }}
-        restore-keys: ${{ matrix.platform }}-
+        key: ${{ matrix.config.target-os }}-${{ matrix.config.arch }}-${{ env.short_llvm_commit_hash }}
+        restore-keys: ${{ matrix.config.target-os }}-${{ matrix.config.arch }}-

    - name: Configure, Build, Test, and Install LLVM (Ubuntu and macOS x64)
-      if: matrix.arch == 'x64' && contains(fromJSON('["ubuntu", "macos"]'), matrix.target-os)
+      if: matrix.config.arch == 'x64' && (matrix.config.target-os == 'ubuntu' || matrix.config.target-os == 'macos')
      run: >
        python3 -m pip install -r llvm-project/mlir/python/requirements.txt

@@ -114,11 +88,13 @@ jobs:
        -DCMAKE_INSTALL_PREFIX="${{ env.llvm_install_dir }}"
        -DCMAKE_LINKER=lld
        -DLLVM_BUILD_UTILS=ON
+        -DLLVM_BUILD_TOOLS=ON
        -DLLVM_ENABLE_ASSERTIONS=ON
        -DMLIR_ENABLE_BINDINGS_PYTHON=ON
        -DLLVM_ENABLE_PROJECTS=mlir
        -DLLVM_INSTALL_UTILS=ON
        -DLLVM_TARGETS_TO_BUILD="host;NVPTX;AMDGPU"
+        -DLLVM_ENABLE_TERMINFO=OFF
        llvm-project/llvm

        ninja -C llvm-project/build check-mlir install
@@ -126,7 +102,7 @@ jobs:
        tar czf "${{ env.llvm_install_dir }}.tar.gz" "${{ env.llvm_install_dir }}"

    - name: Configure, Build, and Install LLVM (macOS arm64)
-      if: matrix.arch == 'arm64' && matrix.target-os == 'macos'
+      if: matrix.config.arch == 'arm64' && matrix.config.target-os == 'macos'
      run: >
        python3 -m pip install -r llvm-project/mlir/python/requirements.txt

@@ -138,6 +114,7 @@ jobs:
        -DCMAKE_LINKER=lld
        -DCMAKE_OSX_ARCHITECTURES=arm64
        -DLLVM_BUILD_UTILS=ON
+        -DLLVM_BUILD_TOOLS=ON
        -DLLVM_ENABLE_ASSERTIONS=ON
        -DMLIR_ENABLE_BINDINGS_PYTHON=ON
        -DLLVM_ENABLE_PROJECTS=mlir
@@ -145,6 +122,7 @@ jobs:
        -DLLVM_INSTALL_UTILS=ON
        -DLLVM_TARGETS_TO_BUILD="AArch64"
        -DLLVM_USE_HOST_TOOLS=ON
+        -DLLVM_ENABLE_TERMINFO=OFF
        llvm-project/llvm

        ninja -C llvm-project/build install
@@ -152,10 +130,10 @@ jobs:
        tar czf "${{ env.llvm_install_dir }}.tar.gz" "${{ env.llvm_install_dir }}"

    - name: Configure, Build, Test, and Install LLVM (CentOS)
-      if: matrix.target-os == 'centos'
+      if: matrix.config.target-os == 'centos'
      run: |
        docker build --tag llvm-build --build-arg llvm_dir=llvm-project \
-          -f llvm-build/.github/workflows/Dockerfile .
+          -f llvm-build/.github/workflows/llvm-build/Dockerfile .

        # Create temporary container to copy cache and installed artifacts.
        CONTAINER_ID=$(docker create llvm-build)
--- a/.github/workflows/llvm-build/Dockerfile
+++ b/.github/workflows/llvm-build/Dockerfile
@@ -1,16 +1,17 @@
 FROM centos:7
 ARG llvm_dir=llvm-project
-
 # Add the cache artifacts and the LLVM source tree to the container
 ADD sccache /sccache
 ADD "${llvm_dir}" /source/llvm-project
 ENV SCCACHE_DIR="/sccache"
 ENV SCCACHE_CACHE_SIZE="2G"

+RUN echo -e "[llvmtoolset-build]\nname=LLVM Toolset 13.0 - Build\nbaseurl=https://buildlogs.centos.org/c7-llvm-toolset-13.0.x86_64/\ngpgcheck=0\nenabled=1" > /etc/yum.repos.d/llvmtoolset-build.repo
 # Install build dependencies
 RUN yum install --assumeyes centos-release-scl
-RUN yum install --assumeyes devtoolset-9-gcc* python3-devel python3-pip
-SHELL [ "/usr/bin/scl", "enable", "devtoolset-9" ]
+RUN yum install --assumeyes --nogpgcheck llvm-toolset-13.0
+RUN yum install --assumeyes rh-python38-python-devel rh-python38-python-pip
+SHELL [ "/usr/bin/scl", "enable", "llvm-toolset-13.0", "rh-python38" ]

 RUN python3 -m pip install --upgrade pip
 RUN python3 -m pip install --upgrade cmake ninja sccache
@@ -21,17 +22,22 @@ RUN python3 -m pip install -r /source/llvm-project/mlir/python/requirements.txt
 # Configure, Build, Test, and Install LLVM
 RUN cmake -GNinja -Bbuild \
  -DCMAKE_BUILD_TYPE=Release \
-  -DCMAKE_C_COMPILER=gcc \
-  -DCMAKE_CXX_COMPILER=g++ \
+  -DCMAKE_C_COMPILER=clang \
+  -DCMAKE_CXX_COMPILER=clang++ \
+  -DCMAKE_ASM_COMPILER=clang \
  -DCMAKE_C_COMPILER_LAUNCHER=sccache \
  -DCMAKE_CXX_COMPILER_LAUNCHER=sccache \
+  -DCMAKE_CXX_FLAGS="-Wno-everything" \
+  -DCMAKE_LINKER=lld \
  -DCMAKE_INSTALL_PREFIX="/install" \
  -DLLVM_BUILD_UTILS=ON \
+  -DLLVM_BUILD_TOOLS=ON \
  -DLLVM_ENABLE_ASSERTIONS=ON \
  -DMLIR_ENABLE_BINDINGS_PYTHON=ON \
  -DLLVM_ENABLE_PROJECTS=mlir \
+  -DLLVM_ENABLE_TERMINFO=OFF \
  -DLLVM_INSTALL_UTILS=ON \
  -DLLVM_TARGETS_TO_BUILD="host;NVPTX;AMDGPU" \
  /source/llvm-project/llvm

-RUN ninja -C build check-mlir install
+RUN ninja -C build install
--- a/.github/workflows/third-party/integration-tests.yml
+++ b/.github/workflows/third-party/integration-tests.yml
@@ -0,0 +1,167 @@
+name: Integration Tests
+
+on:
+  workflow_dispatch:
+  pull_request:
+    branches: [main]
+  merge_group:
+    branches: [main]
+    types: [checks_requested]
+
+concurrency:  # One active run per ref; a newer run cancels the older one, except on main.
+  group: ${{ github.ref }}
+  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+
+env:
+  TRITON_USE_ASSERT_ENABLED_LLVM: "TRUE"
+
+jobs:
+  Runner-Preparation:  # Publishes runner-label matrices as job outputs (read via needs.Runner-Preparation.outputs).
+    runs-on: ubuntu-latest
+    outputs:
+      matrix-required: ${{ steps.set-matrix.outputs.matrix-required }}
+      matrix-optional: ${{ steps.set-matrix.outputs.matrix-optional }}
+    steps:
+      - name: Prepare runner matrix
+        id: set-matrix
+        run: |
+          if [ x"${{ github.repository }}" == x"openai/triton" ]; then
+            echo 'matrix-required=[["self-hosted", "A100"], ["self-hosted", "H100"]]' >> "${GITHUB_OUTPUT}"
+            echo 'matrix-optional=[["self-hosted", "gfx908"], ["self-hosted", "arc770"]]' >> "${GITHUB_OUTPUT}"
+          else
+            echo 'matrix-required=["ubuntu-latest"]' >> "${GITHUB_OUTPUT}"
+            echo 'matrix-optional=["ubuntu-latest"]' >> "${GITHUB_OUTPUT}"
+          fi
+
+
+  Integration-Tests-Shared-Middle-Layer:
+
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+
+      - name: Clear cache
+        run: |
+          rm -rf ~/.triton
+
+      - name: Update PATH
+        run: |
+          echo "PATH=${HOME}/.local/bin:${PATH}" >> "${GITHUB_ENV}"
+
+      - name: Check pre-commit
+        run: |
+          python3 -m pip install --upgrade pre-commit
+          python3 -m pre_commit run --all-files --verbose
+
+      - name: Install Triton
+        run: |
+          export TRITON_CODEGEN_TRITON_SHARED=1
+          git submodule update --init --recursive
+          cd python
+          python3 -m pip install --upgrade pip
+          python3 -m pip install cmake==3.24
+          python3 -m pip install ninja
+          python3 -m pip uninstall -y triton
+          python3 setup.py build
+          python3 -m pip install --no-build-isolation -vvv '.[tests]'
+
+      - name: Run shared middle-layer lit tests  # Runs lit on the triton_shared tests inside the CMake build tree.
+        run: |
+          python3 -m pip install lit
+          cd python
+          LIT_TEST_DIR="build/$(ls build | grep -i cmake)/third_party/triton_shared/test"
+          if [ ! -d "${LIT_TEST_DIR}" ]; then
+            echo "Could not find '${LIT_TEST_DIR}'" ; exit 1
+          fi
+          lit -v "${LIT_TEST_DIR}"
+
+
+  Integration-Tests-Third-Party:
+    needs: Runner-Preparation
+    if: false
+
+    runs-on: ${{ matrix.runner }}
+
+    strategy:
+      matrix:
+        runner: ${{fromJson(needs.Runner-Preparation.outputs.matrix-optional)}}
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+
+      - name: Set ROCM ENV
+        if: ${{(matrix.runner[0] == 'self-hosted') && (matrix.runner[1] == 'gfx908')}}
+        run: |
+          echo "BACKEND=ROCM" >> "${GITHUB_ENV}"
+
+      - name: Set XPU ENV
+        if: ${{(matrix.runner[0] == 'self-hosted') && (matrix.runner[1] == 'arc770')}}
+        run: |
+          echo "BACKEND=XPU" >> "${GITHUB_ENV}"
+
+      - name: Clear cache
+        run: |
+          rm -rf ~/.triton
+
+      - name: Update PATH
+        run: |
+          echo "PATH=${HOME}/.local/bin:${PATH}" >> "${GITHUB_ENV}"
+
+      - name: Check pre-commit
+        if: ${{ matrix.runner != 'macos-10.15' && (matrix.runner[1] != 'arc770') }}
+        run: |
+          python3 -m pip install --upgrade pre-commit
+          python3 -m pre_commit run --all-files --verbose
+
+      - name: Check pre-commit arc770
+        if: ${{ matrix.runner != 'macos-10.15' && (matrix.runner[1] == 'arc770') }}
+        run: |
+          source ${HOME}/triton_vars.sh
+          source ${HOME}/miniconda3/bin/activate
+          conda activate triton-xpu-ci
+          python3 -m pip install --upgrade pre-commit
+          python3 -m pre_commit run --all-files
+
+      - name: Install Triton on ROCM
+        if: ${{ env.BACKEND == 'ROCM'}}
+        run: |
+          git submodule update --init --recursive
+          cd python
+          python3 -m pip install --upgrade pip
+          python3 -m pip install cmake==3.24
+          python3 -m pip install torch==1.13.1 --index-url https://download.pytorch.org/whl/rocm5.2
+          export TRITON_CODEGEN_AMD_HIP_BACKEND=1
+          python3 -m pip install --no-build-isolation -vvv '.[tests]'
+
+      - name: Install Triton on XPU
+        if: ${{ env.BACKEND == 'XPU'}}
+        run: |
+          source ${HOME}/triton_vars.sh
+          source ${HOME}/miniconda3/bin/activate
+          conda activate triton-xpu-ci
+          git submodule update --init --recursive
+          cd python
+          python3 -m pip install --upgrade pip
+          python3 -m pip install cmake==3.24
+          export TRITON_CODEGEN_INTEL_XPU_BACKEND=1
+          python3 -m pip uninstall -y triton
+          python3 setup.py build
+          python3 -m pip install --no-build-isolation -vvv '.[tests]'
+
+      - name: Run python tests on ROCM
+        if: ${{ env.BACKEND == 'ROCM'}}
+        run: |
+          cd python/test/unit/language
+          python3 -m pytest --capture=tee-sys -rfs --verbose "test_core.py"
+
+      - name: Run python tests on XPU
+        if: ${{ env.BACKEND == 'XPU'}}
+        run: |
+          source ${HOME}/triton_vars.sh
+          source ${HOME}/miniconda3/bin/activate
+          conda activate triton-xpu-ci
+          cd python/test/backend/third_party_backends
+          python3 -m pytest --capture=tee-sys -rfs --verbose --backend xpu
--- a/.github/workflows/torch-inductor-tests.yml
+++ b/.github/workflows/torch-inductor-tests.yml
@@ -1,7 +1,9 @@
 name: Torchinductor

 on:
-  workflow_dispatch:
+  workflow_run:
+    workflows: ["Wheel"]
+    types: [completed]

 jobs:
  Runner-Preparation:
@@ -23,17 +25,17 @@ jobs:
    steps:
      - name: Checkout
        uses: actions/checkout@v2
-      #- name: Packages
-      #  run: |
-      #    ./.github/workflows/torchinductor/scripts/install_torchinductor.sh
+      - name: Packages
+        run: |
+          ./.github/workflows/torch-inductor/scripts/install_torchinductor.sh torchbench
      - name: Environment
        run: |
          source /opt/torchinductor_venv/bin/activate
-          ./.github/workflows/torchinductor/scripts/install_triton.sh
+          ./.github/workflows/torch-inductor/scripts/install_triton.sh
      - name: Performance
        run: |
-          ./.github/workflows/torchinductor/scripts/run_torchinductor_perf.sh
+          ./.github/workflows/torch-inductor/scripts/run_torchinductor_perf.sh torchbench
      # Runs too long time
      #- name: Accuracy
      #  run: |
-      #    ./.github/workflows/torchinductor/scripts/run_torchinductor_acc.sh
+      #    ./.github/workflows/torch-inductor/scripts/run_torchinductor_acc.sh
--- a/.github/workflows/torch-inductor/scripts/check_acc.py
+++ b/.github/workflows/torch-inductor/scripts/check_acc.py
--- a/.github/workflows/torch-inductor/scripts/check_perf.py
+++ b/.github/workflows/torch-inductor/scripts/check_perf.py
@@ -3,8 +3,7 @@ import csv
 from collections import namedtuple

 # Create a named tuple for the output of the benchmark
-BenchmarkOutput = namedtuple(
-    'BenchmarkOutput', ['dev', 'name', 'batch_size', 'speedup', 'latency'])
+BenchmarkOutput = namedtuple('BenchmarkOutput', ['dev', 'name', 'batch_size', 'speedup', 'latency'])


 def parse_output(file_path: str) -> dict:
@@ -19,13 +18,11 @@ def parse_output(file_path: str) -> dict:
            batch_size = row[2]
            speedup = float(row[3])
            latency = float(row[4])
-            entries[name] = BenchmarkOutput(
-                dev, name, batch_size, speedup, latency)
+            entries[name] = BenchmarkOutput(dev, name, batch_size, speedup, latency)
    return entries


-def compare(baseline: dict, new: dict, threshold: float,
-            geomean_threshold: float) -> bool:
+def compare(baseline: dict, new: dict, threshold: float, geomean_threshold: float) -> bool:
    baseline_geomean = 1.0
    new_geomean = 1.0
    for key in new:
@@ -33,20 +30,27 @@ def compare(baseline: dict, new: dict, threshold: float,
            print(f"New benchmark {key} not found in baseline")
        baseline_latency = baseline[key].latency
        new_latency = new[key].latency
+        if baseline_latency == 0:
+            print(f"Baseline latency for {key} is 0")
+            continue
+        elif new_latency == 0:
+            print(f"New latency for {key} is 0")
+            continue
+
        if new_latency < baseline_latency * (1 - threshold):
-            print(
-                f"New benchmark {key} is faster than baseline: {new_latency} vs {baseline_latency}")
+            print(f"New benchmark {key} is faster than baseline: {new_latency} vs {baseline_latency}")
        elif new_latency > baseline_latency * (1 + threshold):
-            print(
-                f"New benchmark {key} is slower than baseline: {new_latency} vs {baseline_latency}")
+            print(f"New benchmark {key} is slower than baseline: {new_latency} vs {baseline_latency}")
+        else:
+            print(f"New benchmark {key} is within threshold: {new_latency} vs {baseline_latency}")
        baseline_geomean *= baseline[key].speedup
        new_geomean *= new[key].speedup

-    baseline_geomean = baseline_geomean ** (1 / len(baseline))
-    new_geomean = new_geomean ** (1 / len(new))
+    baseline_geomean = baseline_geomean**(1 / len(baseline))
+    new_geomean = new_geomean**(1 / len(new))
    print(f"Baseline geomean: {baseline_geomean}")
    print(f"New geomean: {new_geomean}")
-    assert new_geomean > baseline_geomean * (1 - geomean_threshold), \
+    assert new_geomean >= baseline_geomean * (1 - geomean_threshold), \
        f"New geomean is slower than baseline: {new_geomean} vs {baseline_geomean}"


--- a/.github/workflows/torch-inductor/scripts/common.sh
+++ b/.github/workflows/torch-inductor/scripts/common.sh
--- a/.github/workflows/torch-inductor/scripts/install_torchinductor.sh
+++ b/.github/workflows/torch-inductor/scripts/install_torchinductor.sh
@@ -0,0 +1,70 @@
+#!/bin/bash
+
+# remember where we started
+ROOT="$(pwd)"
+MODEL_SPEC=$1
+
+# torchinductor venv
+whoami
+# clean up old venv
+rm -rf /opt/torchinductor_venv
+python3 -m venv /opt/torchinductor_venv
+# shellcheck source=/dev/null
+source /opt/torchinductor_venv/bin/activate
+# shellcheck source=/dev/null
+source ./.github/workflows/torch-inductor/scripts/common.sh
+
+# pytorch nightly
+pip3 install --force-reinstall --pre torch torchtext torchvision torchaudio torchrec --extra-index-url https://download.pytorch.org/whl/nightly/cu121
+# pytorch source to get torchbench for dynamo
+cd /opt || exit
+# cleanup old pytorch
+rm -rf pytorch
+git clone --recursive https://github.com/pytorch/pytorch
+cd pytorch || exit
+# if you are updating an existing checkout
+git submodule sync
+git submodule update --init --recursive
+cd ..
+
+# required packages
+# https://github.com/pytorch/benchmark/blob/main/docker/gcp-a100-runner-dind.dockerfile#L17
+sudo apt-get install --yes libpango-1.0-0 libpangoft2-1.0-0
+pip3 install --upgrade pip
+pip3 install expecttest psutil lightning-utilities pyre_extensions
+
+# torchbench: install when requested explicitly or when the spec is "all" (an empty spec counts as "all")
+if [ "$MODEL_SPEC" == "torchbench" ] || [ "${MODEL_SPEC:-all}" == "all" ]; then
+	# clean up old torchbench
+	rm -rf benchmark
+	pip3 install pyyaml
+	git clone https://github.com/pytorch/benchmark.git
+	cd benchmark || exit
+	python3 install.py
+	cd ..
+fi
+
+# timm: install when requested explicitly or when the spec is "all" (an empty spec counts as "all")
+if [ "$MODEL_SPEC" == "timm_models" ] || [ "${MODEL_SPEC:-all}" == "all" ]; then
+	# clean up old timm
+	rm -rf pytorch-image-models
+	git clone https://github.com/huggingface/pytorch-image-models.git
+	cd pytorch-image-models || exit
+	pip3 install -e .
+	cd ..
+fi
+
+# build our own triton
+cd "$ROOT" || exit
+cd python || exit
+rm -rf build
+pip3 install -e .
+pip3 uninstall pytorch-triton -y
+
+# clean up cache
+rm -rf /tmp/torchinductor_root/
+rm -rf ~/.triton/cache
+rm -rf "$TEST_REPORTS_DIR"
+
+# go back to where we started
+cd "$ROOT" || exit
--- a/.github/workflows/torch-inductor/scripts/install_triton.sh
+++ b/.github/workflows/torch-inductor/scripts/install_triton.sh
@@ -6,7 +6,7 @@ ROOT="$(pwd)"
 # shellcheck source=/dev/null
 source /opt/torchinductor_venv/bin/activate
 # shellcheck source=/dev/null
-source ./.github/workflows/torchinductor/scripts/common.sh
+source ./.github/workflows/torch-inductor/scripts/common.sh

 # build our own triton
 cd python || exit
--- a/.github/workflows/torch-inductor/scripts/run_torchinductor_acc.sh
+++ b/.github/workflows/torch-inductor/scripts/run_torchinductor_acc.sh
@@ -2,7 +2,8 @@

 # remember where we started
 ROOT="$(pwd)"
-INDUCTOR="$ROOT"/.github/workflows/torchinductor
+INDUCTOR="$ROOT"/.github/workflows/torch-inductor
+MODEL_SPEC=$1

 # shellcheck source=/dev/null
 source /opt/torchinductor_venv/bin/activate
@@ -14,6 +15,9 @@ TEST_REPORTS_DIR=$TEST_REPORTS_DIR/acc
 mkdir -p "$TEST_REPORTS_DIR"

 for model in "${MODELS[@]}"; do
+  if [ "$model" != "$MODEL_SPEC" ] && [ "$MODEL_SPEC" != "all" ]; then
+    continue
+  fi
  echo "Running accuracy test for $model"
  python3 benchmarks/dynamo/"$model".py --ci --accuracy --timing --explain --inductor --device cuda \
    --output "$TEST_REPORTS_DIR"/inference_"$model".csv
@@ -25,6 +29,9 @@ done

 cd "$ROOT" || exit
 for model in "${MODELS[@]}"; do
+  if [ "$model" != "$MODEL_SPEC" ] && [ "$MODEL_SPEC" != "all" ]; then
+    continue
+  fi
  echo "Checking accuracy test for $model"
  python3 "$INDUCTOR"/scripts/check_acc.py "$TEST_REPORTS_DIR"/inference_"$model".csv
  python3 "$INDUCTOR"/scripts/check_acc.py "$TEST_REPORTS_DIR"/training_"$model".csv
--- a/.github/workflows/torch-inductor/scripts/run_torchinductor_perf.sh
+++ b/.github/workflows/torch-inductor/scripts/run_torchinductor_perf.sh
@@ -0,0 +1,69 @@
+#!/bin/bash
+
+# remember where we started
+ROOT="$(pwd)"
+INDUCTOR="$ROOT"/.github/workflows/torch-inductor
+MODEL_SPEC=$1
+
+# shellcheck source=/dev/null
+source /opt/torchinductor_venv/bin/activate
+# shellcheck source=/dev/null
+source "$INDUCTOR"/scripts/common.sh
+
+# lock GPU clocks to 1350 MHz
+sudo nvidia-smi -i 0 -pm 1
+sudo nvidia-smi -i 0 --lock-gpu-clocks=1350,1350
+
+cd "$PYTORCH_DIR" || exit
+TRITON_TEST_REPORTS_DIR=$TEST_REPORTS_DIR/perf
+BASE_TEST_REPORTS_DIR=$TEST_REPORTS_DIR/acc
+mkdir -p "$TRITON_TEST_REPORTS_DIR"
+mkdir -p "$BASE_TEST_REPORTS_DIR"
+
+
+echo "Running with Triton Nightly"
+for model in "${MODELS[@]}"; do
+  if [ "$model" != "$MODEL_SPEC" ] && [ "$MODEL_SPEC" != "all" ]; then
+    continue
+  fi
+  echo "Running performance test for $model"
+  python3 benchmarks/dynamo/"$model".py --float32 -dcuda --training --inductor --performance \
+    --output "$TRITON_TEST_REPORTS_DIR"/"$model".csv
+done
+
+# install pytorch-triton
+pip3 uninstall triton -y
+pip3 install --pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cu121
+
+echo "Running with pytorch-triton"
+for model in "${MODELS[@]}"; do
+  if [ "$model" != "$MODEL_SPEC" ] && [ "$MODEL_SPEC" != "all" ]; then
+    continue
+  fi
+  echo "Running performance test for $model"
+  python3 benchmarks/dynamo/"$model".py --float32 -dcuda --training --inductor --performance \
+    --output "$BASE_TEST_REPORTS_DIR"/"$model".csv
+done
+
+# uninstall pytorch-triton
+pip3 uninstall pytorch-triton -y
+
+cd "$ROOT" || exit
+for model in "${MODELS[@]}"; do  # compare each selected model's new results against its baseline
+  if [ "$model" != "$MODEL_SPEC" ] && [ "$MODEL_SPEC" != "all" ]; then
+    continue
+  fi
+  echo "Checking performance test for $model"
+  python3 "$INDUCTOR"/scripts/check_perf.py --new "$TRITON_TEST_REPORTS_DIR"/"$model".csv --baseline "$BASE_TEST_REPORTS_DIR"/"$model".csv || {
+    EXIT_STATUS=$?
+    echo "Performance test for $model failed"
+    sudo nvidia-smi -i 0 -rgc  # reset the GPU clocks locked earlier before bailing out
+    exit "$EXIT_STATUS"
+  }
+done
+
+# unlock GPU clocks
+sudo nvidia-smi -i 0 -rgc
+
+# go back to where we started
+cd "$ROOT" || exit
--- a/.github/workflows/torchinductor/data/huggingface.csv
+++ b/.github/workflows/torchinductor/data/huggingface.csv
@@ -1,37 +0,0 @@
-dev,name,batch_size,speedup,abs_latency,compilation_latency,compression_ratio
-cuda,AlbertForMaskedLM,4,1.5511,164.3373,26.8523,1.2647
-cuda,AlbertForQuestionAnswering,4,1.5501,163.5580,25.7983,1.3145
-cuda,BartForCausalLM,4,1.5080,71.7230,32.8907,0.9749
-cuda,BertForMaskedLM,16,1.5350,67.9451,35.3286,1.0494
-cuda,BertForQuestionAnswering,16,1.6735,53.2963,34.3754,1.1710
-cuda,BlenderbotSmallForCausalLM,64,1.2106,46.6466,23.8058,0.9120
-cuda,BlenderbotSmallForConditionalGeneration,64,1.3616,77.3013,55.3546,0.9803
-cuda,CamemBert,16,1.4779,76.1809,35.3883,1.0469
-cuda,DebertaForMaskedLM,4,0.8415,62.3395,35.9657,1.0418
-cuda,DebertaForQuestionAnswering,8,1.0609,67.5151,35.7728,1.1528
-cuda,DebertaV2ForMaskedLM,1,0.6026,134.6517,66.1783,0.9773
-cuda,DistilBertForMaskedLM,128,1.2460,66.9382,18.3089,0.9624
-cuda,DistilBertForQuestionAnswering,256,1.3997,72.4126,18.1956,1.1486
-cuda,DistillGPT2,16,1.6656,60.5455,17.2280,1.0641
-cuda,ElectraForCausalLM,32,1.8299,45.4841,37.0944,0.9717
-cuda,ElectraForQuestionAnswering,64,2.0289,52.6890,35.9632,1.1928
-cuda,GPT2ForSequenceClassification,4,2.2567,38.2969,30.0527,1.2323
-cuda,LayoutLMForMaskedLM,16,1.5423,68.8018,36.5562,1.0495
-cuda,LayoutLMForSequenceClassification,16,1.7058,53.9355,35.2225,1.1659
-cuda,MBartForCausalLM,4,1.4945,71.4649,32.8653,0.9830
-cuda,MegatronBertForCausalLM,4,1.4328,58.4404,70.6226,1.0951
-cuda,MegatronBertForQuestionAnswering,8,1.5886,85.2533,69.1219,1.1152
-cuda,MobileBertForMaskedLM,64,0.9007,131.7379,107.5275,1.0136
-cuda,MobileBertForQuestionAnswering,128,0.8435,167.9066,106.7049,0.8579
-cuda,PLBartForCausalLM,8,1.5261,68.9224,19.5826,0.9887
-cuda,PLBartForConditionalGeneration,4,1.5298,71.2811,45.6902,1.0495
-cuda,PegasusForCausalLM,32,1.2212,57.5436,33.3863,0.9736
-cuda,PegasusForConditionalGeneration,32,1.2822,106.4678,69.8825,1.0689
-cuda,RobertaForCausalLM,16,1.6128,67.5706,34.7355,1.0496
-cuda,RobertaForQuestionAnswering,16,1.6800,53.6267,33.8527,1.1704
-cuda,Speech2Text2ForCausalLM,256,1.8230,32.9145,18.7201,0.8760
-cuda,T5ForConditionalGeneration,4,1.6592,59.5324,39.4406,1.1814
-cuda,T5Small,4,1.6581,59.5930,37.0471,1.1814
-cuda,TrOCRForCausalLM,32,1.2586,106.2633,32.5330,0.9583
-cuda,XLNetLMHeadModel,8,1.8108,142.8795,84.8197,1.1240
-cuda,YituTechConvBert,16,1.5207,81.4595,53.1565,1.0362
--- a/.github/workflows/torchinductor/data/timm_models.csv
+++ b/.github/workflows/torchinductor/data/timm_models.csv
@@ -1,54 +0,0 @@
-dev,name,batch_size,speedup,abs_latency,compilation_latency,compression_ratio
-cuda,adv_inception_v3,128,1.5923,102.5292,51.6032,1.0472
-cuda,beit_base_patch16_224,64,1.3390,75.3027,29.7471,1.0156
-cuda,coat_lite_mini,128,2.0579,53.3689,37.1856,1.0437
-cuda,convmixer_768_32,32,1.0470,275.5328,23.8037,0.9999
-cuda,convnext_base,64,1.5084,80.1811,42.5659,1.0373
-cuda,crossvit_9_240,128,1.5392,37.1806,44.9986,0.9193
-cuda,cspdarknet53,64,1.4721,75.0403,35.2882,1.0547
-cuda,deit_base_distilled_patch16_224,64,1.1432,55.9737,23.4038,0.9816
-cuda,dla102,128,1.5282,123.7284,49.3612,1.0430
-cuda,dm_nfnet_f0,128,1.4354,79.7518,34.8994,1.1038
-cuda,dpn107,32,1.2412,83.8921,58.9111,0.9952
-cuda,eca_botnext26ts_256,128,1.5425,71.2406,28.8920,1.0270
-cuda,ese_vovnet19b_dw,128,1.4647,42.4837,18.0285,1.0135
-cuda,fbnetc_100,128,1.5795,53.8033,33.0222,1.0082
-cuda,gernet_l,128,1.1684,63.4230,26.8687,1.0053
-cuda,ghostnet_100,128,1.7812,54.4211,47.6168,1.0484
-cuda,gluon_inception_v3,128,1.5952,102.5018,50.0857,1.0469
-cuda,gmixer_24_224,128,1.6749,69.2430,42.0841,1.1921
-cuda,gmlp_s16_224,128,1.5886,79.2132,43.0142,1.2343
-cuda,hrnet_w18,128,1.3743,221.5304,134.2573,1.0100
-cuda,inception_v3,128,1.5847,102.8333,49.7648,1.0472
-cuda,jx_nest_base,32,1.3747,71.4190,61.4053,0.9905
-cuda,lcnet_050,128,1.8159,18.0047,18.8249,1.0005
-cuda,mixer_b16_224,128,1.2795,90.9229,21.0438,1.0133
-cuda,mixnet_l,128,1.2273,149.9722,47.7482,1.0129
-cuda,mnasnet_100,128,1.6594,40.0512,26.5165,1.0047
-cuda,mobilenetv2_100,128,1.6085,41.1217,27.4450,1.1731
-cuda,mobilenetv3_large_100,128,1.6610,37.9995,29.8185,1.0052
-cuda,mobilevit_s,64,1.5212,55.4152,53.6475,1.0258
-cuda,nfnet_l0,128,1.4927,65.7078,32.4067,0.9980
-cuda,pit_b_224,64,1.2286,57.9484,26.5321,0.9606
-cuda,pnasnet5large,16,1.0000,198.2494,93.4641,1.3184
-cuda,poolformer_m36,64,1.3486,103.9235,62.3196,1.1942
-cuda,regnety_002,128,1.3030,32.4968,27.2439,1.0014
-cuda,repvgg_a2,128,1.2485,59.7729,26.9209,1.0185
-cuda,res2net101_26w_4s,64,1.0813,94.1773,86.6520,0.9655
-cuda,res2net50_14w_8s,128,1.3251,109.5258,79.9578,0.9830
-cuda,res2next50,128,1.2518,125.5008,43.9754,0.9756
-cuda,resmlp_12_224,128,1.3060,45.2373,19.3709,1.1048
-cuda,resnest101e,64,1.4346,108.1945,78.1993,1.1037
-cuda,rexnet_100,128,1.4637,55.0121,41.2075,1.0862
-cuda,selecsls42b,128,1.4284,44.6645,23.3892,1.0139
-cuda,spnasnet_100,128,1.5908,45.3189,32.0148,1.0048
-cuda,swin_base_patch4_window7_224,64,1.6164,89.5854,75.5848,0.9299
-cuda,swsl_resnext101_32x16d,32,1.0175,110.0041,45.7853,1.0003
-cuda,tf_efficientnet_b0,128,1.5271,55.7361,34.5551,1.1079
-cuda,tf_mixnet_l,128,1.2369,155.9027,48.6695,1.0921
-cuda,tinynet_a,128,1.3792,53.0640,40.6346,1.1108
-cuda,tnt_s_patch16_224,128,3.1078,104.8486,59.6028,1.0660
-cuda,twins_pcpvt_base,64,1.5921,67.4600,84.4977,1.0909
-cuda,visformer_small,128,1.1952,72.8705,23.7303,1.0410
-cuda,vit_base_patch16_224,64,1.1309,56.4866,22.0208,0.9804
-cuda,volo_d1_224,64,1.6868,72.0957,65.3011,0.9729
--- a/.github/workflows/torchinductor/data/torchbench.csv
+++ b/.github/workflows/torchinductor/data/torchbench.csv
@@ -1,53 +0,0 @@
-dev,name,batch_size,speedup,abs_latency,compilation_latency,compression_ratio
-cuda,BERT_pytorch,16,1.7111,24.2741,35.7065,1.3212
-cuda,LearningToPaint,96,1.0513,10.7557,11.1879,0.9896
-cuda,Super_SloMo,6,1.3267,60.4328,28.2097,1.2392
-cuda,alexnet,128,1.1754,8.3246,5.3319,1.0003
-cuda,attention_is_all_you_need_pytorch,256,1.3416,36.4401,39.5927,1.1774
-cuda,dcgan,32,0.9151,2.6249,3.2964,1.0082
-cuda,densenet121,4,0.9225,51.3747,68.5841,0.9930
-cuda,doctr_det_predictor,0,0.0000
-cuda,doctr_reco_predictor,0,0.0000
-cuda,drq,1,0.9500,3.4884,4.8028,0.9687
-cuda,fastNLP_Bert,6,1.4328,34.7753,35.4863,1.2368
-cuda,functorch_dp_cifar10,64,1.2015,8.1625,12.9040,1.0609
-cuda,functorch_maml_omniglot,1,0.9322,2.5844,3.8640,1.0000
-cuda,hf_Albert,8,2.1228,30.3377,26.8282,1.2676
-cuda,hf_Bart,4,1.2899,39.1935,47.2373,1.0080
-cuda,hf_Bert,4,1.3262,26.1063,35.0281,1.0656
-cuda,hf_Bert_large,4,1.4163,55.1021,67.2825,1.0915
-cuda,hf_DistilBert,8,1.4051,21.7191,18.0399,1.0242
-cuda,hf_GPT2,4,1.6661,26.9039,29.9473,1.1555
-cuda,hf_Longformer,0,0.0000
-cuda,hf_Reformer,4,1.1709,64.6979,15.7035,0.9267
-cuda,hf_T5_large,2,1.7215,107.0798,148.8805,1.1684
-cuda,lennard_jones,1000,0.8428,1.8488,3.0609,1.0001
-cuda,maml_omniglot,32,0.9648,2.6869,3.9775,0.9999
-cuda,mnasnet1_0,32,1.0469,21.6251,25.8232,0.9996
-cuda,mobilenet_v2,96,1.5604,31.9572,27.0225,1.1734
-cuda,nvidia_deeprecommender,256,1.0605,9.2080,4.1318,0.9711
-cuda,phlippe_densenet,128,1.0237,27.5988,28.0400,1.0023
-cuda,phlippe_resnet,128,1.0493,10.9751,10.2485,1.0092
-cuda,pytorch_CycleGAN_and_pix2pix,1,1.3724,8.2225,11.9561,1.0219
-cuda,pytorch_stargan,16,1.1835,11.9178,10.0507,1.0868
-cuda,pytorch_unet,1,1.3787,29.7543,13.7711,1.0100
-cuda,resnet152,32,0.9834,63.2446,67.7935,0.9991
-cuda,resnet18,16,0.9451,9.4977,11.7663,0.9948
-cuda,resnet50,32,1.0513,24.5141,24.6629,1.0021
-cuda,resnext50_32x4d,8,0.9216,22.2460,24.3420,0.9984
-cuda,shufflenet_v2_x1_0,128,1.1943,25.4520,28.8611,1.0951
-cuda,soft_actor_critic,256,0.8691,1.9637,3.3716,0.9996
-cuda,speech_transformer,32,1.2718,35.2922,46.9957,1.0897
-cuda,squeezenet1_1,32,1.1302,8.4540,7.9625,1.0771
-cuda,timm_efficientdet,1,1.3370,80.0377,120.1814,1.2713
-cuda,timm_efficientnet,32,1.1874,27.6302,33.9059,1.0971
-cuda,timm_nfnet,128,1.4525,77.3461,34.3270,1.1056
-cuda,timm_regnet,32,1.0644,50.6953,35.7562,1.0000
-cuda,timm_resnest,32,1.6200,14.7763,17.2245,1.0906
-cuda,timm_vision_transformer,32,1.0800,19.4188,22.0255,0.9966
-cuda,timm_vision_transformer_large,32,1.0081,393.1742,127.8083,0.9735
-cuda,timm_vovnet,32,1.1472,22.4727,22.7328,1.0120
-cuda,torchrec_dlrm,0,0.0000
-cuda,tts_angular,64,0.8974,6.5057,2.5555,0.9973
-cuda,vgg16,64,1.2909,50.7405,6.1510,0.9828
-cuda,yolov3,16,1.2930,54.8069,41.9269,1.0563
--- a/.github/workflows/torchinductor/scripts/install_torchinductor.sh
+++ b/.github/workflows/torchinductor/scripts/install_torchinductor.sh
@@ -1,54 +0,0 @@
-#!/bin/bash
-
-# remember where we started
-ROOT="$(pwd)"
-
-# torchinductor venv
-whoami
-python3 -m venv /opt/torchinductor_venv
-# shellcheck source=/dev/null
-source /opt/torchinductor_venv/bin/activate
-# shellcheck source=/dev/null
-source ./.github/workflows/torchinductor/scripts/common.sh
-
-# pytorch nightly
-pip3 install --force-reinstall --pre torch torchtext torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cu118
-# pytorch source to get torchbench for dynamo
-cd /opt || exit
-git clone --recursive https://github.com/pytorch/pytorch
-cd pytorch || exit
-# if you are updating an existing checkout
-git submodule sync
-git submodule update --init --recursive
-cd ..
-
-# required packages
-pip3 install expecttest psutil
-
-# torchbench
-pip3 install pyyaml
-git clone https://github.com/pytorch/benchmark.git
-cd benchmark || exit
-python3 install.py
-cd ..
-
-# timm
-git clone https://github.com/huggingface/pytorch-image-models.git
-cd pytorch-image-models || exit
-pip3 install -e .
-cd ..
-
-# build our own triton
-cd "$ROOT" || exit
-cd python || exit
-rm -rf build
-pip3 install -e .
-pip3 uninstall pytorch-triton -y
-
-# clean up cache
-rm -rf /tmp/torchinductor_root/
-rm -rf ~/.triton/cache
-rm -rf "$TEST_REPORTS_DIR"
-
-# go back to where we started
-cd "$ROOT" || exit
--- a/.github/workflows/torchinductor/scripts/run_torchinductor_perf.sh
+++ b/.github/workflows/torchinductor/scripts/run_torchinductor_perf.sh
@@ -1,41 +0,0 @@
-#!/bin/bash
-
-# remember where we started
-ROOT="$(pwd)"
-INDUCTOR="$ROOT"/.github/workflows/torchinductor
-
-# shellcheck source=/dev/null
-source /opt/torchinductor_venv/bin/activate
-# shellcheck source=/dev/null
-source "$INDUCTOR"/scripts/common.sh
-
-# lock GPU clocks to 1350 MHz
-sudo nvidia-smi -i 0 -pm 1
-sudo nvidia-smi -i 0 --lock-gpu-clocks=1350,1350
-
-cd "$PYTORCH_DIR" || exit
-TEST_REPORTS_DIR=$TEST_REPORTS_DIR/perf
-mkdir -p "$TEST_REPORTS_DIR"
-
-for model in "${MODELS[@]}"; do
-  echo "Running performance test for $model"
-  python3 benchmarks/dynamo/"$model".py --ci --training --performance --disable-cudagraphs\
-    --device cuda --inductor --amp --output "$TEST_REPORTS_DIR"/"$model".csv
-done
-
-cd "$ROOT" || exit
-for model in "${MODELS[@]}"; do
-  echo "Checking performance test for $model"
-  python3 "$INDUCTOR"/scripts/check_perf.py --new "$TEST_REPORTS_DIR"/"$model".csv --baseline "$INDUCTOR"/data/"$model".csv
-  EXIT_STATUS=$?
-  if [ "$EXIT_STATUS" -ne 0 ]; then
-    echo "Performance test for $model failed"
-    exit "$EXIT_STATUS"
-  fi
-done
-
-# unlock GPU clocks
-sudo nvidia-smi -i 0 -rgc
-
-# go back to where we started
-cd "$ROOT" || exit
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -8,7 +8,7 @@ jobs:

  Build-Wheels:

-    runs-on: [self-hosted, V100]
+    runs-on: [self-hosted, CPU]
    permissions:
      id-token: write
      contents: read