Merge commit 'cb3d79a185e40c9d8a579bea07747a8a8d157d52' into ifu-231117

Conflicts:
	lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp
	lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVM.cpp
	lib/Dialect/TritonGPU/IR/Dialect.cpp
	python/setup.py
	python/test/unit/language/assert_helper.py
	python/test/unit/operators/test_flash_attention.py
	python/test/unit/runtime/test_subproc.py
	python/triton/compiler/compiler.py
	python/triton/language/semantic.py
	python/triton/runtime/autotuner.py
	python/triton/runtime/jit.py
	python/tutorials/03-matrix-multiplication.py
	python/tutorials/05-layer-norm.py
	python/tutorials/06-fused-attention.py
	python/tutorials/11-grouped-gemm.py
	test/Conversion/tritongpu_to_llvm.mlir
This commit is contained in:
Jason Furmanek
2023-11-17 20:42:12 +00:00
179 changed files with 10116 additions and 6835 deletions

View File

@@ -34,7 +34,7 @@ jobs:
fi
Integration-Tests-Nvidia:
Integration-Tests:
needs: Runner-Preparation
runs-on: ${{ matrix.runner }}
@@ -73,10 +73,10 @@ jobs:
run: |
cd python
python3 -m pip install --upgrade pip
python3 -m pip install cmake==3.24
python3 -m pip install ninja
python3 -m pip install --no-build-isolation -vvv '.[tests]'
python3 -m pip install pytest-xdist
python3 -m pip install cmake==3.24 ninja pytest-xdist
sudo apt-get update -y
sudo apt-get install -y ccache clang lld
TRITON_BUILD_WITH_CLANG_LLD=true TRITON_BUILD_WITH_CCACHE=true python3 -m pip install --no-build-isolation -vvv '.[tests]'
- name: Run lit tests
if: ${{ env.BACKEND == 'CUDA'}}
@@ -171,140 +171,8 @@ jobs:
python3 -m pytest -vs . --reruns 10
sudo nvidia-smi -i 0 -rgc
Integration-Tests-Shared-Middle-Layer:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v2
- name: Clear cache
run: |
rm -rf ~/.triton
- name: Update PATH
run: |
echo "PATH=${HOME}/.local/bin:${PATH}" >> "${GITHUB_ENV}"
- name: Check pre-commit
run: |
python3 -m pip install --upgrade pre-commit
python3 -m pre_commit run --all-files --verbose
- name: Install Triton
run: |
export TRITON_CODEGEN_TRITON_SHARED=1
git submodule update --init --recursive
cd python
python3 -m pip install --upgrade pip
python3 -m pip install cmake==3.24
python3 -m pip install ninja
python3 -m pip uninstall -y triton
python3 setup.py build
python3 -m pip install --no-build-isolation -vvv '.[tests]'
- name: Run shared middle-layer lit tests
run: |
python3 -m pip install lit
cd python
LIT_TEST_DIR="build/$(ls build | grep -i cmake)/third_party/triton_shared/test"
if [ ! -d "${LIT_TEST_DIR}" ]; then
echo "Coult not find '${LIT_TEST_DIR}'" ; exit -1
fi
lit -v "${LIT_TEST_DIR}"
Integration-Tests-Third-Party:
needs: Runner-Preparation
if: false
runs-on: ${{ matrix.runner }}
strategy:
matrix:
runner: ${{fromJson(needs.Runner-Preparation.outputs.matrix-optional)}}
steps:
- name: Checkout
uses: actions/checkout@v2
- name: Set ROCM ENV
if: ${{(matrix.runner[0] == 'self-hosted') && (matrix.runner[1] == 'gfx908')}}
run: |
echo "BACKEND=ROCM" >> "${GITHUB_ENV}"
- name: Set XPU ENV
if: ${{(matrix.runner[0] == 'self-hosted') && (matrix.runner[1] == 'arc770')}}
run: |
echo "BACKEND=XPU" >> "${GITHUB_ENV}"
- name: Clear cache
run: |
rm -rf ~/.triton
- name: Update PATH
run: |
echo "PATH=${HOME}/.local/bin:${PATH}" >> "${GITHUB_ENV}"
- name: Check pre-commit
if: ${{ matrix.runner != 'macos-10.15' && (matrix.runner[1] != 'arc770') }}
run: |
python3 -m pip install --upgrade pre-commit
python3 -m pre_commit run --all-files --verbose
- name: Check pre-commit arc770
if: ${{ matrix.runner != 'macos-10.15' && (matrix.runner[1] == 'arc770') }}
run: |
source ${HOME}/triton_vars.sh
source ${HOME}/miniconda3/bin/activate
conda activate triton-xpu-ci
python3 -m pip install --upgrade pre-commit
python3 -m pre_commit run --all-files
- name: Install Triton on ROCM
if: ${{ env.BACKEND == 'ROCM'}}
run: |
git submodule update --init --recursive
cd python
python3 -m pip install --upgrade pip
python3 -m pip install cmake==3.24
python3 -m pip install torch==1.13.1 --index-url https://download.pytorch.org/whl/rocm5.2
export TRITON_CODEGEN_AMD_HIP_BACKEND=1
python3 -m pip install --no-build-isolation -vvv '.[tests]'
- name: Install Triton on XPU
if: ${{ env.BACKEND == 'XPU'}}
run: |
source ${HOME}/triton_vars.sh
source ${HOME}/miniconda3/bin/activate
conda activate triton-xpu-ci
git submodule update --init --recursive
cd python
python3 -m pip install --upgrade pip
python3 -m pip install cmake==3.24
export TRITON_CODEGEN_INTEL_XPU_BACKEND=1
python3 -m pip uninstall -y triton
python3 setup.py build
python3 -m pip install --no-build-isolation -vvv '.[tests]'
- name: Run python tests on ROCM
if: ${{ env.BACKEND == 'ROCM'}}
run: |
cd python/test/unit/language
python3 -m pytest --capture=tee-sys -rfs --verbose "test_core.py"
- name: Run python tests on XPU
if: ${{ env.BACKEND == 'XPU'}}
run: |
source ${HOME}/triton_vars.sh
source ${HOME}/miniconda3/bin/activate
conda activate triton-xpu-ci
cd python/test/backend/third_party_backends
python3 -m pytest --capture=tee-sys -rfs --verbose --backend xpu
Compare-artifacts:
needs: Integration-Tests-Nvidia
needs: Integration-Tests
runs-on: ubuntu-latest
@@ -333,7 +201,7 @@ jobs:
- name: Download latest main artifacts
env:
ARTIFACT_NAME: artifacts A100
ARTIFACT_JOB_NAME: Integration-Tests-Nvidia
ARTIFACT_JOB_NAME: Integration-Tests
MAX_NUM_ACTIONS_PAGES: 30
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |

View File

@@ -18,43 +18,17 @@ permissions:
jobs:
build:
name: Build on ${{ matrix.config.runner }}
runs-on: ${{ matrix.config.runs_on }}
strategy:
fail-fast: true
matrix:
platform: [
ubuntu-20.04-x64,
ubuntu-22.04-x64,
centos-7-x64,
macos-x64,
macos-arm64
]
include:
# Specify OS versions
- platform: ubuntu-20.04-x64
host-os: ubuntu-20.04
target-os: ubuntu
arch: x64
- platform: ubuntu-22.04-x64
host-os: ubuntu-22.04
target-os: ubuntu
arch: x64
- platform: centos-7-x64
host-os: ubuntu-22.04
target-os: centos
arch: x64
- platform: macos-x64
host-os: macos-12
target-os: macos
arch: x64
- platform: macos-arm64
host-os: macos-12
target-os: macos
arch: arm64
runs-on: ${{ matrix.host-os }}
config:
- {runner: 'Ubuntu 20.04', runs_on: 'ubuntu-20.04', target-os: 'ubuntu', arch: 'x64'}
- {runner: 'CentOS 7', runs_on: ['self-hosted', 'CPU'], target-os: 'centos', arch: 'x64'}
- {runner: 'MacOS X64', runs_on: 'macos-12', target-os: 'macos', arch: 'x64'}
- {runner: 'MacOS ARM64', runs_on: 'macos-12', target-os: 'macos', arch: 'arm64'}
steps:
@@ -73,7 +47,7 @@ jobs:
echo "Short LLVM commit hash: ${SHORT_LLVM_COMMIT_HASH}"
echo "short_llvm_commit_hash=${SHORT_LLVM_COMMIT_HASH}" >> ${GITHUB_ENV}
INSTALL_DIR="llvm-${SHORT_LLVM_COMMIT_HASH}-${{ matrix.platform }}"
INSTALL_DIR="llvm-${SHORT_LLVM_COMMIT_HASH}-${{ matrix.config.target-os }}-${{ matrix.config.arch }}"
echo "LLVM installation directory name: ${INSTALL_DIR}"
echo "llvm_install_dir=${INSTALL_DIR}" >> ${GITHUB_ENV}
@@ -99,11 +73,11 @@ jobs:
uses: actions/cache@v3
with:
path: ${{ env.SCCACHE_DIR }}
key: ${{ matrix.platform }}-${{ env.short_llvm_commit_hash }}
restore-keys: ${{ matrix.platform }}-
key: ${{ matrix.config.target-os }}-${{ matrix.config.arch }}-${{ env.short_llvm_commit_hash }}
restore-keys: ${{ matrix.config.target-os }}-${{ matrix.config.arch }}-
- name: Configure, Build, Test, and Install LLVM (Ubuntu and macOS x64)
if: matrix.arch == 'x64' && contains(fromJSON('["ubuntu", "macos"]'), matrix.target-os)
if: matrix.config.arch == 'x64' && (matrix.config.target-os == 'ubuntu' || matrix.config.target-os == 'macos')
run: >
python3 -m pip install -r llvm-project/mlir/python/requirements.txt
@@ -114,11 +88,13 @@ jobs:
-DCMAKE_INSTALL_PREFIX="${{ env.llvm_install_dir }}"
-DCMAKE_LINKER=lld
-DLLVM_BUILD_UTILS=ON
-DLLVM_BUILD_TOOLS=ON
-DLLVM_ENABLE_ASSERTIONS=ON
-DMLIR_ENABLE_BINDINGS_PYTHON=ON
-DLLVM_ENABLE_PROJECTS=mlir
-DLLVM_INSTALL_UTILS=ON
-DLLVM_TARGETS_TO_BUILD="host;NVPTX;AMDGPU"
-DLLVM_ENABLE_TERMINFO=OFF
llvm-project/llvm
ninja -C llvm-project/build check-mlir install
@@ -126,7 +102,7 @@ jobs:
tar czf "${{ env.llvm_install_dir }}.tar.gz" "${{ env.llvm_install_dir }}"
- name: Configure, Build, and Install LLVM (macOS arm64)
if: matrix.arch == 'arm64' && matrix.target-os == 'macos'
if: matrix.config.arch == 'arm64' && matrix.config.target-os == 'macos'
run: >
python3 -m pip install -r llvm-project/mlir/python/requirements.txt
@@ -138,6 +114,7 @@ jobs:
-DCMAKE_LINKER=lld
-DCMAKE_OSX_ARCHITECTURES=arm64
-DLLVM_BUILD_UTILS=ON
-DLLVM_BUILD_TOOLS=ON
-DLLVM_ENABLE_ASSERTIONS=ON
-DMLIR_ENABLE_BINDINGS_PYTHON=ON
-DLLVM_ENABLE_PROJECTS=mlir
@@ -145,6 +122,7 @@ jobs:
-DLLVM_INSTALL_UTILS=ON
-DLLVM_TARGETS_TO_BUILD="AArch64"
-DLLVM_USE_HOST_TOOLS=ON
-DLLVM_ENABLE_TERMINFO=OFF
llvm-project/llvm
ninja -C llvm-project/build install
@@ -152,10 +130,10 @@ jobs:
tar czf "${{ env.llvm_install_dir }}.tar.gz" "${{ env.llvm_install_dir }}"
- name: Configure, Build, Test, and Install LLVM (CentOS)
if: matrix.target-os == 'centos'
if: matrix.config.target-os == 'centos'
run: |
docker build --tag llvm-build --build-arg llvm_dir=llvm-project \
-f llvm-build/.github/workflows/Dockerfile .
-f llvm-build/.github/workflows/llvm-build/Dockerfile .
# Create temporary container to copy cache and installed artifacts.
CONTAINER_ID=$(docker create llvm-build)

View File

@@ -1,16 +1,17 @@
FROM centos:7
ARG llvm_dir=llvm-project
# Add the cache artifacts and the LLVM source tree to the container
ADD sccache /sccache
ADD "${llvm_dir}" /source/llvm-project
ENV SCCACHE_DIR="/sccache"
ENV SCCACHE_CACHE_SIZE="2G"
RUN echo -e "[llvmtoolset-build]\nname=LLVM Toolset 13.0 - Build\nbaseurl=https://buildlogs.centos.org/c7-llvm-toolset-13.0.x86_64/\ngpgcheck=0\nenabled=1" > /etc/yum.repos.d/llvmtoolset-build.repo
# Install build dependencies
RUN yum install --assumeyes centos-release-scl
RUN yum install --assumeyes devtoolset-9-gcc* python3-devel python3-pip
SHELL [ "/usr/bin/scl", "enable", "devtoolset-9" ]
RUN yum install --assumeyes --nogpgcheck llvm-toolset-13.0
RUN yum install --assumeyes rh-python38-python-devel rh-python38-python-pip
SHELL [ "/usr/bin/scl", "enable", "llvm-toolset-13.0", "rh-python38" ]
RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --upgrade cmake ninja sccache
@@ -21,17 +22,22 @@ RUN python3 -m pip install -r /source/llvm-project/mlir/python/requirements.txt
# Configure, Build, Test, and Install LLVM
RUN cmake -GNinja -Bbuild \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_C_COMPILER=gcc \
-DCMAKE_CXX_COMPILER=g++ \
-DCMAKE_C_COMPILER=clang \
-DCMAKE_CXX_COMPILER=clang++ \
-DCMAKE_ASM_COMPILER=clang \
-DCMAKE_C_COMPILER_LAUNCHER=sccache \
-DCMAKE_CXX_COMPILER_LAUNCHER=sccache \
-DCMAKE_CXX_FLAGS="-Wno-everything" \
-DCMAKE_LINKER=lld \
-DCMAKE_INSTALL_PREFIX="/install" \
-DLLVM_BUILD_UTILS=ON \
-DLLVM_BUILD_TOOLS=ON \
-DLLVM_ENABLE_ASSERTIONS=ON \
-DMLIR_ENABLE_BINDINGS_PYTHON=ON \
-DLLVM_ENABLE_PROJECTS=mlir \
-DLLVM_ENABLE_TERMINFO=OFF \
-DLLVM_INSTALL_UTILS=ON \
-DLLVM_TARGETS_TO_BUILD="host;NVPTX;AMDGPU" \
/source/llvm-project/llvm
RUN ninja -C build check-mlir install
RUN ninja -C build install

View File

@@ -0,0 +1,167 @@
# Integration test workflow covering the shared middle-layer build and the
# (currently disabled) third-party backend jobs.
name: Integration Tests

on:
  workflow_dispatch:
  pull_request:
    branches: [main]
  merge_group:
    branches: [main]
    types: [checks_requested]

concurrency:
  group: ${{ github.ref }}
  # Never cancel runs on the default branch. The triggers above target `main`;
  # `master` is kept in the guard for backward compatibility.
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/master' }}

env:
  TRITON_USE_ASSERT_ENABLED_LLVM: "TRUE"

jobs:
  # Publishes the runner matrices consumed by the downstream jobs.
  Runner-Preparation:
    runs-on: ubuntu-latest
    outputs:
      matrix-required: ${{ steps.set-matrix.outputs.matrix-required }}
      matrix-optional: ${{ steps.set-matrix.outputs.matrix-optional }}
    steps:
      - name: Prepare runner matrix
        id: set-matrix
        # Step outputs are written to $GITHUB_OUTPUT; the `::set-output`
        # workflow command is deprecated.
        run: |
          if [ x"${{ github.repository }}" == x"openai/triton" ]; then
            echo 'matrix-required=[["self-hosted", "A100"], ["self-hosted", "H100"]]' >> "${GITHUB_OUTPUT}"
            echo 'matrix-optional=[["self-hosted", "gfx908"], ["self-hosted", "arc770"]]' >> "${GITHUB_OUTPUT}"
          else
            echo 'matrix-required=["ubuntu-latest"]' >> "${GITHUB_OUTPUT}"
            echo 'matrix-optional=["ubuntu-latest"]' >> "${GITHUB_OUTPUT}"
          fi

  # Builds Triton with TRITON_CODEGEN_TRITON_SHARED=1 and runs the
  # triton_shared lit tests from the resulting build tree.
  Integration-Tests-Shared-Middle-Layer:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v2

      - name: Clear cache
        run: |
          rm -rf ~/.triton

      - name: Update PATH
        run: |
          echo "PATH=${HOME}/.local/bin:${PATH}" >> "${GITHUB_ENV}"

      - name: Check pre-commit
        run: |
          python3 -m pip install --upgrade pre-commit
          python3 -m pre_commit run --all-files --verbose

      - name: Install Triton
        run: |
          export TRITON_CODEGEN_TRITON_SHARED=1
          git submodule update --init --recursive
          cd python
          python3 -m pip install --upgrade pip
          python3 -m pip install cmake==3.24
          python3 -m pip install ninja
          python3 -m pip uninstall -y triton
          python3 setup.py build
          python3 -m pip install --no-build-isolation -vvv '.[tests]'

      - name: Run shared middle-layer lit tests
        run: |
          python3 -m pip install lit
          cd python
          # The cmake build directory name is not fixed, so look it up.
          LIT_TEST_DIR="build/$(ls build | grep -i cmake)/third_party/triton_shared/test"
          if [ ! -d "${LIT_TEST_DIR}" ]; then
            echo "Could not find '${LIT_TEST_DIR}'" ; exit 1
          fi
          lit -v "${LIT_TEST_DIR}"

  # Third-party backend tests (ROCM / XPU). Disabled via `if: false`.
  Integration-Tests-Third-Party:
    needs: Runner-Preparation
    if: false
    runs-on: ${{ matrix.runner }}
    strategy:
      matrix:
        runner: ${{fromJson(needs.Runner-Preparation.outputs.matrix-optional)}}
    steps:
      - name: Checkout
        uses: actions/checkout@v2

      # BACKEND is derived from the second label of the self-hosted runner.
      - name: Set ROCM ENV
        if: ${{(matrix.runner[0] == 'self-hosted') && (matrix.runner[1] == 'gfx908')}}
        run: |
          echo "BACKEND=ROCM" >> "${GITHUB_ENV}"

      - name: Set XPU ENV
        if: ${{(matrix.runner[0] == 'self-hosted') && (matrix.runner[1] == 'arc770')}}
        run: |
          echo "BACKEND=XPU" >> "${GITHUB_ENV}"

      - name: Clear cache
        run: |
          rm -rf ~/.triton

      - name: Update PATH
        run: |
          echo "PATH=${HOME}/.local/bin:${PATH}" >> "${GITHUB_ENV}"

      - name: Check pre-commit
        if: ${{ matrix.runner != 'macos-10.15' && (matrix.runner[1] != 'arc770') }}
        run: |
          python3 -m pip install --upgrade pre-commit
          python3 -m pre_commit run --all-files --verbose

      # The arc770 runner activates its conda environment before running pre-commit.
      - name: Check pre-commit arc770
        if: ${{ matrix.runner != 'macos-10.15' && (matrix.runner[1] == 'arc770') }}
        run: |
          source ${HOME}/triton_vars.sh
          source ${HOME}/miniconda3/bin/activate
          conda activate triton-xpu-ci
          python3 -m pip install --upgrade pre-commit
          python3 -m pre_commit run --all-files

      - name: Install Triton on ROCM
        if: ${{ env.BACKEND == 'ROCM'}}
        run: |
          git submodule update --init --recursive
          cd python
          python3 -m pip install --upgrade pip
          python3 -m pip install cmake==3.24
          python3 -m pip install torch==1.13.1 --index-url https://download.pytorch.org/whl/rocm5.2
          export TRITON_CODEGEN_AMD_HIP_BACKEND=1
          python3 -m pip install --no-build-isolation -vvv '.[tests]'

      - name: Install Triton on XPU
        if: ${{ env.BACKEND == 'XPU'}}
        run: |
          source ${HOME}/triton_vars.sh
          source ${HOME}/miniconda3/bin/activate
          conda activate triton-xpu-ci
          git submodule update --init --recursive
          cd python
          python3 -m pip install --upgrade pip
          python3 -m pip install cmake==3.24
          export TRITON_CODEGEN_INTEL_XPU_BACKEND=1
          python3 -m pip uninstall -y triton
          python3 setup.py build
          python3 -m pip install --no-build-isolation -vvv '.[tests]'

      - name: Run python tests on ROCM
        if: ${{ env.BACKEND == 'ROCM'}}
        run: |
          cd python/test/unit/language
          python3 -m pytest --capture=tee-sys -rfs --verbose "test_core.py"

      - name: Run python tests on XPU
        if: ${{ env.BACKEND == 'XPU'}}
        run: |
          source ${HOME}/triton_vars.sh
          source ${HOME}/miniconda3/bin/activate
          conda activate triton-xpu-ci
          cd python/test/backend/third_party_backends
          python3 -m pytest --capture=tee-sys -rfs --verbose --backend xpu

View File

@@ -1,7 +1,9 @@
name: Torchinductor
on:
workflow_dispatch:
workflow_run:
workflows: ["Wheel"]
types: [completed]
jobs:
Runner-Preparation:
@@ -23,17 +25,17 @@ jobs:
steps:
- name: Checkout
uses: actions/checkout@v2
#- name: Packages
# run: |
# ./.github/workflows/torchinductor/scripts/install_torchinductor.sh
- name: Packages
run: |
./.github/workflows/torch-inductor/scripts/install_torchinductor.sh torchbench
- name: Environment
run: |
source /opt/torchinductor_venv/bin/activate
./.github/workflows/torchinductor/scripts/install_triton.sh
./.github/workflows/torch-inductor/scripts/install_triton.sh
- name: Performance
run: |
./.github/workflows/torchinductor/scripts/run_torchinductor_perf.sh
./.github/workflows/torch-inductor/scripts/run_torchinductor_perf.sh torchbench
# Runs too long time
#- name: Accuracy
# run: |
# ./.github/workflows/torchinductor/scripts/run_torchinductor_acc.sh
# ./.github/workflows/torch-inductor/scripts/run_torchinductor_acc.sh

View File

@@ -3,8 +3,7 @@ import csv
from collections import namedtuple
# Create a named tuple for the output of the benchmark
BenchmarkOutput = namedtuple(
'BenchmarkOutput', ['dev', 'name', 'batch_size', 'speedup', 'latency'])
BenchmarkOutput = namedtuple('BenchmarkOutput', ['dev', 'name', 'batch_size', 'speedup', 'latency'])
def parse_output(file_path: str) -> dict:
@@ -19,13 +18,11 @@ def parse_output(file_path: str) -> dict:
batch_size = row[2]
speedup = float(row[3])
latency = float(row[4])
entries[name] = BenchmarkOutput(
dev, name, batch_size, speedup, latency)
entries[name] = BenchmarkOutput(dev, name, batch_size, speedup, latency)
return entries
def compare(baseline: dict, new: dict, threshold: float,
geomean_threshold: float) -> bool:
def compare(baseline: dict, new: dict, threshold: float, geomean_threshold: float) -> bool:
baseline_geomean = 1.0
new_geomean = 1.0
for key in new:
@@ -33,20 +30,27 @@ def compare(baseline: dict, new: dict, threshold: float,
print(f"New benchmark {key} not found in baseline")
baseline_latency = baseline[key].latency
new_latency = new[key].latency
if baseline_latency == 0:
print(f"Baseline latency for {key} is 0")
continue
elif new_latency == 0:
print(f"New latency for {key} is 0")
continue
if new_latency < baseline_latency * (1 - threshold):
print(
f"New benchmark {key} is faster than baseline: {new_latency} vs {baseline_latency}")
print(f"New benchmark {key} is faster than baseline: {new_latency} vs {baseline_latency}")
elif new_latency > baseline_latency * (1 + threshold):
print(
f"New benchmark {key} is slower than baseline: {new_latency} vs {baseline_latency}")
print(f"New benchmark {key} is slower than baseline: {new_latency} vs {baseline_latency}")
else:
print(f"New benchmark {key} is within threshold: {new_latency} vs {baseline_latency}")
baseline_geomean *= baseline[key].speedup
new_geomean *= new[key].speedup
baseline_geomean = baseline_geomean ** (1 / len(baseline))
new_geomean = new_geomean ** (1 / len(new))
baseline_geomean = baseline_geomean**(1 / len(baseline))
new_geomean = new_geomean**(1 / len(new))
print(f"Baseline geomean: {baseline_geomean}")
print(f"New geomean: {new_geomean}")
assert new_geomean > baseline_geomean * (1 - geomean_threshold), \
assert new_geomean >= baseline_geomean * (1 - geomean_threshold), \
f"New geomean is slower than baseline: {new_geomean} vs {baseline_geomean}"

View File

@@ -0,0 +1,70 @@
#!/bin/bash
#
# Sets up the torchinductor benchmarking environment under /opt:
# a fresh venv, PyTorch nightly, a PyTorch source checkout, the requested
# benchmark suites, and an editable install of the Triton in this checkout.
#
# Usage: install_torchinductor.sh [MODEL_SPEC]
#   MODEL_SPEC  "torchbench", "timm_models", or "all" (default: "all").

# remember where we started
ROOT="$(pwd)"
# Default to "all" so that calling the script without an argument still
# installs every suite.
MODEL_SPEC="${1:-all}"

# torchinductor venv
whoami
# clean up old venv
rm -rf /opt/torchinductor_venv
python3 -m venv /opt/torchinductor_venv
# shellcheck source=/dev/null
source /opt/torchinductor_venv/bin/activate
# shellcheck source=/dev/null
source ./.github/workflows/torch-inductor/scripts/common.sh

# pytorch nightly
pip3 install --force-reinstall --pre torch torchtext torchvision torchaudio torchrec --extra-index-url https://download.pytorch.org/whl/nightly/cu121

# pytorch source to get torchbench for dynamo
cd /opt || exit
# cleanup old pytorch
rm -rf pytorch
git clone --recursive https://github.com/pytorch/pytorch
cd pytorch || exit
# if you are updating an existing checkout
git submodule sync
git submodule update --init --recursive
cd ..

# required packages
# https://github.com/pytorch/benchmark/blob/main/docker/gcp-a100-runner-dind.dockerfile#L17
sudo apt-get install --yes libpango-1.0-0 libpangoft2-1.0-0
pip3 install --upgrade pip
pip3 install expecttest psutil lightning-utilities pyre_extensions

# torchbench
# Install a suite when it was requested by name or when "all" was requested.
# (The previous `!= "all"` test skipped every suite for "all" and installed
# every suite for any single-suite request.)
if [ "$MODEL_SPEC" == "torchbench" ] || [ "$MODEL_SPEC" == "all" ]; then
  # clean up old torchbench
  rm -rf benchmark
  pip3 install pyyaml
  git clone https://github.com/pytorch/benchmark.git
  cd benchmark || exit
  python3 install.py
  cd ..
fi

# timm
if [ "$MODEL_SPEC" == "timm_models" ] || [ "$MODEL_SPEC" == "all" ]; then
  # clean up old timm
  rm -rf pytorch-image-models
  git clone https://github.com/huggingface/pytorch-image-models.git
  cd pytorch-image-models || exit
  pip3 install -e .
  cd ..
fi

# build our own triton
cd "$ROOT" || exit
cd python || exit
rm -rf build
pip3 install -e .
pip3 uninstall pytorch-triton -y

# clean up cache
rm -rf /tmp/torchinductor_root/
rm -rf ~/.triton/cache
rm -rf "$TEST_REPORTS_DIR"

# go back to where we started
cd "$ROOT" || exit

View File

@@ -6,7 +6,7 @@ ROOT="$(pwd)"
# shellcheck source=/dev/null
source /opt/torchinductor_venv/bin/activate
# shellcheck source=/dev/null
source ./.github/workflows/torchinductor/scripts/common.sh
source ./.github/workflows/torch-inductor/scripts/common.sh
# build our own triton
cd python || exit

View File

@@ -2,7 +2,8 @@
# remember where we started
ROOT="$(pwd)"
INDUCTOR="$ROOT"/.github/workflows/torchinductor
INDUCTOR="$ROOT"/.github/workflows/torch-inductor
MODEL_SPEC=$1
# shellcheck source=/dev/null
source /opt/torchinductor_venv/bin/activate
@@ -14,6 +15,9 @@ TEST_REPORTS_DIR=$TEST_REPORTS_DIR/acc
mkdir -p "$TEST_REPORTS_DIR"
for model in "${MODELS[@]}"; do
if [ "$model" != "$MODEL_SPEC" ] && [ "$MODEL_SPEC" != "all" ]; then
continue
fi
echo "Running accuracy test for $model"
python3 benchmarks/dynamo/"$model".py --ci --accuracy --timing --explain --inductor --device cuda \
--output "$TEST_REPORTS_DIR"/inference_"$model".csv
@@ -25,6 +29,9 @@ done
cd "$ROOT" || exit
for model in "${MODELS[@]}"; do
if [ "$model" != "$MODEL_SPEC" ] && [ "$MODEL_SPEC" != "all" ]; then
continue
fi
echo "Checking accuracy test for $model"
python3 "$INDUCTOR"/scripts/check_acc.py "$TEST_REPORTS_DIR"/inference_"$model".csv
python3 "$INDUCTOR"/scripts/check_acc.py "$TEST_REPORTS_DIR"/training_"$model".csv

View File

@@ -0,0 +1,69 @@
#!/bin/bash
#
# Runs the torchinductor performance benchmarks twice -- first with the
# currently installed Triton, then with the pytorch-triton pulled in by the
# PyTorch nightly -- and compares the two result sets with check_perf.py.
#
# Usage: run_torchinductor_perf.sh MODEL_SPEC
#   MODEL_SPEC  a single entry of MODELS, or "all" to run every entry.
#
# MODELS, PYTORCH_DIR and TEST_REPORTS_DIR are not set in this script; they are
# presumably provided by common.sh -- confirm there.

# remember where we started
ROOT="$(pwd)"
INDUCTOR="$ROOT"/.github/workflows/torch-inductor
MODEL_SPEC=$1

# shellcheck source=/dev/null
source /opt/torchinductor_venv/bin/activate
# shellcheck source=/dev/null
source "$INDUCTOR"/scripts/common.sh

# lock GPU clocks to 1350 MHz
sudo nvidia-smi -i 0 -pm 1
# Always unlock the clocks when the script exits, including the early exits
# below (failed `cd`, failed performance check). The EXIT trap does not alter
# the script's exit status.
trap 'sudo nvidia-smi -i 0 -rgc' EXIT
sudo nvidia-smi -i 0 --lock-gpu-clocks=1350,1350

cd "$PYTORCH_DIR" || exit
TRITON_TEST_REPORTS_DIR=$TEST_REPORTS_DIR/perf
BASE_TEST_REPORTS_DIR=$TEST_REPORTS_DIR/acc
mkdir -p "$TRITON_TEST_REPORTS_DIR"
mkdir -p "$BASE_TEST_REPORTS_DIR"

echo "Running with Triton Nightly"
for model in "${MODELS[@]}"; do
  # Skip models that were not requested.
  if [ "$model" != "$MODEL_SPEC" ] && [ "$MODEL_SPEC" != "all" ]; then
    continue
  fi
  echo "Running performance test for $model"
  python3 benchmarks/dynamo/"$model".py --float32 -dcuda --training --inductor --performance \
    --output "$TRITON_TEST_REPORTS_DIR"/"$model".csv
done

# install pytorch-triton
pip3 uninstall triton -y
pip3 install --pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cu121

echo "Running with pytorch-triton"
for model in "${MODELS[@]}"; do
  if [ "$model" != "$MODEL_SPEC" ] && [ "$MODEL_SPEC" != "all" ]; then
    continue
  fi
  echo "Running performance test for $model"
  python3 benchmarks/dynamo/"$model".py --float32 -dcuda --training --inductor --performance \
    --output "$BASE_TEST_REPORTS_DIR"/"$model".csv
done

# uninstall pytorch-triton
pip3 uninstall pytorch-triton -y

cd "$ROOT" || exit
for model in "${MODELS[@]}"; do
  if [ "$model" != "$MODEL_SPEC" ] && [ "$MODEL_SPEC" != "all" ]; then
    continue
  fi
  echo "Checking performance test for $model"
  python3 "$INDUCTOR"/scripts/check_perf.py --new "$TRITON_TEST_REPORTS_DIR"/"$model".csv --baseline "$BASE_TEST_REPORTS_DIR"/"$model".csv
  EXIT_STATUS=$?
  if [ "$EXIT_STATUS" -ne 0 ]; then
    echo "Performance test for $model failed"
    # GPU clocks are unlocked by the EXIT trap.
    exit "$EXIT_STATUS"
  fi
done

# GPU clocks are unlocked by the EXIT trap installed above.

# go back to where we started
cd "$ROOT" || exit

View File

@@ -1,37 +0,0 @@
dev,name,batch_size,speedup,abs_latency,compilation_latency,compression_ratio
cuda,AlbertForMaskedLM,4,1.5511,164.3373,26.8523,1.2647
cuda,AlbertForQuestionAnswering,4,1.5501,163.5580,25.7983,1.3145
cuda,BartForCausalLM,4,1.5080,71.7230,32.8907,0.9749
cuda,BertForMaskedLM,16,1.5350,67.9451,35.3286,1.0494
cuda,BertForQuestionAnswering,16,1.6735,53.2963,34.3754,1.1710
cuda,BlenderbotSmallForCausalLM,64,1.2106,46.6466,23.8058,0.9120
cuda,BlenderbotSmallForConditionalGeneration,64,1.3616,77.3013,55.3546,0.9803
cuda,CamemBert,16,1.4779,76.1809,35.3883,1.0469
cuda,DebertaForMaskedLM,4,0.8415,62.3395,35.9657,1.0418
cuda,DebertaForQuestionAnswering,8,1.0609,67.5151,35.7728,1.1528
cuda,DebertaV2ForMaskedLM,1,0.6026,134.6517,66.1783,0.9773
cuda,DistilBertForMaskedLM,128,1.2460,66.9382,18.3089,0.9624
cuda,DistilBertForQuestionAnswering,256,1.3997,72.4126,18.1956,1.1486
cuda,DistillGPT2,16,1.6656,60.5455,17.2280,1.0641
cuda,ElectraForCausalLM,32,1.8299,45.4841,37.0944,0.9717
cuda,ElectraForQuestionAnswering,64,2.0289,52.6890,35.9632,1.1928
cuda,GPT2ForSequenceClassification,4,2.2567,38.2969,30.0527,1.2323
cuda,LayoutLMForMaskedLM,16,1.5423,68.8018,36.5562,1.0495
cuda,LayoutLMForSequenceClassification,16,1.7058,53.9355,35.2225,1.1659
cuda,MBartForCausalLM,4,1.4945,71.4649,32.8653,0.9830
cuda,MegatronBertForCausalLM,4,1.4328,58.4404,70.6226,1.0951
cuda,MegatronBertForQuestionAnswering,8,1.5886,85.2533,69.1219,1.1152
cuda,MobileBertForMaskedLM,64,0.9007,131.7379,107.5275,1.0136
cuda,MobileBertForQuestionAnswering,128,0.8435,167.9066,106.7049,0.8579
cuda,PLBartForCausalLM,8,1.5261,68.9224,19.5826,0.9887
cuda,PLBartForConditionalGeneration,4,1.5298,71.2811,45.6902,1.0495
cuda,PegasusForCausalLM,32,1.2212,57.5436,33.3863,0.9736
cuda,PegasusForConditionalGeneration,32,1.2822,106.4678,69.8825,1.0689
cuda,RobertaForCausalLM,16,1.6128,67.5706,34.7355,1.0496
cuda,RobertaForQuestionAnswering,16,1.6800,53.6267,33.8527,1.1704
cuda,Speech2Text2ForCausalLM,256,1.8230,32.9145,18.7201,0.8760
cuda,T5ForConditionalGeneration,4,1.6592,59.5324,39.4406,1.1814
cuda,T5Small,4,1.6581,59.5930,37.0471,1.1814
cuda,TrOCRForCausalLM,32,1.2586,106.2633,32.5330,0.9583
cuda,XLNetLMHeadModel,8,1.8108,142.8795,84.8197,1.1240
cuda,YituTechConvBert,16,1.5207,81.4595,53.1565,1.0362
1 dev name batch_size speedup abs_latency compilation_latency compression_ratio
2 cuda AlbertForMaskedLM 4 1.5511 164.3373 26.8523 1.2647
3 cuda AlbertForQuestionAnswering 4 1.5501 163.5580 25.7983 1.3145
4 cuda BartForCausalLM 4 1.5080 71.7230 32.8907 0.9749
5 cuda BertForMaskedLM 16 1.5350 67.9451 35.3286 1.0494
6 cuda BertForQuestionAnswering 16 1.6735 53.2963 34.3754 1.1710
7 cuda BlenderbotSmallForCausalLM 64 1.2106 46.6466 23.8058 0.9120
8 cuda BlenderbotSmallForConditionalGeneration 64 1.3616 77.3013 55.3546 0.9803
9 cuda CamemBert 16 1.4779 76.1809 35.3883 1.0469
10 cuda DebertaForMaskedLM 4 0.8415 62.3395 35.9657 1.0418
11 cuda DebertaForQuestionAnswering 8 1.0609 67.5151 35.7728 1.1528
12 cuda DebertaV2ForMaskedLM 1 0.6026 134.6517 66.1783 0.9773
13 cuda DistilBertForMaskedLM 128 1.2460 66.9382 18.3089 0.9624
14 cuda DistilBertForQuestionAnswering 256 1.3997 72.4126 18.1956 1.1486
15 cuda DistillGPT2 16 1.6656 60.5455 17.2280 1.0641
16 cuda ElectraForCausalLM 32 1.8299 45.4841 37.0944 0.9717
17 cuda ElectraForQuestionAnswering 64 2.0289 52.6890 35.9632 1.1928
18 cuda GPT2ForSequenceClassification 4 2.2567 38.2969 30.0527 1.2323
19 cuda LayoutLMForMaskedLM 16 1.5423 68.8018 36.5562 1.0495
20 cuda LayoutLMForSequenceClassification 16 1.7058 53.9355 35.2225 1.1659
21 cuda MBartForCausalLM 4 1.4945 71.4649 32.8653 0.9830
22 cuda MegatronBertForCausalLM 4 1.4328 58.4404 70.6226 1.0951
23 cuda MegatronBertForQuestionAnswering 8 1.5886 85.2533 69.1219 1.1152
24 cuda MobileBertForMaskedLM 64 0.9007 131.7379 107.5275 1.0136
25 cuda MobileBertForQuestionAnswering 128 0.8435 167.9066 106.7049 0.8579
26 cuda PLBartForCausalLM 8 1.5261 68.9224 19.5826 0.9887
27 cuda PLBartForConditionalGeneration 4 1.5298 71.2811 45.6902 1.0495
28 cuda PegasusForCausalLM 32 1.2212 57.5436 33.3863 0.9736
29 cuda PegasusForConditionalGeneration 32 1.2822 106.4678 69.8825 1.0689
30 cuda RobertaForCausalLM 16 1.6128 67.5706 34.7355 1.0496
31 cuda RobertaForQuestionAnswering 16 1.6800 53.6267 33.8527 1.1704
32 cuda Speech2Text2ForCausalLM 256 1.8230 32.9145 18.7201 0.8760
33 cuda T5ForConditionalGeneration 4 1.6592 59.5324 39.4406 1.1814
34 cuda T5Small 4 1.6581 59.5930 37.0471 1.1814
35 cuda TrOCRForCausalLM 32 1.2586 106.2633 32.5330 0.9583
36 cuda XLNetLMHeadModel 8 1.8108 142.8795 84.8197 1.1240
37 cuda YituTechConvBert 16 1.5207 81.4595 53.1565 1.0362

View File

@@ -1,54 +0,0 @@
dev,name,batch_size,speedup,abs_latency,compilation_latency,compression_ratio
cuda,adv_inception_v3,128,1.5923,102.5292,51.6032,1.0472
cuda,beit_base_patch16_224,64,1.3390,75.3027,29.7471,1.0156
cuda,coat_lite_mini,128,2.0579,53.3689,37.1856,1.0437
cuda,convmixer_768_32,32,1.0470,275.5328,23.8037,0.9999
cuda,convnext_base,64,1.5084,80.1811,42.5659,1.0373
cuda,crossvit_9_240,128,1.5392,37.1806,44.9986,0.9193
cuda,cspdarknet53,64,1.4721,75.0403,35.2882,1.0547
cuda,deit_base_distilled_patch16_224,64,1.1432,55.9737,23.4038,0.9816
cuda,dla102,128,1.5282,123.7284,49.3612,1.0430
cuda,dm_nfnet_f0,128,1.4354,79.7518,34.8994,1.1038
cuda,dpn107,32,1.2412,83.8921,58.9111,0.9952
cuda,eca_botnext26ts_256,128,1.5425,71.2406,28.8920,1.0270
cuda,ese_vovnet19b_dw,128,1.4647,42.4837,18.0285,1.0135
cuda,fbnetc_100,128,1.5795,53.8033,33.0222,1.0082
cuda,gernet_l,128,1.1684,63.4230,26.8687,1.0053
cuda,ghostnet_100,128,1.7812,54.4211,47.6168,1.0484
cuda,gluon_inception_v3,128,1.5952,102.5018,50.0857,1.0469
cuda,gmixer_24_224,128,1.6749,69.2430,42.0841,1.1921
cuda,gmlp_s16_224,128,1.5886,79.2132,43.0142,1.2343
cuda,hrnet_w18,128,1.3743,221.5304,134.2573,1.0100
cuda,inception_v3,128,1.5847,102.8333,49.7648,1.0472
cuda,jx_nest_base,32,1.3747,71.4190,61.4053,0.9905
cuda,lcnet_050,128,1.8159,18.0047,18.8249,1.0005
cuda,mixer_b16_224,128,1.2795,90.9229,21.0438,1.0133
cuda,mixnet_l,128,1.2273,149.9722,47.7482,1.0129
cuda,mnasnet_100,128,1.6594,40.0512,26.5165,1.0047
cuda,mobilenetv2_100,128,1.6085,41.1217,27.4450,1.1731
cuda,mobilenetv3_large_100,128,1.6610,37.9995,29.8185,1.0052
cuda,mobilevit_s,64,1.5212,55.4152,53.6475,1.0258
cuda,nfnet_l0,128,1.4927,65.7078,32.4067,0.9980
cuda,pit_b_224,64,1.2286,57.9484,26.5321,0.9606
cuda,pnasnet5large,16,1.0000,198.2494,93.4641,1.3184
cuda,poolformer_m36,64,1.3486,103.9235,62.3196,1.1942
cuda,regnety_002,128,1.3030,32.4968,27.2439,1.0014
cuda,repvgg_a2,128,1.2485,59.7729,26.9209,1.0185
cuda,res2net101_26w_4s,64,1.0813,94.1773,86.6520,0.9655
cuda,res2net50_14w_8s,128,1.3251,109.5258,79.9578,0.9830
cuda,res2next50,128,1.2518,125.5008,43.9754,0.9756
cuda,resmlp_12_224,128,1.3060,45.2373,19.3709,1.1048
cuda,resnest101e,64,1.4346,108.1945,78.1993,1.1037
cuda,rexnet_100,128,1.4637,55.0121,41.2075,1.0862
cuda,selecsls42b,128,1.4284,44.6645,23.3892,1.0139
cuda,spnasnet_100,128,1.5908,45.3189,32.0148,1.0048
cuda,swin_base_patch4_window7_224,64,1.6164,89.5854,75.5848,0.9299
cuda,swsl_resnext101_32x16d,32,1.0175,110.0041,45.7853,1.0003
cuda,tf_efficientnet_b0,128,1.5271,55.7361,34.5551,1.1079
cuda,tf_mixnet_l,128,1.2369,155.9027,48.6695,1.0921
cuda,tinynet_a,128,1.3792,53.0640,40.6346,1.1108
cuda,tnt_s_patch16_224,128,3.1078,104.8486,59.6028,1.0660
cuda,twins_pcpvt_base,64,1.5921,67.4600,84.4977,1.0909
cuda,visformer_small,128,1.1952,72.8705,23.7303,1.0410
cuda,vit_base_patch16_224,64,1.1309,56.4866,22.0208,0.9804
cuda,volo_d1_224,64,1.6868,72.0957,65.3011,0.9729
1 dev name batch_size speedup abs_latency compilation_latency compression_ratio
2 cuda adv_inception_v3 128 1.5923 102.5292 51.6032 1.0472
3 cuda beit_base_patch16_224 64 1.3390 75.3027 29.7471 1.0156
4 cuda coat_lite_mini 128 2.0579 53.3689 37.1856 1.0437
5 cuda convmixer_768_32 32 1.0470 275.5328 23.8037 0.9999
6 cuda convnext_base 64 1.5084 80.1811 42.5659 1.0373
7 cuda crossvit_9_240 128 1.5392 37.1806 44.9986 0.9193
8 cuda cspdarknet53 64 1.4721 75.0403 35.2882 1.0547
9 cuda deit_base_distilled_patch16_224 64 1.1432 55.9737 23.4038 0.9816
10 cuda dla102 128 1.5282 123.7284 49.3612 1.0430
11 cuda dm_nfnet_f0 128 1.4354 79.7518 34.8994 1.1038
12 cuda dpn107 32 1.2412 83.8921 58.9111 0.9952
13 cuda eca_botnext26ts_256 128 1.5425 71.2406 28.8920 1.0270
14 cuda ese_vovnet19b_dw 128 1.4647 42.4837 18.0285 1.0135
15 cuda fbnetc_100 128 1.5795 53.8033 33.0222 1.0082
16 cuda gernet_l 128 1.1684 63.4230 26.8687 1.0053
17 cuda ghostnet_100 128 1.7812 54.4211 47.6168 1.0484
18 cuda gluon_inception_v3 128 1.5952 102.5018 50.0857 1.0469
19 cuda gmixer_24_224 128 1.6749 69.2430 42.0841 1.1921
20 cuda gmlp_s16_224 128 1.5886 79.2132 43.0142 1.2343
21 cuda hrnet_w18 128 1.3743 221.5304 134.2573 1.0100
22 cuda inception_v3 128 1.5847 102.8333 49.7648 1.0472
23 cuda jx_nest_base 32 1.3747 71.4190 61.4053 0.9905
24 cuda lcnet_050 128 1.8159 18.0047 18.8249 1.0005
25 cuda mixer_b16_224 128 1.2795 90.9229 21.0438 1.0133
26 cuda mixnet_l 128 1.2273 149.9722 47.7482 1.0129
27 cuda mnasnet_100 128 1.6594 40.0512 26.5165 1.0047
28 cuda mobilenetv2_100 128 1.6085 41.1217 27.4450 1.1731
29 cuda mobilenetv3_large_100 128 1.6610 37.9995 29.8185 1.0052
30 cuda mobilevit_s 64 1.5212 55.4152 53.6475 1.0258
31 cuda nfnet_l0 128 1.4927 65.7078 32.4067 0.9980
32 cuda pit_b_224 64 1.2286 57.9484 26.5321 0.9606
33 cuda pnasnet5large 16 1.0000 198.2494 93.4641 1.3184
34 cuda poolformer_m36 64 1.3486 103.9235 62.3196 1.1942
35 cuda regnety_002 128 1.3030 32.4968 27.2439 1.0014
36 cuda repvgg_a2 128 1.2485 59.7729 26.9209 1.0185
37 cuda res2net101_26w_4s 64 1.0813 94.1773 86.6520 0.9655
38 cuda res2net50_14w_8s 128 1.3251 109.5258 79.9578 0.9830
39 cuda res2next50 128 1.2518 125.5008 43.9754 0.9756
40 cuda resmlp_12_224 128 1.3060 45.2373 19.3709 1.1048
41 cuda resnest101e 64 1.4346 108.1945 78.1993 1.1037
42 cuda rexnet_100 128 1.4637 55.0121 41.2075 1.0862
43 cuda selecsls42b 128 1.4284 44.6645 23.3892 1.0139
44 cuda spnasnet_100 128 1.5908 45.3189 32.0148 1.0048
45 cuda swin_base_patch4_window7_224 64 1.6164 89.5854 75.5848 0.9299
46 cuda swsl_resnext101_32x16d 32 1.0175 110.0041 45.7853 1.0003
47 cuda tf_efficientnet_b0 128 1.5271 55.7361 34.5551 1.1079
48 cuda tf_mixnet_l 128 1.2369 155.9027 48.6695 1.0921
49 cuda tinynet_a 128 1.3792 53.0640 40.6346 1.1108
50 cuda tnt_s_patch16_224 128 3.1078 104.8486 59.6028 1.0660
51 cuda twins_pcpvt_base 64 1.5921 67.4600 84.4977 1.0909
52 cuda visformer_small 128 1.1952 72.8705 23.7303 1.0410
53 cuda vit_base_patch16_224 64 1.1309 56.4866 22.0208 0.9804
54 cuda volo_d1_224 64 1.6868 72.0957 65.3011 0.9729

View File

@@ -1,53 +0,0 @@
dev,name,batch_size,speedup,abs_latency,compilation_latency,compression_ratio
cuda,BERT_pytorch,16,1.7111,24.2741,35.7065,1.3212
cuda,LearningToPaint,96,1.0513,10.7557,11.1879,0.9896
cuda,Super_SloMo,6,1.3267,60.4328,28.2097,1.2392
cuda,alexnet,128,1.1754,8.3246,5.3319,1.0003
cuda,attention_is_all_you_need_pytorch,256,1.3416,36.4401,39.5927,1.1774
cuda,dcgan,32,0.9151,2.6249,3.2964,1.0082
cuda,densenet121,4,0.9225,51.3747,68.5841,0.9930
cuda,doctr_det_predictor,0,0.0000
cuda,doctr_reco_predictor,0,0.0000
cuda,drq,1,0.9500,3.4884,4.8028,0.9687
cuda,fastNLP_Bert,6,1.4328,34.7753,35.4863,1.2368
cuda,functorch_dp_cifar10,64,1.2015,8.1625,12.9040,1.0609
cuda,functorch_maml_omniglot,1,0.9322,2.5844,3.8640,1.0000
cuda,hf_Albert,8,2.1228,30.3377,26.8282,1.2676
cuda,hf_Bart,4,1.2899,39.1935,47.2373,1.0080
cuda,hf_Bert,4,1.3262,26.1063,35.0281,1.0656
cuda,hf_Bert_large,4,1.4163,55.1021,67.2825,1.0915
cuda,hf_DistilBert,8,1.4051,21.7191,18.0399,1.0242
cuda,hf_GPT2,4,1.6661,26.9039,29.9473,1.1555
cuda,hf_Longformer,0,0.0000
cuda,hf_Reformer,4,1.1709,64.6979,15.7035,0.9267
cuda,hf_T5_large,2,1.7215,107.0798,148.8805,1.1684
cuda,lennard_jones,1000,0.8428,1.8488,3.0609,1.0001
cuda,maml_omniglot,32,0.9648,2.6869,3.9775,0.9999
cuda,mnasnet1_0,32,1.0469,21.6251,25.8232,0.9996
cuda,mobilenet_v2,96,1.5604,31.9572,27.0225,1.1734
cuda,nvidia_deeprecommender,256,1.0605,9.2080,4.1318,0.9711
cuda,phlippe_densenet,128,1.0237,27.5988,28.0400,1.0023
cuda,phlippe_resnet,128,1.0493,10.9751,10.2485,1.0092
cuda,pytorch_CycleGAN_and_pix2pix,1,1.3724,8.2225,11.9561,1.0219
cuda,pytorch_stargan,16,1.1835,11.9178,10.0507,1.0868
cuda,pytorch_unet,1,1.3787,29.7543,13.7711,1.0100
cuda,resnet152,32,0.9834,63.2446,67.7935,0.9991
cuda,resnet18,16,0.9451,9.4977,11.7663,0.9948
cuda,resnet50,32,1.0513,24.5141,24.6629,1.0021
cuda,resnext50_32x4d,8,0.9216,22.2460,24.3420,0.9984
cuda,shufflenet_v2_x1_0,128,1.1943,25.4520,28.8611,1.0951
cuda,soft_actor_critic,256,0.8691,1.9637,3.3716,0.9996
cuda,speech_transformer,32,1.2718,35.2922,46.9957,1.0897
cuda,squeezenet1_1,32,1.1302,8.4540,7.9625,1.0771
cuda,timm_efficientdet,1,1.3370,80.0377,120.1814,1.2713
cuda,timm_efficientnet,32,1.1874,27.6302,33.9059,1.0971
cuda,timm_nfnet,128,1.4525,77.3461,34.3270,1.1056
cuda,timm_regnet,32,1.0644,50.6953,35.7562,1.0000
cuda,timm_resnest,32,1.6200,14.7763,17.2245,1.0906
cuda,timm_vision_transformer,32,1.0800,19.4188,22.0255,0.9966
cuda,timm_vision_transformer_large,32,1.0081,393.1742,127.8083,0.9735
cuda,timm_vovnet,32,1.1472,22.4727,22.7328,1.0120
cuda,torchrec_dlrm,0,0.0000
cuda,tts_angular,64,0.8974,6.5057,2.5555,0.9973
cuda,vgg16,64,1.2909,50.7405,6.1510,0.9828
cuda,yolov3,16,1.2930,54.8069,41.9269,1.0563
1 dev,name,batch_size,speedup,abs_latency,compilation_latency,compression_ratio
2 cuda,BERT_pytorch,16,1.7111,24.2741,35.7065,1.3212
3 cuda,LearningToPaint,96,1.0513,10.7557,11.1879,0.9896
4 cuda,Super_SloMo,6,1.3267,60.4328,28.2097,1.2392
5 cuda,alexnet,128,1.1754,8.3246,5.3319,1.0003
6 cuda,attention_is_all_you_need_pytorch,256,1.3416,36.4401,39.5927,1.1774
7 cuda,dcgan,32,0.9151,2.6249,3.2964,1.0082
8 cuda,densenet121,4,0.9225,51.3747,68.5841,0.9930
9 cuda,doctr_det_predictor,0,0.0000
10 cuda,doctr_reco_predictor,0,0.0000
11 cuda,drq,1,0.9500,3.4884,4.8028,0.9687
12 cuda,fastNLP_Bert,6,1.4328,34.7753,35.4863,1.2368
13 cuda,functorch_dp_cifar10,64,1.2015,8.1625,12.9040,1.0609
14 cuda,functorch_maml_omniglot,1,0.9322,2.5844,3.8640,1.0000
15 cuda,hf_Albert,8,2.1228,30.3377,26.8282,1.2676
16 cuda,hf_Bart,4,1.2899,39.1935,47.2373,1.0080
17 cuda,hf_Bert,4,1.3262,26.1063,35.0281,1.0656
18 cuda,hf_Bert_large,4,1.4163,55.1021,67.2825,1.0915
19 cuda,hf_DistilBert,8,1.4051,21.7191,18.0399,1.0242
20 cuda,hf_GPT2,4,1.6661,26.9039,29.9473,1.1555
21 cuda,hf_Longformer,0,0.0000
22 cuda,hf_Reformer,4,1.1709,64.6979,15.7035,0.9267
23 cuda,hf_T5_large,2,1.7215,107.0798,148.8805,1.1684
24 cuda,lennard_jones,1000,0.8428,1.8488,3.0609,1.0001
25 cuda,maml_omniglot,32,0.9648,2.6869,3.9775,0.9999
26 cuda,mnasnet1_0,32,1.0469,21.6251,25.8232,0.9996
27 cuda,mobilenet_v2,96,1.5604,31.9572,27.0225,1.1734
28 cuda,nvidia_deeprecommender,256,1.0605,9.2080,4.1318,0.9711
29 cuda,phlippe_densenet,128,1.0237,27.5988,28.0400,1.0023
30 cuda,phlippe_resnet,128,1.0493,10.9751,10.2485,1.0092
31 cuda,pytorch_CycleGAN_and_pix2pix,1,1.3724,8.2225,11.9561,1.0219
32 cuda,pytorch_stargan,16,1.1835,11.9178,10.0507,1.0868
33 cuda,pytorch_unet,1,1.3787,29.7543,13.7711,1.0100
34 cuda,resnet152,32,0.9834,63.2446,67.7935,0.9991
35 cuda,resnet18,16,0.9451,9.4977,11.7663,0.9948
36 cuda,resnet50,32,1.0513,24.5141,24.6629,1.0021
37 cuda,resnext50_32x4d,8,0.9216,22.2460,24.3420,0.9984
38 cuda,shufflenet_v2_x1_0,128,1.1943,25.4520,28.8611,1.0951
39 cuda,soft_actor_critic,256,0.8691,1.9637,3.3716,0.9996
40 cuda,speech_transformer,32,1.2718,35.2922,46.9957,1.0897
41 cuda,squeezenet1_1,32,1.1302,8.4540,7.9625,1.0771
42 cuda,timm_efficientdet,1,1.3370,80.0377,120.1814,1.2713
43 cuda,timm_efficientnet,32,1.1874,27.6302,33.9059,1.0971
44 cuda,timm_nfnet,128,1.4525,77.3461,34.3270,1.1056
45 cuda,timm_regnet,32,1.0644,50.6953,35.7562,1.0000
46 cuda,timm_resnest,32,1.6200,14.7763,17.2245,1.0906
47 cuda,timm_vision_transformer,32,1.0800,19.4188,22.0255,0.9966
48 cuda,timm_vision_transformer_large,32,1.0081,393.1742,127.8083,0.9735
49 cuda,timm_vovnet,32,1.1472,22.4727,22.7328,1.0120
50 cuda,torchrec_dlrm,0,0.0000
51 cuda,tts_angular,64,0.8974,6.5057,2.5555,0.9973
52 cuda,vgg16,64,1.2909,50.7405,6.1510,0.9828
53 cuda,yolov3,16,1.2930,54.8069,41.9269,1.0563

View File

@@ -1,54 +0,0 @@
#!/bin/bash
# Provision the torchinductor test environment: create a Python venv under
# /opt, install nightly PyTorch into it, check out pytorch, torchbench and timm
# under /opt, then install this repository's Triton in editable mode.
#
# Expects to be started from the repository root: common.sh is sourced via a
# path relative to the current directory, and the Triton build step does
# `cd "$ROOT"` followed by `cd python`.
#
# This script itself does not enable `set -e`, so a failing pip/git/python
# command does not stop it; only a failed `cd` aborts (via `|| exit`).
#
# remember where we started
ROOT="$(pwd)"
# torchinductor venv
# print the user the script runs as (shows up in the job log)
whoami
python3 -m venv /opt/torchinductor_venv
# every pip3/python3 call below runs inside this venv once it is activated
# shellcheck source=/dev/null
source /opt/torchinductor_venv/bin/activate
# shared settings; TEST_REPORTS_DIR (used at the end of this script) is not
# set anywhere in this file, so it presumably comes from here -- TODO confirm
# shellcheck source=/dev/null
source ./.github/workflows/torchinductor/scripts/common.sh
# pytorch nightly, taken from the cu118 nightly wheel index;
# --force-reinstall replaces any copies already present in the venv
pip3 install --force-reinstall --pre torch torchtext torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cu118
# pytorch source to get torchbench for dynamo
cd /opt || exit
git clone --recursive https://github.com/pytorch/pytorch
cd pytorch || exit
# if you are updating an existing checkout
git submodule sync
git submodule update --init --recursive
# back to /opt
cd ..
# required packages
pip3 install expecttest psutil
# torchbench (cloned next to the pytorch checkout, in /opt)
pip3 install pyyaml
git clone https://github.com/pytorch/benchmark.git
cd benchmark || exit
python3 install.py
cd ..
# timm (cloned into /opt and installed in editable mode)
git clone https://github.com/huggingface/pytorch-image-models.git
cd pytorch-image-models || exit
pip3 install -e .
cd ..
# build our own triton
cd "$ROOT" || exit
cd python || exit
# drop any previous build directory so the editable install starts clean
rm -rf build
pip3 install -e .
# remove the pytorch-triton package; presumably it was pulled in by the
# nightly torch install and would otherwise shadow the local build -- TODO confirm
pip3 uninstall pytorch-triton -y
# clean up cache
rm -rf /tmp/torchinductor_root/
rm -rf ~/.triton/cache
# also discard reports left over from an earlier run
rm -rf "$TEST_REPORTS_DIR"
# go back to where we started
cd "$ROOT" || exit

View File

@@ -1,41 +0,0 @@
#!/bin/bash
# Run the torchinductor performance benchmarks for every entry in MODELS and
# compare each result CSV against the baseline CSV stored under
# .github/workflows/torchinductor/data.
#
# Expects to be started from the repository root (INDUCTOR is derived from the
# current directory) with the venv at /opt/torchinductor_venv already created.
# MODELS, PYTORCH_DIR and TEST_REPORTS_DIR are not set in this file; presumably
# they come from scripts/common.sh -- verify there.
#
# Exit status: the status of the first failing check_perf.py run, or the
# status of a failed `cd`; otherwise 0.

# remember where we started
ROOT="$(pwd)"
INDUCTOR="$ROOT"/.github/workflows/torchinductor

# shellcheck source=/dev/null
source /opt/torchinductor_venv/bin/activate
# shellcheck source=/dev/null
source "$INDUCTOR"/scripts/common.sh

# Reset the GPU clocks on every exit path. Without the trap, the early `exit`
# calls below (failed cd, failed perf check) would leave GPU 0 pinned at the
# locked frequency for whatever runs next on this machine. An EXIT trap does
# not alter the script's exit status.
unlock_gpu_clocks() {
  sudo nvidia-smi -i 0 -rgc
}
trap unlock_gpu_clocks EXIT

# lock GPU clocks to 1350 MHz
sudo nvidia-smi -i 0 -pm 1
sudo nvidia-smi -i 0 --lock-gpu-clocks=1350,1350

# the benchmark drivers are addressed relative to the pytorch checkout
cd "$PYTORCH_DIR" || exit
TEST_REPORTS_DIR=$TEST_REPORTS_DIR/perf
mkdir -p "$TEST_REPORTS_DIR"
for model in "${MODELS[@]}"; do
  echo "Running performance test for $model"
  # NOTE: the space before the trailing backslash is required; a
  # backslash-newline is removed by the shell, so without it the two
  # neighbouring options would be joined into a single word.
  python3 benchmarks/dynamo/"$model".py --ci --training --performance --disable-cudagraphs \
    --device cuda --inductor --amp --output "$TEST_REPORTS_DIR"/"$model".csv
done

# compare every freshly produced CSV with its baseline; stop at the first failure
cd "$ROOT" || exit
for model in "${MODELS[@]}"; do
  echo "Checking performance test for $model"
  python3 "$INDUCTOR"/scripts/check_perf.py --new "$TEST_REPORTS_DIR"/"$model".csv --baseline "$INDUCTOR"/data/"$model".csv
  EXIT_STATUS=$?
  if [ "$EXIT_STATUS" -ne 0 ]; then
    echo "Performance test for $model failed"
    exit "$EXIT_STATUS"
  fi
done

# GPU clocks are unlocked by the EXIT trap installed above
# go back to where we started
cd "$ROOT" || exit

View File

@@ -8,7 +8,7 @@ jobs:
Build-Wheels:
runs-on: [self-hosted, V100]
runs-on: [self-hosted, CPU]
permissions:
id-token: write
contents: read