Mirror of https://github.com/ROCm/ROCm.git (synced 2026-01-10 15:18:11 -05:00)

Compare commits: 22 commits (roc-6.3.x ... docs_versi)
| Author | SHA1 | Date |
|---|---|---|
|  | cd85ccd539 |  |
|  | de4ac7a5a3 |  |
|  | fa0e212906 |  |
|  | 84001e176e |  |
|  | 9cd2706fdb |  |
|  | 13be0b6a51 |  |
|  | efefa0f43e |  |
|  | 4d15adf284 |  |
|  | 1fb42c2591 |  |
|  | e984954088 |  |
|  | cd57bc8186 |  |
|  | d7d3d02cd0 |  |
|  | dd7164cada |  |
|  | bf3a437cd5 |  |
|  | 4be8096109 |  |
|  | 934767322b |  |
|  | 1ea1c5c6e0 |  |
|  | 389fa7071b |  |
|  | 91e0cf5ecd |  |
|  | 1de89ef590 |  |
|  | 27cb8ea927 |  |
|  | 0c6f660d59 |  |
@@ -84,6 +84,8 @@ jobs:
matrix:
gfx942:
JOB_GPU_TARGET: gfx942
gfx90a:
JOB_GPU_TARGET: gfx90a
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
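The two `gfx90a` lines recur in nearly every hunk below: each component's build or test job gains a second entry in its strategy matrix so the same steps run for both GPU targets. A minimal sketch of the resulting Azure Pipelines block is shown here; the job name and the `gpuTarget` hand-off are assumptions pieced together from the template calls in these hunks, not text copied from any single file.

```yaml
jobs:
- job: component_build          # hypothetical job name
  strategy:
    matrix:
      gfx942:
        JOB_GPU_TARGET: gfx942
      gfx90a:
        JOB_GPU_TARGET: gfx90a  # entry added by these changes
  steps:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
      aptPackages: ${{ parameters.aptPackages }}
      pipModules: ${{ parameters.pipModules }}
      gpuTarget: $(JOB_GPU_TARGET)  # each matrix leg passes its own target
```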
@@ -67,6 +67,8 @@ jobs:
matrix:
gfx942:
JOB_GPU_TARGET: gfx942
gfx90a:
JOB_GPU_TARGET: gfx90a
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:

@@ -77,6 +77,8 @@ jobs:
matrix:
gfx942:
JOB_GPU_TARGET: gfx942
gfx90a:
JOB_GPU_TARGET: gfx90a
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:

@@ -67,6 +67,8 @@ jobs:
matrix:
gfx942:
JOB_GPU_TARGET: gfx942
gfx90a:
JOB_GPU_TARGET: gfx90a
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:

@@ -87,7 +87,6 @@ jobs:
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
gpuTarget: $(JOB_GPU_TARGET)

- job: Tensile_testing
timeoutInMinutes: 90

@@ -42,6 +42,8 @@ jobs:
matrix:
gfx942:
JOB_GPU_TARGET: gfx942
gfx90a:
JOB_GPU_TARGET: gfx90a
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:

@@ -48,6 +48,8 @@ jobs:
matrix:
gfx942:
JOB_GPU_TARGET: gfx942
gfx90a:
JOB_GPU_TARGET: gfx90a
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:

@@ -52,6 +52,8 @@ jobs:
matrix:
gfx942:
JOB_GPU_TARGET: gfx942
gfx90a:
JOB_GPU_TARGET: gfx90a
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:

@@ -63,6 +63,8 @@ jobs:
matrix:
gfx942:
JOB_GPU_TARGET: gfx942
gfx90a:
JOB_GPU_TARGET: gfx90a
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:

@@ -72,6 +72,8 @@ jobs:
matrix:
gfx942:
JOB_GPU_TARGET: gfx942
gfx90a:
JOB_GPU_TARGET: gfx90a
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:

@@ -156,6 +158,7 @@ jobs:
- deps

- job: hipBLASLt_testing
timeoutInMinutes: 120
dependsOn: hipBLASLt
condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
variables:
@@ -43,6 +43,8 @@ jobs:
matrix:
gfx942:
JOB_GPU_TARGET: gfx942
gfx90a:
JOB_GPU_TARGET: gfx90a
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:

@@ -54,6 +54,8 @@ jobs:
matrix:
gfx942:
JOB_GPU_TARGET: gfx942
gfx90a:
JOB_GPU_TARGET: gfx90a
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:

@@ -45,6 +45,8 @@ jobs:
matrix:
gfx942:
JOB_GPU_TARGET: gfx942
gfx90a:
JOB_GPU_TARGET: gfx90a
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:

@@ -57,6 +57,8 @@ jobs:
matrix:
gfx942:
JOB_GPU_TARGET: gfx942
gfx90a:
JOB_GPU_TARGET: gfx90a
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:

@@ -52,6 +52,8 @@ jobs:
matrix:
gfx942:
JOB_GPU_TARGET: gfx942
gfx90a:
JOB_GPU_TARGET: gfx90a
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:

@@ -105,6 +105,7 @@ jobs:
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-DCMAKE_Fortran_COMPILER=f95
-DAMDGPU_TARGETS=$(JOB_GPU_TARGET)
-DTensile_LOGIC=
-DTensile_CPU_THREADS=

@@ -42,6 +42,8 @@ jobs:
matrix:
gfx942:
JOB_GPU_TARGET: gfx942
gfx90a:
JOB_GPU_TARGET: gfx90a
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:

@@ -51,6 +51,8 @@ jobs:
matrix:
gfx942:
JOB_GPU_TARGET: gfx942
gfx90a:
JOB_GPU_TARGET: gfx90a
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:

@@ -29,7 +29,7 @@ jobs:
value: '$(Build.BinariesDirectory)/amdgcn/bitcode'
- name: HIP_PATH
value: '$(Agent.BuildDirectory)/rocm'
pool: ${{ variables.MEDIUM_BUILD_POOL }}
pool: ${{ variables.ULTRA_BUILD_POOL }}
workspace:
clean: all
steps:
@@ -51,7 +51,7 @@ jobs:
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH="$(Build.BinariesDirectory)/llvm;$(Build.BinariesDirectory)"
-DCMAKE_BUILD_TYPE=Release
-DLLVM_ENABLE_PROJECTS=clang;lld;clang-tools-extra;mlir
-DLLVM_ENABLE_PROJECTS=clang;lld;clang-tools-extra;mlir;flang
-DLLVM_ENABLE_RUNTIMES=compiler-rt;libunwind;libcxx;libcxxabi
-DCLANG_ENABLE_AMDCLANG=ON
-DLLVM_TARGETS_TO_BUILD=AMDGPU;X86

@@ -85,7 +85,7 @@ jobs:
componentName: check-llvm
testDir: 'llvm/build'
testExecutable: './bin/llvm-lit'
testParameters: '-q --xunit-xml-output=llvm_test_output.xml ./test'
testParameters: '-q --xunit-xml-output=llvm_test_output.xml --filter-out="live-debug-values-spill-tracking" ./test'
testOutputFile: llvm_test_output.xml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
@@ -1,140 +0,0 @@
# largely referenced from: https://github.com/ROCm/omnitrace/blob/main/.github/workflows/ubuntu-jammy.yml
parameters:
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
- name: aptPackages
type: object
default:
- autoconf
- autotools-dev
- bison
- build-essential
- bzip2
- clang
- cmake
- environment-modules
- g++-12
- libdrm-dev
- libfabric-dev
- libiberty-dev
- libpapi-dev
- libpfm4-dev
- libtool
- libopenmpi-dev
- m4
- openmpi-bin
- software-properties-common
- python3-pip
- texinfo
- zlib1g-dev
- name: pipModules
type: object
default:
- numpy
- perfetto
- dataclasses
- name: rocmDependencies
type: object
default:
- aomp
- clr
- llvm-project
- rccl
- rocm-core
- rocm_smi_lib
- rocminfo
- ROCR-Runtime
- rocprofiler
- rocprofiler-register
- roctracer

jobs:
- job: omnitrace
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ variables.MEDIUM_BUILD_POOL }}
workspace:
clean: all
strategy:
matrix:
gfx942:
JOB_GPU_TARGET: gfx942
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
gpuTarget: $(JOB_GPU_TARGET)
- task: Bash@3
displayName: ROCm symbolic link
inputs:
targetType: inline
script: |
sudo rm -rf /opt/rocm
sudo ln -s $(Agent.BuildDirectory)/rocm /opt/rocm
- task: Bash@3
displayName: Add ROCm binaries to PATH
inputs:
targetType: inline
script: echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/bin"
- task: Bash@3
displayName: Add ROCm compilers to PATH
inputs:
targetType: inline
script: echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/llvm/bin"
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
# build flags reference: https://rocm.docs.amd.com/projects/omnitrace/en/latest/install/install.html
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DOMNITRACE_BUILD_TESTING=ON
-DOMNITRACE_BUILD_DYNINST=ON
-DOMNITRACE_BUILD_LIBUNWIND=ON
-DDYNINST_BUILD_TBB=ON
-DDYNINST_BUILD_ELFUTILS=ON
-DDYNINST_BUILD_LIBIBERTY=ON
-DDYNINST_BUILD_BOOST=ON
-DOMNITRACE_USE_PAPI=ON
-DOMNITRACE_USE_MPI=ON
-DAMDGPU_TARGETS=$(JOB_GPU_TARGET)
multithreadFlag: -- -j32
- task: Bash@3
displayName: Set up omnitrace env
inputs:
targetType: inline
script: source share/omnitrace/setup-env.sh
workingDirectory: build
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: omnitrace
- task: Bash@3
displayName: Remove ROCm binaries from PATH
condition: always()
inputs:
targetType: inline
script: echo "##vso[task.setvariable variable=PATH]$(echo $PATH | sed -e 's;:$(Agent.BuildDirectory)/rocm/bin;;' -e 's;^/;;' -e 's;/$;;')"
- task: Bash@3
displayName: Remove ROCm compilers from PATH
condition: always()
inputs:
targetType: inline
script: echo "##vso[task.setvariable variable=PATH]$(echo $PATH | sed -e 's;:$(Agent.BuildDirectory)/rocm/llvm/bin;;' -e 's;^/;;' -e 's;/$;;')"
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
gpuTarget: $(JOB_GPU_TARGET)
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
gpuTarget: $(JOB_GPU_TARGET)
@@ -64,6 +64,8 @@ jobs:
matrix:
gfx942:
JOB_GPU_TARGET: gfx942
gfx90a:
JOB_GPU_TARGET: gfx90a
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:

@@ -65,6 +65,8 @@ jobs:
matrix:
gfx942:
JOB_GPU_TARGET: gfx942
gfx90a:
JOB_GPU_TARGET: gfx90a
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:

@@ -73,6 +73,8 @@ jobs:
matrix:
gfx942:
JOB_GPU_TARGET: gfx942
gfx90a:
JOB_GPU_TARGET: gfx90a
steps:
- task: Bash@3
displayName: 'Register libjpeg-turbo packages'

@@ -60,6 +60,8 @@ jobs:
matrix:
gfx942:
JOB_GPU_TARGET: gfx942
gfx90a:
JOB_GPU_TARGET: gfx90a
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:

@@ -75,6 +75,8 @@ jobs:
matrix:
gfx942:
JOB_GPU_TARGET: gfx942
gfx90a:
JOB_GPU_TARGET: gfx90a
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:

@@ -55,6 +55,8 @@ jobs:
matrix:
gfx942:
JOB_GPU_TARGET: gfx942
gfx90a:
JOB_GPU_TARGET: gfx90a
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:

@@ -47,6 +47,8 @@ jobs:
matrix:
gfx942:
JOB_GPU_TARGET: gfx942
gfx90a:
JOB_GPU_TARGET: gfx90a
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:

@@ -42,6 +42,8 @@ jobs:
matrix:
gfx942:
JOB_GPU_TARGET: gfx942
gfx90a:
JOB_GPU_TARGET: gfx90a
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:

@@ -48,6 +48,8 @@ jobs:
matrix:
gfx942:
JOB_GPU_TARGET: gfx942
gfx90a:
JOB_GPU_TARGET: gfx90a
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:

@@ -45,6 +45,8 @@ jobs:
matrix:
gfx942:
JOB_GPU_TARGET: gfx942
gfx90a:
JOB_GPU_TARGET: gfx90a
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:

@@ -58,6 +58,8 @@ jobs:
matrix:
gfx942:
JOB_GPU_TARGET: gfx942
gfx90a:
JOB_GPU_TARGET: gfx90a
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:

@@ -56,6 +56,8 @@ jobs:
matrix:
gfx942:
JOB_GPU_TARGET: gfx942
gfx90a:
JOB_GPU_TARGET: gfx90a
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:

@@ -47,6 +47,8 @@ jobs:
matrix:
gfx942:
JOB_GPU_TARGET: gfx942
gfx90a:
JOB_GPU_TARGET: gfx90a
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:

@@ -57,6 +57,8 @@ jobs:
matrix:
gfx942:
JOB_GPU_TARGET: gfx942
gfx90a:
JOB_GPU_TARGET: gfx90a
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
@@ -8,7 +8,6 @@ parameters:
- name: aptPackages
type: object
default:
- cmake
- doxygen
- doxygen-doc
- ninja-build

@@ -18,7 +17,9 @@ parameters:
type: object
default:
- cget
- cmake==3.20.5
- ninja
- rocm-docs-core

jobs:
- job: rocm_cmake

@@ -33,21 +34,29 @@ jobs:
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
- task: Bash@3
displayName: Add CMake to PATH
inputs:
targetType: inline
script: echo "##vso[task.prependpath]$(python3 -m site --user-base)/bin"
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
# extra steps for ctest suite
- script: |
python -m pip install -r $(Build.SourcesDirectory)/docs/requirements.txt
python -m pip install -r $(Build.SourcesDirectory)/test/docsphinx/docs/.sphinx/requirements.txt
git config --global user.email "you@example.com"
git config --global user.name "Your Name"
displayName: "ctest setup"
- task: Bash@3
displayName: CTest setup
inputs:
targetType: inline
script: |
python -m pip install -r $(Build.SourcesDirectory)/docs/requirements.txt
python -m pip install -r $(Build.SourcesDirectory)/test/docsphinx/docs/.sphinx/requirements.txt
git config --global user.email "you@example.com"
git config --global user.name "Your Name"
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: rocm-cmake
testParameters: '-E "pass-version-parent" -VV --output-on-failure --force-new-ctest-process --output-junit test_output.xml'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml

@@ -56,4 +65,3 @@ jobs:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
environment: combined
gpuTarget: $(JOB_GPU_TARGET)
@@ -75,6 +75,8 @@ jobs:
matrix:
gfx942:
JOB_GPU_TARGET: gfx942
gfx90a:
JOB_GPU_TARGET: gfx90a
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:

@@ -10,6 +10,7 @@ parameters:
default:
- cmake
- libdrm-dev
- pkg-config
- python3-pip

jobs:

@@ -39,7 +40,6 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
gpuTarget: $(JOB_GPU_TARGET)

- job: rocm_smi_lib_testing
dependsOn: rocm_smi_lib

@@ -59,6 +59,8 @@ jobs:
matrix:
gfx942:
JOB_GPU_TARGET: gfx942
gfx90a:
JOB_GPU_TARGET: gfx90a
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:

@@ -57,6 +57,8 @@ jobs:
matrix:
gfx942:
JOB_GPU_TARGET: gfx942
gfx90a:
JOB_GPU_TARGET: gfx90a
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:

@@ -72,6 +72,8 @@ jobs:
matrix:
gfx942:
JOB_GPU_TARGET: gfx942
gfx90a:
JOB_GPU_TARGET: gfx90a
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:

@@ -57,6 +57,8 @@ jobs:
matrix:
gfx942:
JOB_GPU_TARGET: gfx942
gfx90a:
JOB_GPU_TARGET: gfx90a
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:

@@ -69,7 +69,6 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
gpuTarget: $(JOB_GPU_TARGET)

- job: rocr_debug_agent_testing
dependsOn: rocr_debug_agent

@@ -11,6 +11,7 @@ parameters:
- cmake
- doxygen
- graphviz
- libdrm-amdgpu-dev
- ninja-build
- python3-pip
- name: pipModules

@@ -49,11 +50,14 @@ jobs:
matrix:
gfx942:
JOB_GPU_TARGET: gfx942
gfx90a:
JOB_GPU_TARGET: gfx90a
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
registerROCmPackages: true
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:

@@ -85,6 +89,7 @@ jobs:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
gpuTarget: $(JOB_GPU_TARGET)
registerROCmPackages: true

- job: roctracer_testing
dependsOn: roctracer

@@ -104,6 +109,8 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
registerROCmPackages: true
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:

@@ -128,3 +135,4 @@ jobs:
pipModules: ${{ parameters.pipModules }}
environment: test
gpuTarget: $(JOB_GPU_TARGET)
registerROCmPackages: true

@@ -57,6 +57,8 @@ jobs:
matrix:
gfx942:
JOB_GPU_TARGET: gfx942
gfx90a:
JOB_GPU_TARGET: gfx90a
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
@@ -222,13 +222,13 @@ parameters:
hasGpuTarget: false
rocm-examples:
pipelineId: $(ROCM_EXAMPLES_PIPELINE_ID)
stagingBranch: develop
mainlineBranch: develop
stagingBranch: amd-staging
mainlineBranch: amd-mainline
hasGpuTarget: true
rocminfo:
pipelineId: $(ROCMINFO_PIPELINE_ID)
stagingBranch: amd-staging
mainlineBranch: amd-master
mainlineBranch: amd-mainline
hasGpuTarget: false
rocMLIR:
pipelineId: $(ROCMLIR_PIPELINE_ID)

@@ -262,7 +262,7 @@ parameters:
hasGpuTarget: true
rocprofiler-compute:
pipelineId: $(ROCPROFILER_COMPUTE_PIPELINE_ID)
stagingBranch: amd-staging
stagingBranch: develop
mainlineBranch: amd-mainline
hasGpuTarget: true
rocprofiler-register:
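Reading the two hunks above as standard diffs (removed lines listed before added ones), the per-component branch mapping would end up roughly as sketched below. This is an inferred after-state, not text copied from the file.

```yaml
rocm-examples:
  pipelineId: $(ROCM_EXAMPLES_PIPELINE_ID)
  stagingBranch: amd-staging      # was develop
  mainlineBranch: amd-mainline    # was develop
  hasGpuTarget: true
rocminfo:
  pipelineId: $(ROCMINFO_PIPELINE_ID)
  stagingBranch: amd-staging
  mainlineBranch: amd-mainline    # was amd-master
  hasGpuTarget: false
rocprofiler-compute:
  pipelineId: $(ROCPROFILER_COMPUTE_PIPELINE_ID)
  stagingBranch: develop          # was amd-staging
  mainlineBranch: amd-mainline
  hasGpuTarget: true
```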
@@ -33,7 +33,6 @@ parameters:
- aomp
- HIPIFY
- MIVisionX
- rocm-cmake
- rocm_smi_lib
- rocprofiler-sdk
- roctracer

@@ -28,13 +28,13 @@ variables:
- name: GFX942_TEST_POOL
value: gfx942_test_pool
- name: LATEST_RELEASE_VERSION
value: 6.3.2
value: 6.3.3
- name: REPO_RADEON_VERSION
value: 6.3.2
value: 6.3.3
- name: NEXT_RELEASE_VERSION
value: 6.4.0
- name: LATEST_RELEASE_TAG
value: rocm-6.3.2
value: rocm-6.3.3
- name: AMDMIGRAPHX_GFX942_TEST_PIPELINE_ID
value: 197
- name: AMDMIGRAPHX_PIPELINE_ID
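Taken together, the bumps in this hunk move the pipeline's release metadata from 6.3.2 to 6.3.3. Assuming the second of each duplicated value line is the new one, the resulting variable block would read roughly:

```yaml
variables:
- name: LATEST_RELEASE_VERSION
  value: 6.3.3
- name: REPO_RADEON_VERSION
  value: 6.3.3
- name: NEXT_RELEASE_VERSION
  value: 6.4.0          # unchanged
- name: LATEST_RELEASE_TAG
  value: rocm-6.3.3
```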
.gitignore (vendored): 1 change

@@ -11,6 +11,7 @@ _toc.yml
docBin/
_doxygen/
_readthedocs/
__pycache__/

# avoid duplicating contributing.md due to conf.py
docs/CHANGELOG.md
@@ -117,6 +117,7 @@ FX
Filesystem
FindDb
Flang
FluxBenchmark
Fortran
Fuyu
GALB

@@ -131,6 +132,7 @@ GDS
GEMM
GEMMs
GFortran
GFXIP
Gemma
GiB
GIM

@@ -316,6 +318,7 @@ PipelineParallel
PnP
PowerEdge
PowerShell
Pretraining
Profiler's
PyPi
Pytest

@@ -478,6 +481,7 @@ ZenDNN
accuracies
activations
addr
ai
alloc
allocatable
allocator

@@ -543,6 +547,7 @@ cTDP
dataset
datasets
dataspace
datatemplate
datatype
datatypes
dbgapi

@@ -571,6 +576,7 @@ el
embeddings
enablement
encodings
endfor
endpgm
enqueue
env

@@ -691,6 +697,7 @@ pageable
pallas
parallelization
parallelizing
param
parameterization
passthrough
perfcounter

@@ -715,6 +722,7 @@ preprocessing
preprocessor
prequantized
prerequisites
pretraining
profiler
profilers
protobuf

@@ -807,6 +815,7 @@ supercomputing
symlink
symlinks
sys
tabindex
td
tensorfloat
th

@@ -852,6 +861,7 @@ vectorizes
virtualize
virtualized
vjxb
vllm
voxel
walkthrough
walkthroughs
@@ -66,11 +66,10 @@ project-specific steps. Refer to each repository's PR process for any additional
during our release cycle, as coordinated by the maintainer
* We'll inform you once your change is committed

:::{important}
By creating a PR, you agree to allow your contribution to be licensed under the
terms of the LICENSE.txt file in the corresponding repository. Different repositories may use different
licenses.
:::
> [!IMPORTANT]
> By creating a PR, you agree to allow your contribution to be licensed under the
> terms of the LICENSE.txt file in the corresponding repository. Different repositories may use different
> licenses.

You can look up each license on the [ROCm licensing](https://rocm.docs.amd.com/en/latest/about/license.html) page.
@@ -116,7 +116,7 @@ bash install-prerequisites.sh
# For ubuntu22.04 system
cd ROCm/tools/rocm-build/docker/ubuntu22
cp * /tmp && cd /tmp
bash install-prerequisities.sh
bash install-prerequisites.sh
# For ubuntu24.04 system
cd ROCm/tools/rocm-build/docker/ubuntu24
cp * /tmp && cd /tmp
@@ -4,6 +4,8 @@
|
||||
:description: JAX compatibility
|
||||
:keywords: GPU, JAX compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
*******************************************************************************
|
||||
JAX compatibility
|
||||
*******************************************************************************
|
||||
@@ -119,7 +121,8 @@ Critical ROCm libraries for JAX
|
||||
|
||||
The functionality of JAX with ROCm is determined by its underlying library
|
||||
dependencies. These critical ROCm components affect the capabilities,
|
||||
performance, and feature set available to developers.
|
||||
performance, and feature set available to developers. The versions described
|
||||
are available in ROCm :version:`rocm_version`.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
@@ -129,7 +132,7 @@ performance, and feature set available to developers.
|
||||
- Purpose
|
||||
- Used in
|
||||
* - `hipBLAS <https://github.com/ROCm/hipBLAS>`_
|
||||
- 2.3.0
|
||||
- :version-ref:`hipBLAS rocm_version`
|
||||
- Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
|
||||
matrix and vector operations.
|
||||
- Matrix multiplication in ``jax.numpy.matmul``, ``jax.lax.dot`` and
|
||||
@@ -138,7 +141,7 @@ performance, and feature set available to developers.
|
||||
``jax.numpy.einsum`` with matrix-multiplication patterns algebra
|
||||
operations.
|
||||
* - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`_
|
||||
- 0.10.0
|
||||
- :version-ref:`hipBLASLt rocm_version`
|
||||
- hipBLASLt is an extension of hipBLAS, providing additional
|
||||
features like epilogues fused into the matrix multiplication kernel or
|
||||
use of integer tensor cores.
|
||||
@@ -147,7 +150,7 @@ performance, and feature set available to developers.
|
||||
operations, mixed-precision support, and hardware-specific
|
||||
optimizations.
|
||||
* - `hipCUB <https://github.com/ROCm/hipCUB>`_
|
||||
- 3.3.0
|
||||
- :version-ref:`hipCUB rocm_version`
|
||||
- Provides a C++ template library for parallel algorithms for reduction,
|
||||
scan, sort and select.
|
||||
- Reduction functions (``jax.numpy.sum``, ``jax.numpy.mean``,
|
||||
@@ -155,23 +158,23 @@ performance, and feature set available to developers.
|
||||
(``jax.numpy.cumsum``, ``jax.numpy.cumprod``) and sorting
|
||||
(``jax.numpy.sort``, ``jax.numpy.argsort``).
|
||||
* - `hipFFT <https://github.com/ROCm/hipFFT>`_
|
||||
- 1.0.17
|
||||
- :version-ref:`hipFFT rocm_version`
|
||||
- Provides GPU-accelerated Fast Fourier Transform (FFT) operations.
|
||||
- Used in functions like ``jax.numpy.fft``.
|
||||
* - `hipRAND <https://github.com/ROCm/hipRAND>`_
|
||||
- 2.11.0
|
||||
- :version-ref:`hipRAND rocm_version`
|
||||
- Provides fast random number generation for GPUs.
|
||||
- The ``jax.random.uniform``, ``jax.random.normal``,
|
||||
``jax.random.randint`` and ``jax.random.split``.
|
||||
* - `hipSOLVER <https://github.com/ROCm/hipSOLVER>`_
|
||||
- 2.3.0
|
||||
- :version-ref:`hipSOLVER rocm_version`
|
||||
- Provides GPU-accelerated solvers for linear systems, eigenvalues, and
|
||||
singular value decompositions (SVD).
|
||||
- Solving linear systems (``jax.numpy.linalg.solve``), matrix
|
||||
factorizations, SVD (``jax.numpy.linalg.svd``) and eigenvalue problems
|
||||
(``jax.numpy.linalg.eig``).
|
||||
* - `hipSPARSE <https://github.com/ROCm/hipSPARSE>`_
|
||||
- 3.1.2
|
||||
- :version-ref:`hipSPARSE rocm_version`
|
||||
- Accelerates operations on sparse matrices, such as sparse matrix-vector
|
||||
or matrix-matrix products.
|
||||
- Sparse matrix multiplication (``jax.numpy.matmul``), sparse
|
||||
@@ -179,28 +182,28 @@ performance, and feature set available to developers.
|
||||
(``jax.experimental.sparse.dot``), sparse linear system solvers and
|
||||
sparse data handling.
|
||||
* - `hipSPARSELt <https://github.com/ROCm/hipSPARSELt>`_
|
||||
- 0.2.2
|
||||
- :version-ref:`hipSPARSELt rocm_version`
|
||||
- Accelerates operations on sparse matrices, such as sparse matrix-vector
|
||||
or matrix-matrix products.
|
||||
- Sparse matrix multiplication (``jax.numpy.matmul``), sparse
|
||||
matrix-vector and matrix-matrix products
|
||||
(``jax.experimental.sparse.dot``) and sparse linear system solvers.
|
||||
* - `MIOpen <https://github.com/ROCm/MIOpen>`_
|
||||
- 3.3.0
|
||||
- :version-ref:`MIOpen rocm_version`
|
||||
- Optimized for deep learning primitives such as convolutions, pooling,
|
||||
normalization, and activation functions.
|
||||
- Speeds up convolutional neural networks (CNNs), recurrent neural
|
||||
networks (RNNs), and other layers. Used in operations like
|
||||
``jax.nn.conv``, ``jax.nn.relu``, and ``jax.nn.batch_norm``.
|
||||
* - `RCCL <https://github.com/ROCm/rccl>`_
|
||||
- 2.21.5
|
||||
- :version-ref:`RCCL rocm_version`
|
||||
- Optimized for multi-GPU communication for operations like all-reduce,
|
||||
broadcast, and scatter.
|
||||
- Distribute computations across multiple GPU with ``pmap`` and
|
||||
``jax.distributed``. XLA automatically uses rccl when executing
|
||||
operations across multiple GPUs on AMD hardware.
|
||||
* - `rocThrust <https://github.com/ROCm/rocThrust>`_
|
||||
- 3.3.0
|
||||
- :version-ref:`rocThrust rocm_version`
|
||||
- Provides a C++ template library for parallel algorithms like sorting,
|
||||
reduction, and scanning.
|
||||
- Reduction operations like ``jax.numpy.sum``, ``jax.pmap`` for
|
||||
|
||||
@@ -4,6 +4,8 @@
|
||||
:description: PyTorch compatibility
|
||||
:keywords: GPU, PyTorch compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
********************************************************************************
|
||||
PyTorch compatibility
|
||||
********************************************************************************
|
||||
@@ -56,7 +58,7 @@ Docker image compatibility
|
||||
|
||||
AMD validates and publishes ready-made `PyTorch images <https://hub.docker.com/r/rocm/pytorch>`_
|
||||
with ROCm backends on Docker Hub. The following Docker image tags and
|
||||
associated inventories are validated for `ROCm 6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`_.
|
||||
associated inventories are validated for `ROCm 6.3.3 <https://repo.radeon.com/rocm/apt/6.3.3/>`_.
|
||||
Click the |docker-icon| icon to view the image on Docker Hub.
|
||||
|
||||
.. list-table:: PyTorch Docker image components
|
||||
@@ -77,26 +79,26 @@ Click the |docker-icon| icon to view the image on Docker Hub.
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu24.04_py3.12_pytorch_release_2.4.0/images/sha256-98ddf20333bd01ff749b8092b1190ee369a75d3b8c71c2fac80ffdcb1a98d529?context=explore"><i class="fab fa-docker fa-lg"></i></a>
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3.3_ubuntu24.04_py3.12_pytorch_release_2.4.0/images/sha256-6c798857b2c9526b44ba535710b93a1737546acea79b53a93c646195c272f1d5"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `2.4.0 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
|
||||
- 24.04
|
||||
- `3.12 <https://www.python.org/downloads/release/python-3128/>`_
|
||||
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
|
||||
- `1.4.0 <https://github.com/ROCm/apex/tree/release/1.4.0>`_
|
||||
- `0.19.0 <https://github.com/pytorch/vision/tree/v0.19.0>`_
|
||||
- `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`_
|
||||
- `master <https://bitbucket.org/icl/magma/src/master/>`_
|
||||
- `1.10.0 <https://github.com/openucx/ucx/tree/v1.10.0>`_
|
||||
- `4.0.7 <https://github.com/open-mpi/ompi/tree/v4.0.7>`_
|
||||
- `4.0.3 <https://github.com/open-mpi/ompi/tree/v4.0.3>`_
|
||||
- `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu22.04_py3.10_pytorch_release_2.4.0/images/sha256-402c9b4f1a6b5a81c634a1932b56cbe01abb699cfcc7463d226276997c6cf8ea?context=explore"><i class="fab fa-docker fa-lg"></i></a>
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3.3_ubuntu22.04_py3.10_pytorch_release_2.4.0/images/sha256-a09b21248133876fc8912a5ff4e6ee2c8d62b14120313e426b3dadda5702713d"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `2.4.0 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
|
||||
- 22.04
|
||||
- `3.10 <https://www.python.org/downloads/release/python-31016/>`_
|
||||
- `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
|
||||
- `1.4.0 <https://github.com/ROCm/apex/tree/release/1.4.0>`_
|
||||
- `0.19.0 <https://github.com/pytorch/vision/tree/v0.19.0>`_
|
||||
- `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`_
|
||||
@@ -107,11 +109,11 @@ Click the |docker-icon| icon to view the image on Docker Hub.
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu22.04_py3.9_pytorch_release_2.4.0/images/sha256-e0608b55d408c3bfe5c19fdd57a4ced3e0eb3a495b74c309980b60b156c526dd?context=explore"><i class="fab fa-docker fa-lg"></i></a>
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3.3_ubuntu22.04_py3.9_pytorch_release_2.4.0/images/sha256-963187534467f0f9da77996762fc1d112a6faa5372277c348a505533e7876ec8"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `2.4.0 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
|
||||
- 22.04
|
||||
- `3.9.18 <https://www.python.org/downloads/release/python-3918/>`_
|
||||
- `3.9.21 <https://www.python.org/downloads/release/python-3921/>`_
|
||||
- `1.4.0 <https://github.com/ROCm/apex/tree/release/1.4.0>`_
|
||||
- `0.19.0 <https://github.com/pytorch/vision/tree/v0.19.0>`_
|
||||
- `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`_
|
||||
@@ -122,11 +124,11 @@ Click the |docker-icon| icon to view the image on Docker Hub.
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu22.04_py3.10_pytorch_release_2.3.0/images/sha256-652cf25263d05b1de548222970aeb76e60b12de101de66751264709c0d0ff9d8?context=explore"><i class="fab fa-docker fa-lg"></i></a>
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3.3_ubuntu22.04_py3.10_pytorch_release_2.3.0/images/sha256-952f2621bd2bf3078bef19061e05b209105a82a7908e7e6cdf85014938a4d93a"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `2.3.0 <https://github.com/ROCm/pytorch/tree/release/2.3>`_
|
||||
- 22.04
|
||||
- `3.10 <https://www.python.org/downloads/release/python-31016/>`_
|
||||
- `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
|
||||
- `1.3.0 <https://github.com/ROCm/apex/tree/release/1.3.0>`_
|
||||
- `0.18.0 <https://github.com/pytorch/vision/tree/v0.18.0>`_
|
||||
- `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`_
|
||||
@@ -137,7 +139,7 @@ Click the |docker-icon| icon to view the image on Docker Hub.
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu22.04_py3.10_pytorch_release_2.2.1/images/sha256-051976f26beab8f9aa65d999e3ad546c027b39240a0cc3ee81b114a9024f2912?context=explore"><i class="fab fa-docker fa-lg"></i></a>
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3.3_ubuntu22.04_py3.10_pytorch_release_2.2.1/images/sha256-a2fe20e170feb9e05da3e5728bb98e40d08567e137be8e6ba797962ed2852608"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `2.2.1 <https://github.com/ROCm/pytorch/tree/release/2.2>`_
|
||||
- 22.04
|
||||
@@ -152,7 +154,7 @@ Click the |docker-icon| icon to view the image on Docker Hub.
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu20.04_py3.9_pytorch_release_2.2.1/images/sha256-88c839a364d109d3748c100385bfa100d28090d25118cc723fd0406390ab2f7e?context=explore"><i class="fab fa-docker fa-lg"></i></a>
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3.3_ubuntu20.04_py3.9_pytorch_release_2.2.1/images/sha256-7f231937c897cca5f89e360be33c70a2017d60f62d1fbe81292be48c15fe345b"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `2.2.1 <https://github.com/ROCm/pytorch/tree/release/2.2>`_
|
||||
- 20.04
|
||||
@@ -167,14 +169,14 @@ Click the |docker-icon| icon to view the image on Docker Hub.
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu22.04_py3.9_pytorch_release_1.13.1/images/sha256-994424ed07a63113f79dd9aa72159124c00f5fbfe18127151e6658f7d0b6f821?context=explore"><i class="fab fa-docker fa-lg"></i></a>
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3.3_ubuntu22.04_py3.9_pytorch_release_1.13.1/images/sha256-616a47758004f91951e2da6c1fe291f903de65a7b2318d4b18359b48fe3032f4"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `1.13.1 <https://github.com/ROCm/pytorch/tree/release/1.13>`_
|
||||
- 22.04
|
||||
- `3.9.21 <https://www.python.org/downloads/release/python-3921/>`_
|
||||
- `1.0.0 <https://github.com/ROCm/apex/tree/release/1.0.0>`_
|
||||
- `0.14.0 <https://github.com/pytorch/vision/tree/v0.14.0>`_
|
||||
- `2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18>`_
|
||||
- `2.19.0 <https://github.com/tensorflow/tensorboard/tree/2.19>`_
|
||||
- `master <https://bitbucket.org/icl/magma/src/master/>`_
|
||||
- `1.14.1 <https://github.com/openucx/ucx/tree/v1.14.1>`_
|
||||
- `4.1.5 <https://github.com/open-mpi/ompi/tree/v4.1.5>`_
|
||||
@@ -182,7 +184,7 @@ Click the |docker-icon| icon to view the image on Docker Hub.
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu20.04_py3.9_pytorch_release_1.13.1/images/sha256-7b8139fe40a9aeb4bca3aecd15c22c1fa96e867d93479fa3a24fdeeeeafa1219?context=explore"><i class="fab fa-docker fa-lg"></i></a>
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3.3_ubuntu20.04_py3.9_pytorch_release_1.13.1/images/sha256-a2cfb365aea58b84595e241ffdb0d5ef3e6566e98c10b5499f4aa29983a74ea2"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `1.13.1 <https://github.com/ROCm/pytorch/tree/release/1.13>`_
|
||||
- 20.04
|
||||
@@ -200,7 +202,8 @@ Critical ROCm libraries for PyTorch
|
||||
|
||||
The functionality of PyTorch with ROCm is determined by its underlying library
|
||||
dependencies. These critical ROCm components affect the capabilities,
|
||||
performance, and feature set available to developers.
|
||||
performance, and feature set available to developers. The versions described
|
||||
are available in ROCm :version:`rocm_version`.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
@@ -210,28 +213,28 @@ performance, and feature set available to developers.
|
||||
- Purpose
|
||||
- Used in
|
||||
* - `Composable Kernel <https://github.com/ROCm/composable_kernel>`_
|
||||
- 1.1.0
|
||||
- :version-ref:`"Composable Kernel" rocm_version`
|
||||
- Enables faster execution of core operations like matrix multiplication
|
||||
(GEMM), convolutions and transformations.
|
||||
- Speeds up ``torch.permute``, ``torch.view``, ``torch.matmul``,
|
||||
``torch.mm``, ``torch.bmm``, ``torch.nn.Conv2d``, ``torch.nn.Conv3d``
|
||||
and ``torch.nn.MultiheadAttention``.
|
||||
* - `hipBLAS <https://github.com/ROCm/hipBLAS>`_
|
||||
- 2.3.0
|
||||
- :version-ref:`hipBLAS rocm_version`
|
||||
- Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
|
||||
matrix and vector operations.
|
||||
- Supports operations like matrix multiplication, matrix-vector products,
|
||||
and tensor contractions. Utilized in both dense and batched linear
|
||||
algebra operations.
|
||||
* - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`_
|
||||
- 0.10.0
|
||||
- :version-ref:`hipBLASLt rocm_version`
|
||||
- hipBLASLt is an extension of the hipBLAS library, providing additional
|
||||
features like epilogues fused into the matrix multiplication kernel or
|
||||
use of integer tensor cores.
|
||||
- It accelerates operations like ``torch.matmul``, ``torch.mm``, and the
|
||||
matrix multiplications used in convolutional and linear layers.
|
||||
* - `hipCUB <https://github.com/ROCm/hipCUB>`_
|
||||
- 3.3.0
|
||||
- :version-ref:`hipCUB rocm_version`
|
||||
- Provides a C++ template library for parallel algorithms for reduction,
|
||||
scan, sort and select.
|
||||
- Supports operations like ``torch.sum``, ``torch.cumsum``, ``torch.sort``
|
||||
@@ -239,93 +242,93 @@ performance, and feature set available to developers.
|
||||
irregular shapes often involve scanning, sorting, and filtering, which
|
||||
hipCUB handles efficiently.
|
||||
* - `hipFFT <https://github.com/ROCm/hipFFT>`_
|
||||
- 1.0.17
|
||||
- :version-ref:`hipFFT rocm_version`
|
||||
- Provides GPU-accelerated Fast Fourier Transform (FFT) operations.
|
||||
- Used in functions like the ``torch.fft`` module.
|
||||
* - `hipRAND <https://github.com/ROCm/hipRAND>`_
|
||||
- 2.11.0
|
||||
- :version-ref:`hipRAND rocm_version`
|
||||
- Provides fast random number generation for GPUs.
|
||||
- The ``torch.rand``, ``torch.randn`` and stochastic layers like
|
||||
``torch.nn.Dropout``.
|
||||
* - `hipSOLVER <https://github.com/ROCm/hipSOLVER>`_
|
||||
- 2.3.0
|
||||
- :version-ref:`hipSOLVER rocm_version`
|
||||
- Provides GPU-accelerated solvers for linear systems, eigenvalues, and
|
||||
singular value decompositions (SVD).
|
||||
- Supports functions like ``torch.linalg.solve``,
|
||||
``torch.linalg.eig``, and ``torch.linalg.svd``.
|
||||
* - `hipSPARSE <https://github.com/ROCm/hipSPARSE>`_
|
||||
- 3.1.2
|
||||
- :version-ref:`hipSPARSE rocm_version`
|
||||
- Accelerates operations on sparse matrices, such as sparse matrix-vector
|
||||
or matrix-matrix products.
|
||||
- Sparse tensor operations ``torch.sparse``.
|
||||
* - `hipSPARSELt <https://github.com/ROCm/hipSPARSELt>`_
|
||||
- 0.2.2
|
||||
- :version-ref:`hipSPARSELt rocm_version`
|
||||
- Accelerates operations on sparse matrices, such as sparse matrix-vector
|
||||
or matrix-matrix products.
|
||||
- Sparse tensor operations ``torch.sparse``.
|
||||
* - `hipTensor <https://github.com/ROCm/hipTensor>`_
|
||||
- 1.4.0
|
||||
- :version-ref:`hipTensor rocm_version`
|
||||
- Optimizes for high-performance tensor operations, such as contractions.
|
||||
- Accelerates tensor algebra, especially in deep learning and scientific
|
||||
computing.
|
||||
* - `MIOpen <https://github.com/ROCm/MIOpen>`_
|
||||
- 3.3.0
|
||||
- :version-ref:`MIOpen rocm_version`
|
||||
- Optimizes deep learning primitives such as convolutions, pooling,
|
||||
normalization, and activation functions.
|
||||
- Speeds up convolutional neural networks (CNNs), recurrent neural
|
||||
networks (RNNs), and other layers. Used in operations like
|
||||
``torch.nn.Conv2d``, ``torch.nn.ReLU``, and ``torch.nn.LSTM``.
|
||||
* - `MIGraphX <https://github.com/ROCm/AMDMIGraphX>`_
|
||||
- 2.11.0
|
||||
- :version-ref:`MIGraphX rocm_version`
|
||||
- Adds graph-level optimizations, ONNX models and mixed precision support
|
||||
and enable Ahead-of-Time (AOT) Compilation.
|
||||
- Speeds up inference models and executes ONNX models for
|
||||
compatibility with other frameworks.
|
||||
``torch.nn.Conv2d``, ``torch.nn.ReLU``, and ``torch.nn.LSTM``.
|
||||
* - `MIVisionX <https://github.com/ROCm/MIVisionX>`_
|
||||
- 3.1.0
|
||||
- :version-ref:`MIVisionX rocm_version`
|
||||
- Optimizes acceleration for computer vision and AI workloads like
|
||||
preprocessing, augmentation, and inferencing.
|
||||
- Faster data preprocessing and augmentation pipelines for datasets like
|
||||
ImageNet or COCO and easy to integrate into PyTorch's ``torch.utils.data``
|
||||
and ``torchvision`` workflows.
|
||||
* - `rocAL <https://github.com/ROCm/rocAL>`_
|
||||
- 2.1.0
|
||||
- :version-ref:`rocAL rocm_version`
|
||||
- Accelerates the data pipeline by offloading intensive preprocessing and
|
||||
augmentation tasks. rocAL is part of MIVisionX.
|
||||
- Easy to integrate into PyTorch's ``torch.utils.data`` and
|
||||
``torchvision`` data load workloads.
|
||||
* - `RCCL <https://github.com/ROCm/rccl>`_
|
||||
- 2.21.5
|
||||
- :version-ref:`RCCL rocm_version`
|
||||
- Optimizes for multi-GPU communication for operations like AllReduce and
|
||||
Broadcast.
|
||||
- Distributed data parallel training (``torch.nn.parallel.DistributedDataParallel``).
|
||||
Handles communication in multi-GPU setups.
|
||||
* - `rocDecode <https://github.com/ROCm/rocDecode>`_
|
||||
- 0.8.0
|
||||
- :version-ref:`rocDecode rocm_version`
|
||||
- Provides hardware-accelerated data decoding capabilities, particularly
|
||||
for image, video, and other dataset formats.
|
||||
- Can be integrated in ``torch.utils.data``, ``torchvision.transforms``
|
||||
and ``torch.distributed``.
|
||||
* - `rocJPEG <https://github.com/ROCm/rocJPEG>`_
|
||||
- 0.6.0
|
||||
- :version-ref:`rocJPEG rocm_version`
|
||||
- Provides hardware-accelerated JPEG image decoding and encoding.
|
||||
- GPU accelerated ``torchvision.io.decode_jpeg`` and
|
||||
``torchvision.io.encode_jpeg`` and can be integrated in
|
||||
``torch.utils.data`` and ``torchvision``.
|
||||
* - `RPP <https://github.com/ROCm/RPP>`_
|
||||
- 1.9.1
|
||||
- :version-ref:`RPP rocm_version`
|
||||
- Speeds up data augmentation, transformation, and other preprocessing steps.
|
||||
- Easy to integrate into PyTorch's ``torch.utils.data`` and
|
||||
``torchvision`` data load workloads.
|
||||
* - `rocThrust <https://github.com/ROCm/rocThrust>`_
|
||||
- 3.3.0
|
||||
- :version-ref:`rocThrust rocm_version`
|
||||
- Provides a C++ template library for parallel algorithms like sorting,
|
||||
reduction, and scanning.
|
||||
- Utilized in backend operations for tensor computations requiring
|
||||
parallel processing.
|
||||
* - `rocWMMA <https://github.com/ROCm/rocWMMA>`_
|
||||
- 1.6.0
|
||||
- :version-ref:`rocWMMA rocm_version`
|
||||
- Accelerates warp-level matrix-multiply and matrix-accumulate to speed up matrix
|
||||
multiplication (GEMM) and accumulation operations with mixed precision
|
||||
support.
|
||||
|
||||
@@ -4,6 +4,8 @@
|
||||
:description: TensorFlow compatibility
|
||||
:keywords: GPU, TensorFlow compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
*******************************************************************************
|
||||
TensorFlow compatibility
|
||||
*******************************************************************************
|
||||
@@ -54,7 +56,7 @@ Docker image compatibility
|
||||
AMD validates and publishes ready-made `TensorFlow images
|
||||
<https://hub.docker.com/r/rocm/tensorflow>`_ with ROCm backends on
|
||||
Docker Hub. The following Docker image tags and associated inventories are
|
||||
validated for `ROCm 6.3.1 <https://repo.radeon.com/rocm/apt/6.3.1/>`_. Click
|
||||
validated for `ROCm 6.3.3 <https://repo.radeon.com/rocm/apt/6.3.3/>`_. Click
|
||||
the |docker-icon| icon to view the image on Docker Hub.
|
||||
|
||||
.. list-table:: TensorFlow Docker image components
|
||||
@@ -68,47 +70,47 @@ the |docker-icon| icon to view the image on Docker Hub.
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.3.1-py3.12-tf2.17.0-dev/images/sha256-804121ee4985718277ba7dcec53c57bdade130a1ef42f544b6c48090ad379c17"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
|
||||
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.3.3-py3.12-tf2.17-dev/images/sha256-fd2653f436880366cc874aa24264ca9dabd892d76ccb63fb807debba459bcaaf"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
|
||||
|
||||
- `tensorflow-rocm 2.17.0 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.3/tensorflow_rocm-2.17.0-cp312-cp312-manylinux_2_28_x86_64.whl>`__
|
||||
- dev
|
||||
- `Python 3.12 <https://www.python.org/downloads/release/python-3124/>`_
|
||||
- `Python 3.12.4 <https://www.python.org/downloads/release/python-3124/>`_
|
||||
- `TensorBoard 2.17.1 <https://github.com/tensorflow/tensorboard/tree/2.17.1>`_
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.3.1-py3.10-tf2.17.0-dev/images/sha256-776837ffa945913f6c466bfe477810a11453d21d5b6afb200be1c36e48fbc08e"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
|
||||
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.3.3-py3.10-tf2.17-dev/images/sha256-8a5eb7443798935dd269575e2abae847b702e1dfb06766ab84f081a6314d8b95"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
|
||||
|
||||
- `tensorflow-rocm 2.17.0 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.3/tensorflow_rocm-2.17.0-cp310-cp310-manylinux_2_28_x86_64.whl>`__
|
||||
- dev
|
||||
- `Python 3.10 <https://www.python.org/downloads/release/python-31012/>`_
|
||||
- `TensorBoard 2.17.0 <https://github.com/tensorflow/tensorboard/tree/2.17.0>`_
|
||||
- `Python 3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
|
||||
- `TensorBoard 2.17.1 <https://github.com/tensorflow/tensorboard/tree/2.17.1>`_
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.3.1-py3.12-tf2.16.2-dev/images/sha256-c793e1483e30809c3c28fc5d7805bedc033c73da224f839fff370717cb100944"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
|
||||
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.3.3-py3.12-tf2.16-dev/images/sha256-8fc939b10cdd6d2b11407474880d4c8ab2b52ab6e2d1743c921fc2adbfd0422f"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
|
||||
|
||||
- `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.3/tensorflow_rocm-2.16.2-cp312-cp312-manylinux_2_28_x86_64.whl>`__
|
||||
- dev
|
||||
- `Python 3.12 <https://www.python.org/downloads/release/python-3124/>`_
|
||||
- `Python 3.12.4 <https://www.python.org/downloads/release/python-3124/>`_
|
||||
- `TensorBoard 2.16.2 <https://github.com/tensorflow/tensorboard/tree/2.16.2>`_
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.3.1-py3.10-tf2.16.0-dev/images/sha256-263e78414ae85d7bcd52a025a94131d0a279872a45ed632b9165336dfdcd4443"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
|
||||
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.3.3-py3.10-tf2.16-dev/images/sha256-a4cc6ab23d59fdf5459ceac1f0a603e6c16ae7f885d30e42c0c2b3ac60c2ad10"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
|
||||
|
||||
- `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.3/tensorflow_rocm-2.16.2-cp310-cp310-manylinux_2_28_x86_64.whl>`__
|
||||
- dev
|
||||
- `Python 3.10 <https://www.python.org/downloads/release/python-31012/>`_
|
||||
- `Python 3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
|
||||
- `TensorBoard 2.16.2 <https://github.com/tensorflow/tensorboard/tree/2.16.2>`_
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.3.1-py3.10-tf2.15.0-dev/images/sha256-479046a8477ca701a9494a813ab17e8ab4f6baa54641e65dc8d07629f1e6a880"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
|
||||
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.3.3-py3.10-tf2.15-dev/images/sha256-60887c488421184adcb60b9ed4f72a8bd7bdb64d238e50943ca7cbde38e4aa48"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
|
||||
|
||||
- `tensorflow-rocm 2.15.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.3/tensorflow_rocm-2.15.1-cp310-cp310-manylinux_2_28_x86_64.whl>`_
|
||||
- dev
|
||||
- `Python 3.10 <https://www.python.org/downloads/release/python-31012/>`_
|
||||
- `Python 3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
|
||||
- `TensorBoard 2.15.2 <https://github.com/tensorflow/tensorboard/tree/2.15.2>`_
|
||||
|
||||
Critical ROCm libraries for TensorFlow
|
||||
@@ -117,7 +119,8 @@ Critical ROCm libraries for TensorFlow
|
||||
TensorFlow depends on multiple components and the supported features of those
|
||||
components can affect the TensorFlow ROCm supported feature set. The versions
|
||||
in the following table refer to the first TensorFlow version where the ROCm
|
||||
library was introduced as a dependency.
|
||||
library was introduced as a dependency. The versions described
|
||||
are available in ROCm :version:`rocm_version`.
|
||||
|
||||
.. list-table::
|
||||
:widths: 25, 10, 35, 30
|
||||
@@ -128,43 +131,43 @@ library was introduced as a dependency.
|
||||
- Purpose
|
||||
- Used in
|
||||
* - `hipBLAS <https://github.com/ROCm/hipBLAS>`_
|
||||
- 2.3.0
|
||||
- :version-ref:`hipBLAS rocm_version`
|
||||
- Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
|
||||
matrix and vector operations.
|
||||
- Accelerates operations like ``tf.matmul``, ``tf.linalg.matmul``, and
|
||||
other matrix multiplications commonly used in neural network layers.
|
||||
* - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`_
|
||||
- 0.10.0
|
||||
- :version-ref:`hipBLASLt rocm_version`
|
||||
- Extends hipBLAS with additional optimizations like fused kernels and
|
||||
integer tensor cores.
|
||||
- Optimizes matrix multiplications and linear algebra operations used in
|
||||
layers like dense, convolutional, and RNNs in TensorFlow.
|
||||
* - `hipCUB <https://github.com/ROCm/hipCUB>`_
|
||||
- 3.3.0
|
||||
- :version-ref:`hipCUB rocm_version`
|
||||
- Provides a C++ template library for parallel algorithms for reduction,
|
||||
scan, sort and select.
|
||||
- Supports operations like ``tf.reduce_sum``, ``tf.cumsum``, ``tf.sort``
|
||||
and other tensor operations in TensorFlow, especially those involving
|
||||
scanning, sorting, and filtering.
|
||||
* - `hipFFT <https://github.com/ROCm/hipFFT>`_
|
||||
- 1.0.17
|
||||
- :version-ref:`hipFFT rocm_version`
|
||||
- Accelerates Fast Fourier Transforms (FFT) for signal processing tasks.
|
||||
- Used for operations like signal processing, image filtering, and
|
||||
certain types of neural networks requiring FFT-based transformations.
|
||||
* - `hipSOLVER <https://github.com/ROCm/hipSOLVER>`_
|
||||
- 2.3.0
|
||||
- :version-ref:`hipSOLVER rocm_version`
|
||||
- Provides GPU-accelerated direct linear solvers for dense and sparse
|
||||
systems.
|
||||
- Optimizes linear algebra functions such as solving systems of linear
|
||||
equations, often used in optimization and training tasks.
|
||||
* - `hipSPARSE <https://github.com/ROCm/hipSPARSE>`_
|
||||
- 3.1.2
|
||||
- :version-ref:`hipSPARSE rocm_version`
|
||||
- Optimizes sparse matrix operations for efficient computations on sparse
|
||||
data.
|
||||
- Accelerates sparse matrix operations in models with sparse weight
|
||||
matrices or activations, commonly used in neural networks.
|
||||
* - `MIOpen <https://github.com/ROCm/MIOpen>`_
|
||||
- 3.3.0
|
||||
- :version-ref:`MIOpen rocm_version`
|
||||
- Provides optimized deep learning primitives such as convolutions,
|
||||
pooling,
|
||||
normalization, and activation functions.
|
||||
@@ -172,13 +175,13 @@ library was introduced as a dependency.
|
||||
in TensorFlow for layers like ``tf.nn.conv2d``, ``tf.nn.relu``, and
|
||||
``tf.nn.lstm_cell``.
|
||||
* - `RCCL <https://github.com/ROCm/rccl>`_
|
||||
- 2.21.5
|
||||
- :version-ref:`RCCL rocm_version`
|
||||
- Optimizes for multi-GPU communication for operations like AllReduce and
|
||||
Broadcast.
|
||||
- Distributed data parallel training (``tf.distribute.MirroredStrategy``).
|
||||
Handles communication in multi-GPU setups.
|
||||
* - `rocThrust <https://github.com/ROCm/rocThrust>`_
|
||||
- 3.3.0
|
||||
- :version-ref:`rocThrust rocm_version`
|
||||
- Provides a C++ template library for parallel algorithms like sorting,
|
||||
reduction, and scanning.
|
||||
- Reduction operations like ``tf.reduce_sum``, ``tf.cumsum`` for computing
|
||||
|
||||
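To see how these dependencies surface in user code, the following sketch (assuming a ROCm-enabled TensorFlow build such as the ``tensorflow-rocm`` wheel listed above is installed) exercises a few of the operations named in the table, ``tf.matmul`` and ``tf.reduce_sum``, on the first visible GPU; the tensor sizes are arbitrary.

.. code-block:: python

   # Minimal sketch: confirms GPU visibility and runs ops that the table maps
   # to hipBLAS/hipBLASLt (matmul) and hipCUB/rocThrust (reduction) on ROCm.
   import tensorflow as tf

   gpus = tf.config.list_physical_devices("GPU")
   print("Visible GPUs:", gpus)

   with tf.device("/GPU:0" if gpus else "/CPU:0"):
       a = tf.random.normal([1024, 1024])
       b = tf.random.normal([1024, 1024])
       c = tf.matmul(a, b)       # GPU-accelerated BLAS path
       s = tf.reduce_sum(c)      # GPU-accelerated reduction path
   print("Checksum:", float(s))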
@@ -32,7 +32,7 @@ architecture.
|
||||
|
||||
* [AMD Instinct™ MI250 microarchitecture](./gpu-arch/mi250.md)
|
||||
* [AMD Instinct MI200/CDNA2 ISA](https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf)
|
||||
* [White paper](https://www.amd.com/system/files/documents/amd-cdna2-white-paper.pdf)
|
||||
* [White paper](https://www.amd.com/content/dam/amd/en/documents/instinct-business-docs/white-papers/amd-cdna2-white-paper.pdf)
|
||||
* [Performance counters](./gpu-arch/mi300-mi200-performance-counters.rst)
|
||||
|
||||
:::
|
||||
@@ -45,7 +45,7 @@ architecture.
|
||||
|
||||
* [AMD Instinct™ MI100 microarchitecture](./gpu-arch/mi100.md)
|
||||
* [AMD Instinct MI100/CDNA1 ISA](https://www.amd.com/system/files/TechDocs/instinct-mi100-cdna1-shader-instruction-set-architecture%C2%A0.pdf)
|
||||
* [White paper](https://www.amd.com/system/files/documents/amd-cdna-whitepaper.pdf)
|
||||
* [White paper](https://www.amd.com/content/dam/amd/en/documents/instinct-business-docs/white-papers/amd-cdna-white-paper.pdf)
|
||||
|
||||
:::
|
||||
|
||||
@@ -55,7 +55,6 @@ architecture.
|
||||
* [AMD RDNA3 ISA](https://www.amd.com/system/files/TechDocs/rdna3-shader-instruction-set-architecture-feb-2023_0.pdf)
|
||||
* [AMD RDNA2 ISA](https://www.amd.com/system/files/TechDocs/rdna2-shader-instruction-set-architecture.pdf)
|
||||
* [AMD RDNA ISA](https://www.amd.com/system/files/TechDocs/rdna-shader-instruction-set-architecture.pdf)
|
||||
* [AMD RDNA Architecture White Paper](https://www.amd.com/system/files/documents/rdna-whitepaper.pdf)
|
||||
|
||||
:::
|
||||
|
||||
|
||||
21 docs/conf.py
@@ -6,6 +6,8 @@
|
||||
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
shutil.copy2("../RELEASE.md", "./about/release-notes.md")
|
||||
|
||||
@@ -49,6 +51,9 @@ article_pages = [
|
||||
|
||||
{"file": "how-to/rocm-for-ai/training/index", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/train-a-model", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/prerequisite-system-validation", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/megatron-lm", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/scale-model-training", "os": ["linux"]},
|
||||
|
||||
{"file": "how-to/rocm-for-ai/fine-tuning/index", "os": ["linux"]},
|
||||
@@ -63,7 +68,7 @@ article_pages = [
|
||||
{"file": "how-to/rocm-for-ai/inference/llm-inference-frameworks", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/vllm-benchmark", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/deploy-your-model", "os": ["linux"]},
|
||||
|
||||
|
||||
{"file": "how-to/rocm-for-ai/inference-optimization/index", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference-optimization/model-quantization", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference-optimization/model-acceleration-libraries", "os": ["linux"]},
|
||||
@@ -86,11 +91,16 @@ article_pages = [
|
||||
|
||||
external_toc_path = "./sphinx/_toc.yml"
|
||||
|
||||
extensions = ["rocm_docs", "sphinx_reredirects", "sphinx_sitemap"]
|
||||
# Add the _extensions directory to Python's search path
|
||||
sys.path.append(str(Path(__file__).parent / 'extension'))
|
||||
|
||||
extensions = ["rocm_docs", "sphinx_reredirects", "sphinx_sitemap", "sphinxcontrib.datatemplates", "version-ref"]
|
||||
|
||||
compatibility_matrix_file = str(Path(__file__).parent / 'compatibility/compatibility-matrix-historical-6.0.csv')
|
||||
|
||||
external_projects_current_project = "rocm"
|
||||
|
||||
# Uncomment if facing rate limit exceed issue with local build
|
||||
# Uncomment if facing rate limit exceed issue with local build
|
||||
# external_projects_remote_repository = ""
|
||||
|
||||
html_baseurl = os.environ.get("READTHEDOCS_CANONICAL_URL", "https://rocm-stg.amd.com/")
|
||||
@@ -101,8 +111,9 @@ if os.environ.get("READTHEDOCS", "") == "True":
|
||||
html_theme = "rocm_docs_theme"
|
||||
html_theme_options = {"flavor": "rocm-docs-home"}
|
||||
|
||||
html_static_path = ["sphinx/static/css"]
|
||||
html_css_files = ["rocm_custom.css", "rocm_rn.css"]
|
||||
html_static_path = ["sphinx/static/css", "extension/how-to/rocm-for-ai/inference"]
|
||||
html_css_files = ["rocm_custom.css", "rocm_rn.css", "vllm-benchmark.css"]
|
||||
html_js_files = ["vllm-benchmark.js"]
|
||||
|
||||
html_title = "ROCm Documentation"
|
||||
|
||||
|
||||
@@ -0,0 +1,153 @@
|
||||
vllm_benchmark:
|
||||
unified_docker:
|
||||
latest:
|
||||
pull_tag: rocm/vllm:rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6/images/sha256-9a12ef62bbbeb5a4c30a01f702c8e025061f575aa129f291a49fbd02d6b4d6c9
|
||||
rocm_version: 6.3.1
|
||||
vllm_version: 0.6.6
|
||||
pytorch_version: 2.7.0 (2.7.0a0+git3a58512)
|
||||
model_groups:
|
||||
- group: Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: pyt_vllm_llama-3.1-8b
|
||||
model_repo: meta-llama/Llama-3.1-8B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B
|
||||
precision: float16
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: pyt_vllm_llama-3.1-70b
|
||||
model_repo: meta-llama/Llama-3.1-70B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
|
||||
precision: float16
|
||||
- model: Llama 3.1 405B
|
||||
mad_tag: pyt_vllm_llama-3.1-405b
|
||||
model_repo: meta-llama/Llama-3.1-405B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
|
||||
precision: float16
|
||||
- model: Llama 3.2 11B Vision
|
||||
mad_tag: pyt_vllm_llama-3.2-11b-vision-instruct
|
||||
model_repo: meta-llama/Llama-3.2-11B-Vision-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct
|
||||
precision: float16
|
||||
- model: Llama 2 7B
|
||||
mad_tag: pyt_vllm_llama-2-7b
|
||||
model_repo: meta-llama/Llama-2-7b-chat-hf
|
||||
url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
|
||||
precision: float16
|
||||
- model: Llama 2 70B
|
||||
mad_tag: pyt_vllm_llama-2-70b
|
||||
model_repo: meta-llama/Llama-2-70b-chat-hf
|
||||
url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
|
||||
precision: float16
|
||||
- model: Llama 3.1 70B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-70b_fp8
|
||||
model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
- model: Llama 3.1 405B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-405b_fp8
|
||||
model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
- group: Mistral
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral MoE 8x7B
|
||||
mad_tag: pyt_vllm_mixtral-8x7b
|
||||
model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
precision: float16
|
||||
- model: Mixtral MoE 8x22B
|
||||
mad_tag: pyt_vllm_mixtral-8x22b
|
||||
model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
precision: float16
|
||||
- model: Mistral 7B
|
||||
mad_tag: pyt_vllm_mistral-7b
|
||||
model_repo: mistralai/Mistral-7B-Instruct-v0.3
|
||||
url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
|
||||
precision: float16
|
||||
- model: Mixtral MoE 8x7B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x7b_fp8
|
||||
model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
- model: Mixtral MoE 8x22B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x22b_fp8
|
||||
model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
- model: Mistral 7B FP8
|
||||
mad_tag: pyt_vllm_mistral-7b_fp8
|
||||
model_repo: amd/Mistral-7B-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
|
||||
precision: float8
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: Qwen2 7B
|
||||
mad_tag: pyt_vllm_qwen2-7b
|
||||
model_repo: Qwen/Qwen2-7B-Instruct
|
||||
url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
|
||||
precision: float16
|
||||
- model: Qwen2 72B
|
||||
mad_tag: pyt_vllm_qwen2-72b
|
||||
model_repo: Qwen/Qwen2-72B-Instruct
|
||||
url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
|
||||
precision: float16
|
||||
- group: JAIS
|
||||
tag: jais
|
||||
models:
|
||||
- model: JAIS 13B
|
||||
mad_tag: pyt_vllm_jais-13b
|
||||
model_repo: core42/jais-13b-chat
|
||||
url: https://huggingface.co/core42/jais-13b-chat
|
||||
precision: float16
|
||||
- model: JAIS 30B
|
||||
mad_tag: pyt_vllm_jais-30b
|
||||
model_repo: core42/jais-30b-chat-v3
|
||||
url: https://huggingface.co/core42/jais-30b-chat-v3
|
||||
precision: float16
|
||||
- group: DBRX
|
||||
tag: dbrx
|
||||
models:
|
||||
- model: DBRX Instruct
|
||||
mad_tag: pyt_vllm_dbrx-instruct
|
||||
model_repo: databricks/dbrx-instruct
|
||||
url: https://huggingface.co/databricks/dbrx-instruct
|
||||
precision: float16
|
||||
- model: DBRX Instruct FP8
|
||||
mad_tag: pyt_vllm_dbrx_fp8
|
||||
model_repo: amd/dbrx-instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
|
||||
precision: float8
|
||||
- group: Gemma
|
||||
tag: gemma
|
||||
models:
|
||||
- model: Gemma 2 27B
|
||||
mad_tag: pyt_vllm_gemma-2-27b
|
||||
model_repo: google/gemma-2-27b
|
||||
url: https://huggingface.co/google/gemma-2-27b
|
||||
precision: float16
|
||||
- group: Cohere
|
||||
tag: cohere
|
||||
models:
|
||||
- model: C4AI Command R+ 08-2024
|
||||
mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
|
||||
model_repo: CohereForAI/c4ai-command-r-plus-08-2024
|
||||
url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
|
||||
precision: float16
|
||||
- model: C4AI Command R+ 08-2024 FP8
|
||||
mad_tag: pyt_vllm_command-r-plus_fp8
|
||||
model_repo: amd/c4ai-command-r-plus-FP8-KV
|
||||
url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
|
||||
precision: float8
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek MoE 16B
|
||||
mad_tag: pyt_vllm_deepseek-moe-16b-chat
|
||||
model_repo: deepseek-ai/deepseek-moe-16b-chat
|
||||
url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
|
||||
precision: float16
|
||||
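This data file is consumed at build time by the ``datatemplate:yaml`` directive, but it can also be inspected directly. The sketch below (assuming PyYAML is installed; the file path reflects the location referenced by the docs and may differ in your checkout) lists the Docker pull tag and the MAD tag of every model:

.. code-block:: python

   # Illustrative only: key names mirror the YAML above; the path is assumed.
   import yaml

   path = "docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml"
   with open(path, encoding="utf-8") as f:
       data = yaml.safe_load(f)

   docker = data["vllm_benchmark"]["unified_docker"]["latest"]
   print("Docker pull tag:", docker["pull_tag"])

   for group in data["vllm_benchmark"]["model_groups"]:
       for model in group["models"]:
           print(f'{group["group"]}: {model["model"]} -> {model["mad_tag"]} ({model["precision"]})')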
0 docs/extension/__init__.py (new file)
212 docs/extension/how-to/rocm-for-ai/inference/vllm-benchmark.js (new file)
@@ -0,0 +1,212 @@
|
||||
function ready(proc) {
|
||||
// Check if page is loaded. If so, init.
|
||||
if (document.readyState !== "loading") {
|
||||
proc();
|
||||
} else {
|
||||
// Otherwise, wait for DOMContentLoaded event.
|
||||
document.addEventListener("DOMContentLoaded", proc);
|
||||
}
|
||||
}
|
||||
|
||||
ready(() => {
|
||||
const ModelPicker = {
|
||||
// Selector strings for DOM elements
|
||||
SELECTORS: {
|
||||
CONTAINER: "#vllm-benchmark-ud-params-picker",
|
||||
MODEL_GROUP_BTN: 'div[data-param-k="model-group"][data-param-v]',
|
||||
MODEL_PARAM_BTN: 'div[data-param-k="model"][data-param-v]',
|
||||
MODEL_DOC: "div.model-doc",
|
||||
},
|
||||
CSS_CLASSES: {
|
||||
HIDDEN: "hidden",
|
||||
},
|
||||
ATTRIBUTES: {
|
||||
PARAM_KEY: "data-param-k", // URL search parameter key (i.e., "model")
|
||||
PARAM_VALUE: "data-param-v", // URL search param value (e.g., "pyt_vllm_llama-3.1-8b", "pyt_vllm_llama-3.1-70b") -- these are MAD model tags
|
||||
PARAM_GROUP: "data-param-group", // Model group (e.g., "llama", "mistral")
|
||||
PARAM_STATE: "data-param-state", // Selection state
|
||||
},
|
||||
|
||||
// Cache DOM elements
|
||||
elements: {
|
||||
container: null,
|
||||
modelGroups: null,
|
||||
modelParams: null,
|
||||
modelDocs: null,
|
||||
},
|
||||
|
||||
data: {
|
||||
availableModels: new Set(),
|
||||
modelsByGroup: new Map(),
|
||||
modelToGroupMap: new Map(),
|
||||
formattedModelClassMap: new Map(), //TODO
|
||||
},
|
||||
|
||||
init() {
|
||||
this.elements.container = document.querySelector(
|
||||
this.SELECTORS.CONTAINER,
|
||||
);
|
||||
if (!this.elements.container) return;
|
||||
|
||||
this.cacheDOMElements();
|
||||
if (!this.validateElements()) return;
|
||||
|
||||
this.buildModelData();
|
||||
this.bindEvents();
|
||||
this.initializeState();
|
||||
},
|
||||
|
||||
cacheDOMElements() {
|
||||
const { CONTAINER, MODEL_GROUP_BTN, MODEL_PARAM_BTN, MODEL_DOC } =
|
||||
this.SELECTORS;
|
||||
this.elements = {
|
||||
container: document.querySelector(CONTAINER),
|
||||
modelGroups: document.querySelectorAll(MODEL_GROUP_BTN),
|
||||
modelParams: document.querySelectorAll(MODEL_PARAM_BTN),
|
||||
modelDocs: document.querySelectorAll(MODEL_DOC),
|
||||
};
|
||||
},
|
||||
|
||||
validateElements() {
|
||||
const { modelGroups, modelParams } = this.elements;
|
||||
if (!modelGroups.length || !modelParams.length) {
|
||||
console.warn("Model picker is missing required elements");
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
},
|
||||
|
||||
buildModelData() {
|
||||
const { PARAM_VALUE, PARAM_GROUP } = this.ATTRIBUTES;
|
||||
|
||||
this.elements.modelParams.forEach((model) => {
|
||||
const modelTag = model.getAttribute(PARAM_VALUE);
|
||||
const groupTag = model.getAttribute(PARAM_GROUP);
|
||||
|
||||
if (!modelTag || !groupTag) return;
|
||||
|
||||
this.data.availableModels.add(modelTag);
|
||||
this.data.modelToGroupMap.set(modelTag, groupTag);
|
||||
|
||||
// FIXME: this is because Sphinx auto-formats class names to use dashes
|
||||
this.data.formattedModelClassMap.set(
|
||||
modelTag,
|
||||
modelTag.replace(/[^a-zA-Z0-9]/g, "-"),
|
||||
);
|
||||
|
||||
if (!this.data.modelsByGroup.has(groupTag)) {
|
||||
this.data.modelsByGroup.set(groupTag, []);
|
||||
}
|
||||
this.data.modelsByGroup.get(groupTag).push(modelTag);
|
||||
});
|
||||
},
|
||||
|
||||
// Event listeners for user interactions
|
||||
bindEvents() {
|
||||
const handleInteraction = (event) => {
|
||||
const target = event.target.closest(`[${this.ATTRIBUTES.PARAM_KEY}]`);
|
||||
if (!target) return;
|
||||
|
||||
const paramType = target.getAttribute(this.ATTRIBUTES.PARAM_KEY);
|
||||
const paramValue = target.getAttribute(this.ATTRIBUTES.PARAM_VALUE);
|
||||
|
||||
if (paramType === "model") {
|
||||
const groupTag = target.getAttribute(this.ATTRIBUTES.PARAM_GROUP);
|
||||
if (groupTag) this.updateUI(paramValue, groupTag);
|
||||
} else if (paramType === "model-group") {
|
||||
const firstModelInGroup = this.data.modelsByGroup.get(paramValue)
|
||||
?.[0];
|
||||
if (firstModelInGroup) this.updateUI(firstModelInGroup, paramValue);
|
||||
}
|
||||
};
|
||||
|
||||
this.elements.container.addEventListener("click", handleInteraction);
|
||||
this.elements.container.addEventListener("keydown", (event) => {
|
||||
if (event.key === "Enter" || event.key === " ") {
|
||||
event.preventDefault();
|
||||
handleInteraction(event);
|
||||
}
|
||||
});
|
||||
},
|
||||
|
||||
// Update the page based on the selected model
|
||||
updateUI(modelTag, groupTag) {
|
||||
const validModel = this.setModelSearchParam(modelTag);
|
||||
|
||||
// Update model group buttons
|
||||
this.elements.modelGroups.forEach((group) => {
|
||||
const isSelected =
|
||||
group.getAttribute(this.ATTRIBUTES.PARAM_VALUE) === groupTag;
|
||||
group.setAttribute(
|
||||
this.ATTRIBUTES.PARAM_STATE,
|
||||
isSelected ? "selected" : "",
|
||||
);
|
||||
group.setAttribute("aria-selected", isSelected.toString());
|
||||
});
|
||||
|
||||
// Update model buttons
|
||||
this.elements.modelParams.forEach((model) => {
|
||||
const isInSelectedGroup =
|
||||
model.getAttribute(this.ATTRIBUTES.PARAM_GROUP) === groupTag;
|
||||
const isSelectedModel =
|
||||
model.getAttribute(this.ATTRIBUTES.PARAM_VALUE) === validModel;
|
||||
|
||||
model.classList.toggle(this.CSS_CLASSES.HIDDEN, !isInSelectedGroup);
|
||||
model.setAttribute(
|
||||
this.ATTRIBUTES.PARAM_STATE,
|
||||
isSelectedModel ? "selected" : "",
|
||||
);
|
||||
model.setAttribute("aria-selected", isSelectedModel.toString());
|
||||
});
|
||||
|
||||
// Update visibility of doc sections
|
||||
const formattedClass = this.data.formattedModelClassMap.get(validModel);
|
||||
if (formattedClass) {
|
||||
this.elements.modelDocs.forEach((doc) => {
|
||||
doc.classList.toggle(
|
||||
this.CSS_CLASSES.HIDDEN,
|
||||
!doc.classList.contains(formattedClass),
|
||||
);
|
||||
});
|
||||
}
|
||||
},
|
||||
|
||||
// Get the current model from the URL search parameters.
|
||||
getModelSearchParam() {
|
||||
return new URLSearchParams(location.search).get("model");
|
||||
},
|
||||
|
||||
// Set the model in the URL search parameters, or fallback to the first available one.
|
||||
setModelSearchParam(modelTag) {
|
||||
const defaultModel = [...this.data.availableModels][0];
|
||||
const model = this.data.availableModels.has(modelTag)
|
||||
? modelTag
|
||||
: defaultModel;
|
||||
|
||||
const searchParams = new URLSearchParams(location.search);
|
||||
searchParams.set("model", model);
|
||||
|
||||
history.replaceState(
|
||||
{},
|
||||
"",
|
||||
`${location.pathname}?${searchParams.toString()}`,
|
||||
);
|
||||
return model;
|
||||
},
|
||||
|
||||
// Initialize the UI state based on the current URL search parameter or default values.
|
||||
initializeState() {
|
||||
const currentModel = this.getModelSearchParam();
|
||||
const validModel = this.setModelSearchParam(currentModel);
|
||||
|
||||
const initialGroup = this.data.modelToGroupMap.get(validModel) ??
|
||||
[...this.data.modelsByGroup.keys()][0];
|
||||
|
||||
if (initialGroup) {
|
||||
this.updateUI(validModel, initialGroup);
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
ModelPicker.init();
|
||||
});
|
||||
266 docs/extension/version-ref.py (new file)
@@ -0,0 +1,266 @@
|
||||
from docutils import nodes
|
||||
from docutils.parsers.rst import Directive
|
||||
from sphinx.util import logging
|
||||
import csv
|
||||
from io import StringIO
|
||||
import re
|
||||
import shlex
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class VersionReference(nodes.Inline, nodes.TextElement):
|
||||
"""Represents an inline version reference."""
|
||||
pass
|
||||
|
||||
class VersionSetDirective(Directive):
|
||||
"""Directive for setting version references within a page scope."""
|
||||
required_arguments = 2 # name and value
|
||||
optional_arguments = 0
|
||||
|
||||
def run(self):
|
||||
env = self.state.document.settings.env
|
||||
if not hasattr(env, 'doc_version_refs'):
|
||||
env.doc_version_refs = {}
|
||||
current_doc = env.docname
|
||||
if current_doc not in env.doc_version_refs:
|
||||
env.doc_version_refs[current_doc] = {}
|
||||
|
||||
name, value = self.arguments
|
||||
if name.lower() == 'latest':
|
||||
logger.warning('Cannot override the "latest" keyword with version-set')
|
||||
return []
|
||||
|
||||
# Handle 'latest' value by getting the actual version
|
||||
if value.lower() == 'latest':
|
||||
data = getattr(env, 'compatibility_matrix', None)
|
||||
if data:
|
||||
latest_version = get_latest_rocm_version(data)
|
||||
if latest_version:
|
||||
value = latest_version
|
||||
|
||||
env.doc_version_refs[current_doc][name] = value
|
||||
return []
|
||||
|
||||
def clean_library_name(name):
|
||||
"""Extract library name from RST formatting."""
|
||||
# Handle :doc: format
|
||||
doc_match = re.search(r':doc:`([^<]+)(?:\s+<[^>]+>)?`', name)
|
||||
if doc_match:
|
||||
return doc_match.group(1).strip()
|
||||
|
||||
# Handle other link formats
|
||||
link_match = re.search(r'`([^<]+)(?:\s+<[^>]+>)?`_?', name)
|
||||
if link_match:
|
||||
return link_match.group(1).strip()
|
||||
|
||||
return name.strip()
|
||||
|
||||
def get_latest_rocm_version(data):
|
||||
"""Get the latest ROCm version from the matrix headers."""
|
||||
if not data or len(data) == 0:
|
||||
return None
|
||||
|
||||
# Get all column names except 'ROCm Version'
|
||||
columns = [col for col in data[0].keys() if col != 'ROCm Version']
|
||||
# Return the first column name (assumed to be the latest version)
|
||||
return columns[0] if columns else None
|
||||
|
||||
def version_role(name, rawtext, text, lineno, inliner, options={}, content=[]):
|
||||
"""
|
||||
Role function to print version value.
|
||||
Usage: :version:`version_name`
|
||||
"""
|
||||
try:
|
||||
version_name = text.strip()
|
||||
env = inliner.document.settings.env
|
||||
|
||||
if hasattr(env, 'doc_version_refs'):
|
||||
current_doc = env.docname
|
||||
if current_doc in env.doc_version_refs:
|
||||
doc_refs = env.doc_version_refs[current_doc]
|
||||
if version_name in doc_refs:
|
||||
version = doc_refs[version_name]
|
||||
node = nodes.Text(version)
|
||||
return [node], []
|
||||
|
||||
msg = inliner.reporter.warning(
|
||||
f'No version defined for name {version_name}',
|
||||
line=lineno
|
||||
)
|
||||
return [], [msg]
|
||||
|
||||
except Exception as e:
|
||||
msg = inliner.reporter.error(
|
||||
f'Error looking up version: {str(e)}',
|
||||
line=lineno
|
||||
)
|
||||
prb = inliner.problematic(rawtext, rawtext, msg)
|
||||
return [prb], [msg]
|
||||
|
||||
def version_ref_role(name, rawtext, text, lineno, inliner, options={}, content=[]):
|
||||
"""
|
||||
Role function for version references.
|
||||
Usage: :version-ref:`library_name release`
|
||||
:version-ref:`"library name" release`
|
||||
:version-ref:`library_name latest`
|
||||
:version-ref:`rocm latest`
|
||||
"""
|
||||
try:
|
||||
# Parse the text - handle both quoted and unquoted formats
|
||||
if '"' in text:
|
||||
parts = shlex.split(text)
|
||||
else:
|
||||
parts = text.split()
|
||||
|
||||
if len(parts) != 2:
|
||||
msg = inliner.reporter.error(
|
||||
'Version reference must be in format "library_name release" or "\\"library name\\" release"',
|
||||
line=lineno
|
||||
)
|
||||
prb = inliner.problematic(rawtext, rawtext, msg)
|
||||
return [prb], [msg]
|
||||
|
||||
library_name, release = parts
|
||||
env = inliner.document.settings.env
|
||||
|
||||
# Check if release is a version reference in current document
|
||||
if hasattr(env, 'doc_version_refs'):
|
||||
current_doc = env.docname
|
||||
if current_doc in env.doc_version_refs:
|
||||
doc_refs = env.doc_version_refs[current_doc]
|
||||
if release in doc_refs:
|
||||
release = doc_refs[release]
|
||||
|
||||
# Handle special case for "rocm latest"
|
||||
if library_name.lower() == 'rocm' and release.lower() == 'latest':
|
||||
data = getattr(env, 'compatibility_matrix', None)
|
||||
if not data:
|
||||
raise ValueError("Compatibility matrix not found in environment")
|
||||
|
||||
latest_version = get_latest_rocm_version(data)
|
||||
if latest_version:
|
||||
node = VersionReference()
|
||||
node += nodes.Text(latest_version)
|
||||
return [node], []
|
||||
else:
|
||||
msg = inliner.reporter.warning(
|
||||
'No ROCm versions found in compatibility matrix',
|
||||
line=lineno
|
||||
)
|
||||
return [], [msg]
|
||||
|
||||
version = lookup_version(inliner, library_name, release)
|
||||
|
||||
if version:
|
||||
node = VersionReference()
|
||||
node += nodes.Text(version)
|
||||
return [node], []
|
||||
else:
|
||||
msg = inliner.reporter.warning(
|
||||
f'No version found for library {library_name} in release {release}',
|
||||
line=lineno
|
||||
)
|
||||
return [], [msg]
|
||||
|
||||
except Exception as e:
|
||||
msg = inliner.reporter.error(
|
||||
f'Error looking up version: {str(e)}',
|
||||
line=lineno
|
||||
)
|
||||
prb = inliner.problematic(rawtext, rawtext, msg)
|
||||
return [prb], [msg]
|
||||
|
||||
def lookup_version(inliner, library_name, release):
|
||||
"""Look up the version in the compatibility matrix."""
|
||||
env = inliner.document.settings.env
|
||||
data = getattr(env, 'compatibility_matrix', None)
|
||||
|
||||
if not data:
|
||||
raise ValueError("Compatibility matrix not found in environment")
|
||||
|
||||
# Handle the 'latest' keyword
|
||||
if release.lower() == 'latest':
|
||||
latest_version = get_latest_rocm_version(data)
|
||||
if not latest_version:
|
||||
return None
|
||||
release = latest_version
|
||||
|
||||
# For ROCm, check if the version exists in column headers
|
||||
if library_name.lower() == 'rocm':
|
||||
columns = [col for col in data[0].keys() if col != 'ROCm Version']
|
||||
if release in columns:
|
||||
return release
|
||||
return None
|
||||
|
||||
# Find the library version
|
||||
for row in data:
|
||||
row_lib_name = clean_library_name(row['ROCm Version'])
|
||||
if row_lib_name == library_name:
|
||||
# Get the version, removing any whitespace
|
||||
version = row.get(release, '').strip()
|
||||
if version:
|
||||
return version
|
||||
|
||||
# If not found, try a case-insensitive search
|
||||
for row in data:
|
||||
row_lib_name = clean_library_name(row['ROCm Version'])
|
||||
if row_lib_name.lower() == library_name.lower():
|
||||
version = row.get(release, '').strip()
|
||||
if version:
|
||||
return version
|
||||
|
||||
return None
|
||||
|
||||
def visit_version_reference(self, node):
|
||||
self.body.append('<span class="version-reference">')
|
||||
|
||||
def depart_version_reference(self, node):
|
||||
self.body.append('</span>')
|
||||
|
||||
def load_compatibility_matrix(app):
|
||||
"""Load the compatibility matrix content from CSV."""
|
||||
if not app.config.compatibility_matrix_file:
|
||||
logger.warning('No compatibility matrix file configured')
|
||||
return
|
||||
|
||||
try:
|
||||
with open(app.config.compatibility_matrix_file, 'r', encoding='utf-8') as f:
|
||||
reader = csv.DictReader(f)
|
||||
app.env.compatibility_matrix = list(reader)
|
||||
logger.info('Successfully loaded compatibility matrix')
|
||||
|
||||
# Debug: print first few rows with their library names
|
||||
for row in list(app.env.compatibility_matrix)[:5]:
|
||||
if 'ROCm Version' in row:
|
||||
lib_name = clean_library_name(row['ROCm Version'])
|
||||
logger.debug(f"Loaded library: {lib_name}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f'Error loading compatibility matrix: {str(e)}')
|
||||
|
||||
def purge_version_refs(app, env, docname):
|
||||
"""Remove version references for a document when it is purged"""
|
||||
if hasattr(env, 'doc_version_refs'):
|
||||
if docname in env.doc_version_refs:
|
||||
del env.doc_version_refs[docname]
|
||||
|
||||
def setup(app):
|
||||
app.add_node(VersionReference,
|
||||
html=(visit_version_reference, depart_version_reference))
|
||||
app.add_role('version-ref', version_ref_role)
|
||||
app.add_role('version', version_role)
|
||||
app.add_directive('version-set', VersionSetDirective)
|
||||
|
||||
# Add a config value for the compatibility matrix file path
|
||||
app.add_config_value('compatibility_matrix_file', None, 'env')
|
||||
|
||||
# Connect to the builder-inited event to load the matrix
|
||||
app.connect('builder-inited', load_compatibility_matrix)
|
||||
|
||||
# Connect to env-purge-doc event to clean up document-specific version refs
|
||||
app.connect('env-purge-doc', purge_version_refs)
|
||||
|
||||
return {
|
||||
'parallel_read_safe': True,
|
||||
'parallel_write_safe': True,
|
||||
}
|
||||
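The roles defined above resolve versions against the compatibility matrix CSV configured through ``compatibility_matrix_file``. As a standalone approximation of that lookup (the two sample rows are illustrative, not the full matrix, and the helper names here are hypothetical), the core logic reduces to:

.. code-block:: python

   # Self-contained sketch of the lookup performed by :version-ref:.
   # Column layout assumed: "ROCm Version" (library name) plus one column per release.
   import csv
   from io import StringIO

   sample_csv = (
       "ROCm Version,6.3.1,6.3.0\n"
       "`hipBLAS <https://github.com/ROCm/hipBLAS>`_,2.3.0,2.3.0\n"
       "`MIOpen <https://github.com/ROCm/MIOpen>`_,3.3.0,3.3.0\n"
   )
   rows = list(csv.DictReader(StringIO(sample_csv)))

   def clean(name):
       # Strip RST link markup, e.g. "`hipBLAS <url>`_" -> "hipBLAS"
       return name.split("<")[0].strip("`_ ").strip()

   def lookup(library, release):
       releases = [c for c in rows[0] if c != "ROCm Version"]
       release = releases[0] if release == "latest" else release
       for row in rows:
           if clean(row["ROCm Version"]).lower() == library.lower():
               return (row.get(release) or "").strip() or None
       return None

   print(lookup("hipBLAS", "latest"))   # -> 2.3.0
   print(lookup("MIOpen", "6.3.0"))     # -> 3.3.0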
@@ -9,422 +9,266 @@ LLM inference performance validation on AMD Instinct MI300X
|
||||
|
||||
.. _vllm-benchmark-unified-docker:
|
||||
|
||||
The `ROCm vLLM Docker <https://hub.docker.com/r/rocm/vllm/tags>`_ image offers
|
||||
a prebuilt, optimized environment for validating large language model (LLM)
|
||||
inference performance on the AMD Instinct™ MI300X accelerator. This ROCm vLLM
|
||||
Docker image integrates vLLM and PyTorch tailored specifically for the MI300X
|
||||
accelerator and includes the following components:
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
|
||||
|
||||
* `ROCm 6.3.1 <https://github.com/ROCm/ROCm>`_
|
||||
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
|
||||
{% set model_groups = data.vllm_benchmark.model_groups %}
|
||||
|
||||
* `vLLM 0.6.6 <https://docs.vllm.ai/en/latest>`_
|
||||
The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
|
||||
a prebuilt, optimized environment for validating large language model (LLM)
|
||||
inference performance on the AMD Instinct™ MI300X accelerator. This ROCm vLLM
|
||||
Docker image integrates vLLM and PyTorch tailored specifically for the MI300X
|
||||
accelerator and includes the following components:
|
||||
|
||||
* `PyTorch 2.7.0 (2.7.0a0+git3a58512) <https://github.com/pytorch/pytorch>`_
|
||||
* `ROCm {{ unified_docker.rocm_version }} <https://github.com/ROCm/ROCm>`_
|
||||
|
||||
With this Docker image, you can quickly validate the expected inference
|
||||
performance numbers for the MI300X accelerator. This topic also provides tips on
|
||||
optimizing performance with popular AI models. For more information, see the lists of
|
||||
:ref:`available models for MAD-integrated benchmarking <vllm-benchmark-mad-models>`
|
||||
and :ref:`standalone benchmarking <vllm-benchmark-standalone-options>`.
|
||||
* `vLLM {{ unified_docker.vllm_version }} <https://docs.vllm.ai/en/latest>`_
|
||||
|
||||
.. _vllm-benchmark-vllm:
|
||||
* `PyTorch {{ unified_docker.pytorch_version }} <https://github.com/pytorch/pytorch>`_
|
||||
|
||||
.. note::
|
||||
With this Docker image, you can quickly validate the expected inference
|
||||
performance numbers for the MI300X accelerator. This topic also provides tips on
|
||||
optimizing performance with popular AI models.
|
||||
|
||||
vLLM is a toolkit and library for LLM inference and serving. AMD implements
|
||||
high-performance custom kernels and modules in vLLM to enhance performance.
|
||||
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
|
||||
more information.
|
||||
.. _vllm-benchmark-available-models:
|
||||
|
||||
Getting started
|
||||
===============
|
||||
Available models
|
||||
================
|
||||
|
||||
Use the following procedures to reproduce the benchmark results on an
|
||||
MI300X accelerator with the prebuilt vLLM Docker image.
|
||||
.. raw:: html
|
||||
|
||||
.. _vllm-benchmark-get-started:
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row">
|
||||
<div class="col-2 me-2 model-param-head">Model</div>
|
||||
<div class="row col-10">
|
||||
{% for model_group in model_groups %}
|
||||
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
1. Disable NUMA auto-balancing.
|
||||
<div class="row mt-1">
|
||||
<div class="col-2 me-2 model-param-head">Model variant</div>
|
||||
<div class="row col-10">
|
||||
{% for model_group in model_groups %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if models|length % 3 == 0 %}
|
||||
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% else %}
|
||||
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
|
||||
might hang until the periodic balancing is finalized. For more information,
|
||||
see :ref:`AMD Instinct MI300X system optimization <mi300x-disable-numa>`.
|
||||
.. _vllm-benchmark-vllm:
|
||||
|
||||
.. code-block:: shell
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
# disable automatic NUMA balancing
|
||||
sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
|
||||
# check if NUMA balancing is disabled (returns 0 if disabled)
|
||||
cat /proc/sys/kernel/numa_balancing
|
||||
0
|
||||
.. container:: model-doc {{model.mad_tag}}
|
||||
|
||||
2. Download the :ref:`ROCm vLLM Docker image <vllm-benchmark-unified-docker>`.
|
||||
.. note::
|
||||
|
||||
Use the following command to pull the Docker image from Docker Hub.
|
||||
See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
|
||||
Some models require access authorization prior to use via an external license agreement through a third party.
|
||||
|
||||
.. code-block:: shell
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
docker pull rocm/vllm:rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6
|
||||
|
||||
Once the setup is complete, choose between two options to reproduce the
|
||||
benchmark results:
|
||||
.. note::
|
||||
|
||||
- :ref:`MAD-integrated benchmarking <vllm-benchmark-mad>`
|
||||
vLLM is a toolkit and library for LLM inference and serving. AMD implements
|
||||
high-performance custom kernels and modules in vLLM to enhance performance.
|
||||
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
|
||||
more information.
|
||||
|
||||
- :ref:`Standalone benchmarking <vllm-benchmark-standalone>`
|
||||
Getting started
|
||||
===============
|
||||
|
||||
.. _vllm-benchmark-mad:
|
||||
Use the following procedures to reproduce the benchmark results on an
|
||||
MI300X accelerator with the prebuilt vLLM Docker image.
|
||||
|
||||
MAD-integrated benchmarking
|
||||
===========================
|
||||
.. _vllm-benchmark-get-started:
|
||||
|
||||
Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
1. Disable NUMA auto-balancing.
|
||||
|
||||
.. code-block:: shell
|
||||
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
|
||||
might hang until the periodic balancing is finalized. For more information,
|
||||
see :ref:`AMD Instinct MI300X system optimization <mi300x-disable-numa>`.
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD
|
||||
pip install -r requirements.txt
|
||||
.. code-block:: shell
|
||||
|
||||
Use this command to run a performance benchmark test of the Llama 3.1 8B model
|
||||
on one GPU with ``float16`` data type in the host machine.
|
||||
# disable automatic NUMA balancing
|
||||
sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
|
||||
# check if NUMA balancing is disabled (returns 0 if disabled)
|
||||
cat /proc/sys/kernel/numa_balancing
|
||||
0
|
||||
|
||||
.. code-block:: shell
|
||||
2. Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
|
||||
|
||||
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||
python3 tools/run_models.py --tags pyt_vllm_llama-3.1-8b --keep-model-dir --live-output --timeout 28800
|
||||
Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
ROCm MAD launches a Docker container with the name
|
||||
``container_ci-pyt_vllm_llama-3.1-8b``. The latency and throughput reports of the
|
||||
model are collected in the following path: ``~/MAD/reports_float16/``.
|
||||
.. code-block:: shell
|
||||
|
||||
Although the following models are preconfigured to collect latency and
|
||||
throughput performance data, you can also change the benchmarking parameters.
|
||||
Refer to the :ref:`Standalone benchmarking <vllm-benchmark-standalone>` section.
|
||||
docker pull {{ unified_docker.pull_tag }}
|
||||
|
||||
.. _vllm-benchmark-mad-models:
|
||||
Benchmarking
|
||||
============
|
||||
|
||||
Available models
|
||||
----------------
|
||||
Once the setup is complete, choose between two options to reproduce the
|
||||
benchmark results:
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:widths: 2, 3
|
||||
.. _vllm-benchmark-mad:
|
||||
|
||||
* - Model name
|
||||
- Tag
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
* - `Llama 3.1 8B <https://huggingface.co/meta-llama/Llama-3.1-8B>`_
|
||||
- ``pyt_vllm_llama-3.1-8b``
|
||||
.. container:: model-doc {{model.mad_tag}}
|
||||
|
||||
* - `Llama 3.1 70B <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_
|
||||
- ``pyt_vllm_llama-3.1-70b``
|
||||
.. tab-set::
|
||||
|
||||
* - `Llama 3.1 405B <https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct>`_
|
||||
- ``pyt_vllm_llama-3.1-405b``
|
||||
.. tab-item:: MAD-integrated benchmarking
|
||||
|
||||
* - `Llama 3.2 11B Vision <https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct>`_
|
||||
- ``pyt_vllm_llama-3.2-11b-vision-instruct``
|
||||
Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
|
||||
* - `Llama 2 7B <https://huggingface.co/meta-llama/Llama-2-7b-chat-hf>`_
|
||||
- ``pyt_vllm_llama-2-7b``
|
||||
.. code-block:: shell
|
||||
|
||||
* - `Llama 2 70B <https://huggingface.co/meta-llama/Llama-2-70b-chat-hf>`_
|
||||
- ``pyt_vllm_llama-2-70b``
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD
|
||||
pip install -r requirements.txt
|
||||
|
||||
* - `Mixtral MoE 8x7B <https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1>`_
|
||||
- ``pyt_vllm_mixtral-8x7b``
|
||||
Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
|
||||
using one GPU with the ``{{model.precision}}`` data type on the host machine.
|
||||
|
||||
* - `Mixtral MoE 8x22B <https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1>`_
|
||||
- ``pyt_vllm_mixtral-8x22b``
|
||||
.. code-block:: shell
|
||||
|
||||
* - `Mistral 7B <https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3>`_
|
||||
- ``pyt_vllm_mistral-7b``
|
||||
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||
python3 tools/run_models.py --tags {{model.mad_tag}} --keep-model-dir --live-output --timeout 28800
|
||||
|
||||
* - `Qwen2 7B <https://huggingface.co/Qwen/Qwen2-7B-Instruct>`_
|
||||
- ``pyt_vllm_qwen2-7b``
|
||||
MAD launches a Docker container with the name
|
||||
``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
|
||||
model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.
|
||||
|
||||
* - `Qwen2 72B <https://huggingface.co/Qwen/Qwen2-72B-Instruct>`_
|
||||
- ``pyt_vllm_qwen2-72b``
|
||||
Although the :ref:`available models <vllm-benchmark-available-models>` are preconfigured
|
||||
to collect latency and throughput performance data, you can also change the benchmarking
|
||||
parameters. See the standalone benchmarking tab for more information.
|
||||
|
||||
* - `JAIS 13B <https://huggingface.co/core42/jais-13b-chat>`_
|
||||
- ``pyt_vllm_jais-13b``
|
||||
.. tab-item:: Standalone benchmarking
|
||||
|
||||
* - `JAIS 30B <https://huggingface.co/core42/jais-30b-chat-v3>`_
|
||||
- ``pyt_vllm_jais-30b``
|
||||
Run the vLLM benchmark tool independently by starting the
|
||||
`Docker container <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6/images/sha256-9a12ef62bbbeb5a4c30a01f702c8e025061f575aa129f291a49fbd02d6b4d6c9>`_
|
||||
as shown in the following snippet.
|
||||
|
||||
* - `DBRX Instruct <https://huggingface.co/databricks/dbrx-instruct>`_
|
||||
- ``pyt_vllm_dbrx-instruct``
|
||||
.. code-block::
|
||||
|
||||
* - `Gemma 2 27B <https://huggingface.co/google/gemma-2-27b>`_
|
||||
- ``pyt_vllm_gemma-2-27b``
|
||||
docker pull rocm/vllm:rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6
|
||||
docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name vllm_v0.6.6 rocm/vllm:rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6
|
||||
|
||||
* - `C4AI Command R+ 08-2024 <https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024>`_
|
||||
- ``pyt_vllm_c4ai-command-r-plus-08-2024``
|
||||
In the Docker container, clone the ROCm MAD repository and navigate to the
|
||||
benchmark scripts directory at ``~/MAD/scripts/vllm``.
|
||||
|
||||
* - `DeepSeek MoE 16B <https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat>`_
|
||||
- ``pyt_vllm_deepseek-moe-16b-chat``
|
||||
.. code-block::
|
||||
|
||||
* - `Llama 3.1 70B FP8 <https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV>`_
|
||||
- ``pyt_vllm_llama-3.1-70b_fp8``
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD/scripts/vllm
|
||||
|
||||
* - `Llama 3.1 405B FP8 <https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV>`_
|
||||
- ``pyt_vllm_llama-3.1-405b_fp8``
|
||||
To start the benchmark, use the following command with the appropriate options.
|
||||
|
||||
* - `Mixtral MoE 8x7B FP8 <https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV>`_
|
||||
- ``pyt_vllm_mixtral-8x7b_fp8``
|
||||
.. code-block::
|
||||
|
||||
* - `Mixtral MoE 8x22B FP8 <https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV>`_
|
||||
- ``pyt_vllm_mixtral-8x22b_fp8``
|
||||
./vllm_benchmark_report.sh -s $test_option -m {{model.model_repo}} -g $num_gpu -d {{model.precision}}
|
||||
|
||||
* - `Mistral 7B FP8 <https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV>`_
|
||||
- ``pyt_vllm_mistral-7b_fp8``
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:align: center
|
||||
|
||||
* - `DBRX Instruct FP8 <https://huggingface.co/amd/dbrx-instruct-FP8-KV>`_
|
||||
- ``pyt_vllm_dbrx_fp8``
|
||||
* - Name
|
||||
- Options
|
||||
- Description
|
||||
|
||||
* - `C4AI Command R+ 08-2024 FP8 <https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV>`_
|
||||
- ``pyt_vllm_command-r-plus_fp8``
|
||||
* - ``$test_option``
|
||||
- latency
|
||||
- Measure decoding token latency
|
||||
|
||||
.. _vllm-benchmark-standalone:
|
||||
* -
|
||||
- throughput
|
||||
- Measure token generation throughput
|
||||
|
||||
Standalone benchmarking
|
||||
=======================
|
||||
* -
|
||||
- all
|
||||
- Measure both throughput and latency
|
||||
|
||||
You can run the vLLM benchmark tool independently by starting the
|
||||
`Docker container <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6/images/sha256-9a12ef62bbbeb5a4c30a01f702c8e025061f575aa129f291a49fbd02d6b4d6c9>`_
|
||||
as shown in the following snippet.
|
||||
* - ``$num_gpu``
|
||||
- 1 or 8
|
||||
- Number of GPUs
|
||||
|
||||
.. code-block::
|
||||
* - ``$datatype``
|
||||
- ``float16`` or ``float8``
|
||||
- Data type
|
||||
|
||||
docker pull rocm/vllm:rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6
|
||||
docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name vllm_v0.6.6 rocm/vllm:rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6
|
||||
.. note::
|
||||
|
||||
In the Docker container, clone the ROCm MAD repository and navigate to the
|
||||
benchmark scripts directory at ``~/MAD/scripts/vllm``.
|
||||
The input sequence length, output sequence length, and tensor parallel (TP) are
|
||||
already configured. You don't need to specify them with this script.
|
||||
|
||||
.. code-block::
|
||||
.. note::
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD/scripts/vllm
|
||||
If you encounter the following error, pass your access-authorized Hugging
|
||||
Face token to the gated models.
|
||||
|
||||
Command
|
||||
-------
|
||||
.. code-block::
|
||||
|
||||
To start the benchmark, use the following command with the appropriate options.
|
||||
See :ref:`Options <vllm-benchmark-standalone-options>` for the list of
|
||||
options and their descriptions.
|
||||
OSError: You are trying to access a gated repo.
|
||||
|
||||
.. code-block:: shell
|
||||
# pass your HF_TOKEN
|
||||
export HF_TOKEN=$your_personal_hf_token
|
||||
|
||||
./vllm_benchmark_report.sh -s $test_option -m $model_repo -g $num_gpu -d $datatype
|
||||
Here are some examples of running the benchmark with various options.
|
||||
|
||||
See the :ref:`examples <vllm-benchmark-run-benchmark>` for more information.
|
||||
* Latency benchmark
|
||||
|
||||
.. note::
|
||||
Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with the ``{{model.precision}}`` data type.
|
||||
|
||||
The input sequence length, output sequence length, and tensor parallel (TP) are
|
||||
already configured. You don't need to specify them with this script.
|
||||
.. code-block::
|
||||
|
||||
.. note::
|
||||
./vllm_benchmark_report.sh -s latency -m {{model.model_repo}} -g 8 -d {{model.precision}}
|
||||
|
||||
If you encounter the following error, pass your access-authorized Hugging
|
||||
Face token to the gated models.
|
||||
Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
|
||||
|
||||
.. code-block:: shell
|
||||
* Throughput benchmark
|
||||
|
||||
OSError: You are trying to access a gated repo.
|
||||
Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with the ``{{model.precision}}`` data type.
|
||||
|
||||
# pass your HF_TOKEN
|
||||
export HF_TOKEN=$your_personal_hf_token
|
||||
.. code-block:: shell
|
||||
|
||||
.. _vllm-benchmark-standalone-options:
|
||||
./vllm_benchmark_report.sh -s throughput -m {{model.model_repo}} -g 8 -d {{model.precision}}
|
||||
|
||||
Options and available models
|
||||
----------------------------
|
||||
Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:align: center
|
||||
.. raw:: html
|
||||
|
||||
* - Name
|
||||
- Options
|
||||
- Description
|
||||
<style>
|
||||
mjx-container[jax="CHTML"][display="true"] {
|
||||
text-align: left;
|
||||
margin: 0;
|
||||
}
|
||||
</style>
|
||||
|
||||
* - ``$test_option``
|
||||
- latency
|
||||
- Measure decoding token latency
|
||||
.. note::
|
||||
|
||||
* -
|
||||
- throughput
|
||||
- Measure token generation throughput
|
||||
Throughput is calculated as:
|
||||
|
||||
* -
|
||||
- all
|
||||
- Measure both throughput and latency
|
||||
- .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
|
||||
|
||||
* - ``$model_repo``
|
||||
- ``meta-llama/Llama-3.1-8B-Instruct``
|
||||
- `Llama 3.1 8B <https://huggingface.co/meta-llama/Llama-3.1-8B>`_
|
||||
|
||||
* - (``float16``)
|
||||
- ``meta-llama/Llama-3.1-70B-Instruct``
|
||||
- `Llama 3.1 70B <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_
|
||||
|
||||
* -
|
||||
- ``meta-llama/Llama-3.1-405B-Instruct``
|
||||
- `Llama 3.1 405B <https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct>`_
|
||||
|
||||
* -
|
||||
- ``meta-llama/Llama-3.2-11B-Vision-Instruct``
|
||||
- `Llama 3.2 11B Vision <https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct>`_
|
||||
|
||||
* -
|
||||
- ``meta-llama/Llama-2-7b-chat-hf``
|
||||
- `Llama 2 7B <https://huggingface.co/meta-llama/Llama-2-7b-chat-hf>`_
|
||||
|
||||
* -
|
||||
- ``meta-llama/Llama-2-70b-chat-hf``
|
||||
- `Llama 2 70B <https://huggingface.co/meta-llama/Llama-2-70b-chat-hf>`_
|
||||
|
||||
* -
|
||||
- ``mistralai/Mixtral-8x7B-Instruct-v0.1``
|
||||
- `Mixtral MoE 8x7B <https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1>`_
|
||||
|
||||
* -
|
||||
- ``mistralai/Mixtral-8x22B-Instruct-v0.1``
|
||||
- `Mixtral MoE 8x22B <https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1>`_
|
||||
|
||||
* -
|
||||
- ``mistralai/Mistral-7B-Instruct-v0.3``
|
||||
- `Mistral 7B <https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3>`_
|
||||
|
||||
* -
|
||||
- ``Qwen/Qwen2-7B-Instruct``
|
||||
- `Qwen2 7B <https://huggingface.co/Qwen/Qwen2-7B-Instruct>`_
|
||||
|
||||
* -
|
||||
- ``Qwen/Qwen2-72B-Instruct``
|
||||
- `Qwen2 72B <https://huggingface.co/Qwen/Qwen2-72B-Instruct>`_
|
||||
|
||||
* -
|
||||
- ``core42/jais-13b-chat``
|
||||
- `JAIS 13B <https://huggingface.co/core42/jais-13b-chat>`_
|
||||
|
||||
* -
|
||||
- ``core42/jais-30b-chat-v3``
|
||||
- `JAIS 30B <https://huggingface.co/core42/jais-30b-chat-v3>`_
|
||||
|
||||
* -
|
||||
- ``databricks/dbrx-instruct``
|
||||
- `DBRX Instruct <https://huggingface.co/databricks/dbrx-instruct>`_
|
||||
|
||||
* -
|
||||
- ``google/gemma-2-27b``
|
||||
- `Gemma 2 27B <https://huggingface.co/google/gemma-2-27b>`_
|
||||
|
||||
* -
|
||||
- ``CohereForAI/c4ai-command-r-plus-08-2024``
|
||||
- `C4AI Command R+ 08-2024 <https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024>`_
|
||||
|
||||
* -
|
||||
- ``deepseek-ai/deepseek-moe-16b-chat``
|
||||
- `DeepSeek MoE 16B <https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat>`_
|
||||
|
||||
* - ``$model_repo``
|
||||
- ``amd/Llama-3.1-70B-Instruct-FP8-KV``
|
||||
- `Llama 3.1 70B FP8 <https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV>`_
|
||||
|
||||
* - (``float8``)
|
||||
- ``amd/Llama-3.1-405B-Instruct-FP8-KV``
|
||||
- `Llama 3.1 405B FP8 <https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV>`_
|
||||
|
||||
* -
|
||||
- ``amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV``
|
||||
- `Mixtral MoE 8x7B FP8 <https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV>`_
|
||||
|
||||
* -
|
||||
- ``amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV``
|
||||
- `Mixtral MoE 8x22B FP8 <https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV>`_
|
||||
|
||||
* -
|
||||
- ``amd/Mistral-7B-v0.1-FP8-KV``
|
||||
- `Mistral 7B FP8 <https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV>`_
|
||||
|
||||
* -
|
||||
- ``amd/dbrx-instruct-FP8-KV``
|
||||
- `DBRX Instruct FP8 <https://huggingface.co/amd/dbrx-instruct-FP8-KV>`_
|
||||
|
||||
* -
|
||||
- ``amd/c4ai-command-r-plus-FP8-KV``
|
||||
- `C4AI Command R+ 08-2024 FP8 <https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV>`_
|
||||
|
||||
* - ``$num_gpu``
|
||||
- 1 or 8
|
||||
- Number of GPUs
|
||||
|
||||
* - ``$datatype``
|
||||
- ``float16`` or ``float8``
|
||||
- Data type
|
||||
|
||||
.. _vllm-benchmark-run-benchmark:
|
||||
|
||||
Running the benchmark on the MI300X accelerator
|
||||
-----------------------------------------------
|
||||
|
||||
Here are some examples of running the benchmark with various options.
|
||||
See :ref:`Options <vllm-benchmark-standalone-options>` for the list of
|
||||
options and their descriptions.
|
||||
|
||||
Example 1: latency benchmark
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Use this command to benchmark the latency of the Llama 3.1 70B model on eight GPUs with the ``float16`` and ``float8`` data types.
|
||||
|
||||
.. code-block::
|
||||
|
||||
./vllm_benchmark_report.sh -s latency -m meta-llama/Llama-3.1-70B-Instruct -g 8 -d float16
|
||||
./vllm_benchmark_report.sh -s latency -m amd/Llama-3.1-70B-Instruct-FP8-KV -g 8 -d float8
|
||||
|
||||
Find the latency reports at:
|
||||
|
||||
- ``./reports_float16/summary/Llama-3.1-70B-Instruct_latency_report.csv``
|
||||
|
||||
- ``./reports_float8/summary/Llama-3.1-70B-Instruct-FP8-KV_latency_report.csv``
|
||||
|
||||
Example 2: throughput benchmark
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Use this command to benchmark the throughput of the Llama 3.1 70B model on eight GPUs with the ``float16`` and ``float8`` data types.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./vllm_benchmark_report.sh -s throughput -m meta-llama/Llama-3.1-70B-Instruct -g 8 -d float16
|
||||
./vllm_benchmark_report.sh -s throughput -m amd/Llama-3.1-70B-Instruct-FP8-KV -g 8 -d float8
|
||||
|
||||
Find the throughput reports at:
|
||||
|
||||
- ``./reports_float16/summary/Llama-3.1-70B-Instruct_throughput_report.csv``
|
||||
|
||||
- ``./reports_float8/summary/Llama-3.1-70B-Instruct-FP8-KV_throughput_report.csv``
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<style>
|
||||
mjx-container[jax="CHTML"][display="true"] {
|
||||
text-align: left;
|
||||
margin: 0;
|
||||
}
|
||||
</style>
|
||||
|
||||
.. note::
|
||||
|
||||
Throughput is calculated as:
|
||||
|
||||
- .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
|
||||
|
||||
- .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
|
||||
- .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
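To make the two throughput formulas concrete, here is a small worked example; the request count, sequence lengths, and elapsed time are made-up numbers, not benchmark defaults, since the benchmark script reports the real values for you.

.. code-block:: python

   # Illustrative arithmetic only.
   requests = 32
   input_len = 1024      # input sequence length in tokens
   output_len = 512      # output sequence length in tokens
   elapsed_time = 60.0   # seconds

   throughput_tot = requests * (input_len + output_len) / elapsed_time
   throughput_gen = requests * output_len / elapsed_time

   print(f"total tokens/s:     {throughput_tot:.1f}")   # 819.2
   print(f"generated tokens/s: {throughput_gen:.1f}")   # 273.1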
Further reading
|
||||
===============
|
||||
@@ -446,33 +290,3 @@ Further reading
|
||||
|
||||
- To learn how to fine-tune LLMs, see
|
||||
:doc:`Fine-tuning LLMs <../fine-tuning/index>`.
|
||||
|
||||
Previous versions
|
||||
=================
|
||||
|
||||
This table lists previous versions of the ROCm vLLM Docker image for inference
|
||||
performance validation. For detailed information about available models for
|
||||
benchmarking, see the version-specific documentation.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:stub-columns: 1
|
||||
|
||||
* - ROCm version
|
||||
- vLLM version
|
||||
- PyTorch version
|
||||
- Resources
|
||||
|
||||
* - 6.2.1
|
||||
- 0.6.4
|
||||
- 2.5.0
|
||||
-
|
||||
* `Documentation <https://rocm.docs.amd.com/en/docs-6.3.0/how-to/performance-validation/mi300x/vllm-benchmark.html>`_
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4/images/sha256-ccbb74cc9e7adecb8f7bdab9555f7ac6fc73adb580836c2a35ca96ff471890d8>`_
|
||||
|
||||
* - 6.2.0
|
||||
- 0.4.3
|
||||
- 2.4.0
|
||||
-
|
||||
* `Documentation <https://rocm.docs.amd.com/en/docs-6.2.0/how-to/performance-validation/mi300x/vllm-benchmark.html>`_
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50/images/sha256-9e4dd4788a794c3d346d7d0ba452ae5e92d39b8dfac438b2af8efdc7f15d22c0>`_
|
||||
|
||||
@@ -0,0 +1,547 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: How to train a model using Megatron-LM for ROCm.
|
||||
:keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch
|
||||
|
||||
******************************************
|
||||
Training a model with Megatron-LM for ROCm
|
||||
******************************************
|
||||
|
||||
The Megatron-LM framework for ROCm is a specialized fork of the robust Megatron-LM,
|
||||
designed to enable efficient training of large-scale language models on AMD
|
||||
GPUs. By leveraging AMD Instinct™ MI300X series accelerators, Megatron-LM delivers
|
||||
enhanced scalability, performance, and resource utilization for AI workloads.
|
||||
It is purpose-built to support models like Llama 2, Llama 3, Llama 3.1, and
|
||||
DeepSeek, enabling developers to train next-generation AI models more
|
||||
efficiently. See the GitHub repository at `<https://github.com/ROCm/Megatron-LM>`__.
|
||||
|
||||
AMD provides a ready-to-use Docker image for MI300X accelerators containing
|
||||
essential components, including PyTorch, ROCm libraries, and Megatron-LM
|
||||
utilities. It contains the following software components to accelerate training
|
||||
workloads:
|
||||
|
||||
+--------------------------+--------------------------------+
|
||||
| Software component | Version |
|
||||
+==========================+================================+
|
||||
| ROCm | 6.3.0 |
|
||||
+--------------------------+--------------------------------+
|
||||
| PyTorch | 2.7.0a0+git637433 |
|
||||
+--------------------------+--------------------------------+
|
||||
| Python | 3.10 |
|
||||
+--------------------------+--------------------------------+
|
||||
| Transformer Engine | 1.11 |
|
||||
+--------------------------+--------------------------------+
|
||||
| Flash Attention | 3.0.0 |
|
||||
+--------------------------+--------------------------------+
|
||||
| hipBLASLt | git258a2162 |
|
||||
+--------------------------+--------------------------------+
|
||||
| Triton | 3.1 |
|
||||
+--------------------------+--------------------------------+
|
||||
|
||||
Supported features and models
|
||||
=============================
|
||||
|
||||
Megatron-LM provides the following key features to train large language models efficiently:
|
||||
|
||||
- Transformer Engine (TE)
|
||||
|
||||
- APEX
|
||||
|
||||
- GEMM tuning
|
||||
|
||||
- Torch.compile
|
||||
|
||||
- 3D parallelism: TP + SP + CP
|
||||
|
||||
- Distributed optimizer
|
||||
|
||||
- Flash Attention (FA) 3
|
||||
|
||||
- Fused kernels
|
||||
|
||||
- Pre-training
|
||||
|
||||
.. _amd-megatron-lm-model-support:
|
||||
|
||||
The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.
|
||||
|
||||
* Llama 2 7B
|
||||
|
||||
* Llama 2 70B
|
||||
|
||||
* Llama 3 8B
|
||||
|
||||
* Llama 3 70B
|
||||
|
||||
* Llama 3.1 8B
|
||||
|
||||
* Llama 3.1 70B
|
||||
|
||||
* DeepSeek-V2-Lite
|
||||
|
||||
.. note::
|
||||
|
||||
Some models, such as Llama 3, require an external license agreement through
|
||||
a third party (for example, Meta).
|
||||
|
||||
System validation
|
||||
=================
|
||||
|
||||
If you have already validated your system settings, skip this step. Otherwise,
|
||||
complete the :ref:`system validation and optimization steps <train-a-model-system-validation>`
|
||||
to set up your system before starting training.
|
||||
|
||||
Disable NUMA auto-balancing
|
||||
---------------------------
|
||||
|
||||
Generally, application performance can benefit from disabling NUMA auto-balancing. However,
|
||||
it might be detrimental to performance with certain types of workloads.
|
||||
|
||||
Run the command ``cat /proc/sys/kernel/numa_balancing`` to check your current NUMA (Non-Uniform
|
||||
Memory Access) settings. Output ``0`` indicates this setting is disabled. If there is no output or
|
||||
the output is ``1``, run the following command to disable NUMA auto-balancing.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
|
||||
|
||||
See :ref:`mi300x-disable-numa` for more information.
|
||||
|
||||
.. _mi300x-amd-megatron-lm-training:
|
||||
|
||||
Environment setup
|
||||
=================
|
||||
|
||||
The pre-built ROCm Megatron-LM environment allows users to quickly validate system performance, conduct
|
||||
training benchmarks, and achieve superior performance for models like Llama 3.1, Llama 2, and DeepSeek V2.
|
||||
|
||||
Use the following instructions to set up the environment, configure the script to train models, and
|
||||
reproduce the benchmark results on the MI300X accelerators with the AMD Megatron-LM Docker
|
||||
image.
|
||||
|
||||
.. _amd-megatron-lm-requirements:
|
||||
|
||||
Download the Docker image
|
||||
-------------------------
|
||||
|
||||
1. Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull rocm/megatron-lm:v25.3
|
||||
|
||||
2. Launch the Docker container.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 64G --name megatron_training_env rocm/megatron-lm:v25.3
|
||||
|
||||
3. Use these commands if you exit the ``megatron_training_env`` container and need to return to it.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker start megatron_training_env
|
||||
docker exec -it megatron_training_env bash
|
||||
|
||||
The Docker container includes a pre-installed, verified version of Megatron-LM from the `release branch <https://github.com/ROCm/Megatron-LM/tree/megatron_release_v25.3>`_.
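
If you want to confirm which Megatron-LM commit is installed, a quick optional check is sketched
below. It assumes the repository is located at ``/workspace/Megatron-LM``, the path used by the
training commands later in this guide.

.. code-block:: shell

cd /workspace/Megatron-LM
git log --oneline -1   # prints the pre-installed release commit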
|
||||
|
||||
.. _amd-megatron-lm-environment-setup:
|
||||
|
||||
Configuration scripts
|
||||
---------------------
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Llama
|
||||
:sync: llama
|
||||
|
||||
If you're working with Llama 2 7B or Llama 2 70B, use the ``train_llama2.sh`` configuration
|
||||
script in the ``examples/llama`` directory of
|
||||
`<https://github.com/ROCm/Megatron-LM/tree/megatron_release_v25.3/examples/llama>`__.
|
||||
Likewise, if you're working with Llama 3 or Llama 3.1, then use ``train_llama3.sh`` and update
|
||||
the configuration script accordingly.
|
||||
|
||||
.. tab-item:: DeepSeek V2
|
||||
:sync: deepseek
|
||||
|
||||
Use the ``train_deepseek_v2.sh`` configuration script in the ``examples/deepseek_v2``
|
||||
directory of
|
||||
`<https://github.com/ROCm/Megatron-LM/tree/megatron_release_v25.3/examples/deepseek_v2>`__
|
||||
and update the configuration script accordingly.
|
||||
|
||||
Network interface
|
||||
^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Llama
|
||||
:sync: llama
|
||||
|
||||
To avoid connectivity issues in multi-node deployments, ensure the correct network interface
|
||||
is set in your training scripts.
|
||||
|
||||
1. Run the following command (outside the container) to find the active network interface on your system.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
ip a
|
||||
|
||||
2. Update the ``NCCL_SOCKET_IFNAME`` and ``GLOO_SOCKET_IFNAME`` variables with your system’s network interface. For
|
||||
example:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export NCCL_SOCKET_IFNAME=ens50f0np0
|
||||
|
||||
export GLOO_SOCKET_IFNAME=ens50f0np0
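
If you'd rather detect the interface programmatically, the following is a minimal sketch. It
assumes a typical Linux setup with ``iproute2`` and GNU ``grep`` available, and it picks the
interface that carries the default route; verify the result matches the interface you intend to use.

.. code-block:: shell

IFACE=$(ip route get 1.1.1.1 | grep -oP 'dev \K\S+' | head -1)  # interface used to reach an external address
export NCCL_SOCKET_IFNAME=$IFACE
export GLOO_SOCKET_IFNAME=$IFACE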
|
||||
|
||||
Dataset options
|
||||
^^^^^^^^^^^^^^^
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Llama
|
||||
:sync: llama
|
||||
|
||||
You can use either mock data or real data for training.
|
||||
|
||||
* Mock data can be useful for testing and validation. Use the ``MOCK_DATA`` variable to toggle between mock and real data. The default
|
||||
value is ``1`` for enabled.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
MOCK_DATA=1
|
||||
|
||||
* If you're using a real dataset, update the ``DATA_PATH`` variable to point to the location of your dataset.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
MOCK_DATA=0
|
||||
|
||||
DATA_PATH=${DATA_PATH:-"/data/bookcorpus_text_sentence"} # Change to where your dataset is stored
|
||||
|
||||
Ensure that the files are accessible inside the Docker container.
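
If your dataset lives outside the directories already mounted into the container, one way to make
it visible is an extra bind mount on the ``docker run`` command shown earlier. A minimal sketch,
assuming the dataset is stored under ``/data`` on the host:

.. code-block:: shell

# add a bind mount for the dataset directory (host path : container path)
docker run -v /data:/data <other options from the launch command above> rocm/megatron-lm:v25.3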
|
||||
|
||||
.. tab-item:: DeepSeek V2
|
||||
:sync: deepseek
|
||||
|
||||
If you don't already have the dataset, download the DeepSeek dataset using the following
|
||||
commands:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
mkdir deepseek-datasets
|
||||
cd deepseek-datasets
|
||||
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/SlimPajama.json
|
||||
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-train.json
|
||||
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-valid.json
|
||||
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.bin
|
||||
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.idx
|
||||
|
||||
You can use either mock data or real data for training.
|
||||
|
||||
* Mock data can be useful for testing and validation. Use the ``MOCK_DATA`` variable to toggle between mock and real data. The default
|
||||
value is ``1`` for enabled.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
MOCK_DATA=1
|
||||
|
||||
* If you're using a real dataset, update the ``DATA_DIR`` variable to point to the location of your dataset.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
MOCK_DATA=0
|
||||
|
||||
DATA_DIR="/root/data/deepseek-datasets" # Change to where your dataset is stored
|
||||
|
||||
Ensure that the files are accessible inside the Docker container.
|
||||
|
||||
Tokenizer
|
||||
^^^^^^^^^
|
||||
|
||||
Tokenization is the process of converting raw text into tokens that can be processed by the model. For Llama
|
||||
models, this typically involves sub-word tokenization, where words are broken down into smaller units based on
|
||||
a fixed vocabulary. The tokenizer is trained along with the model on a large corpus of text, and it learns a
|
||||
fixed vocabulary that can represent a wide range of text from different domains. This allows Llama models to
|
||||
handle a variety of input sequences, including unseen words or domain-specific terms.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Llama
|
||||
:sync: llama
|
||||
|
||||
To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``Llama2Tokenizer``.
|
||||
|
||||
To train any of Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``.
|
||||
Set the Hugging Face model link in the ``TOKENIZER_MODEL`` variable.
|
||||
|
||||
For example, if you're using the Llama 3.1 8B model:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
TOKENIZER_MODEL=meta-llama/Llama-3.1-8B
|
||||
|
||||
.. tab-item:: DeepSeek V2
|
||||
:sync: deepseek
|
||||
|
||||
To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``DeepSeekV2Tokenizer``.
|
||||
|
||||
Multi-node training
|
||||
^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Llama
|
||||
:sync: llama
|
||||
|
||||
If you're running multi-node training, update the following environment variables. They can
|
||||
also be passed as command line arguments.
|
||||
|
||||
* Change ``localhost`` to the master node's hostname:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
MASTER_ADDR="${MASTER_ADDR:-localhost}"
|
||||
|
||||
* Set the number of nodes you want to train on (for instance, ``2``, ``4``, ``8``):
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
NNODES="${NNODES:-1}"
|
||||
|
||||
* Set the rank of each node (0 for master, 1 for the first worker node, and so on):
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
NODE_RANK="${NODE_RANK:-0}"
|
||||
|
||||
* Set ``DATA_CACHE_PATH`` to a common directory accessible by all the nodes (for example, an
|
||||
NFS directory) for multi-node runs:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
DATA_CACHE_PATH=/root/cache # Set to a common directory for multi-node runs
|
||||
|
||||
* For multi-node runs, make sure the correct network drivers are installed on the nodes. If you're
running inside a Docker container, either install the drivers inside the container or pass them
through from the host when creating the container.
|
||||
|
||||
Start training on AMD Instinct accelerators
|
||||
===========================================
|
||||
|
||||
The prebuilt Megatron-LM with ROCm training environment allows users to quickly validate
|
||||
system performance, conduct training benchmarks, and achieve superior
|
||||
performance for models like Llama 3.1 and Llama 2. This container should not be
|
||||
expected to provide generalized performance across all training workloads. You
|
||||
can expect the container to perform in the model configurations described in
|
||||
the following section, but other configurations are not validated by AMD.
|
||||
|
||||
Use the following instructions to set up the environment, configure the script
|
||||
to train models, and reproduce the benchmark results on MI300X series
|
||||
accelerators with the AMD Megatron-LM Docker image.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Llama
|
||||
:sync: llama
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Single node training
|
||||
:sync: single-node
|
||||
|
||||
To run training on a single node, navigate to the Megatron-LM folder and use the
|
||||
following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
TEE_OUTPUT=1 MBS=2 BS=128 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 bash examples/llama/train_llama3.sh
|
||||
|
||||
.. tab-item:: Multi-node training
|
||||
:sync: multi-node
|
||||
|
||||
To run training on multiple nodes, launch the Docker container on each node. For example, for a two node setup (``NODE0`` as the master node), use these commands.
|
||||
|
||||
* On the master node ``NODE0``:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
TEE_OUTPUT=1 MBS=2 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 MASTER_ADDR=IP_NODE0 NNODES=2 NODE_RANK=0 bash examples/llama/train_llama3.sh
|
||||
|
||||
* On the worker node ``NODE1``:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
TEE_OUTPUT=1 MBS=2 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 MASTER_ADDR=IP_NODE0 NNODES=2 NODE_RANK=1 bash examples/llama/train_llama3.sh
|
||||
|
||||
|
||||
.. tab-item:: DeepSeek V2
|
||||
:sync: deepseek
|
||||
|
||||
To run the training on a single node, go to the ``/workspace/Megatron-LM`` folder and use the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
cd /workspace/Megatron-LM
|
||||
GEMM_TUNING=1 PR=bf16 MBS=4 AC=none bash examples/deepseek_v2/train_deepseekv2.sh
|
||||
|
||||
Key options
|
||||
-----------
|
||||
|
||||
.. _amd-megatron-lm-benchmark-test-vars:
|
||||
|
||||
The benchmark tests support the following sets of variables:
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Llama
|
||||
:sync: llama
|
||||
|
||||
``TEE_OUTPUT``
|
||||
``1`` to enable training logs or ``0`` to disable.
|
||||
|
||||
``TE_FP8``
|
||||
``0`` for BF16 (default) or ``1`` for FP8 GEMMs.
|
||||
|
||||
``GEMM_TUNING``
|
||||
``1`` to enable GEMM tuning, which boosts performance by using the best GEMM kernels.
|
||||
|
||||
``USE_FLASH_ATTN``
|
||||
``1`` to enable Flash Attention.
|
||||
|
||||
``ENABLE_PROFILING``
|
||||
``1`` to enable PyTorch profiling for performance analysis.
|
||||
|
||||
``transformer-impl``
|
||||
``transformer_engine`` to use the Transformer Engine (TE) or ``local`` to disable TE.
|
||||
|
||||
``MODEL_SIZE``
|
||||
``8B`` or ``70B`` for Llama 3 and 3.1. ``7B`` or ``70B`` for Llama 2.
|
||||
|
||||
``TOTAL_ITERS``
|
||||
The total number of iterations -- ``10`` by default.
|
||||
|
||||
``MOCK_DATA``
|
||||
``1`` to use mock data or ``0`` to use real data provided by you.
|
||||
|
||||
``MBS``
|
||||
Micro batch size.
|
||||
|
||||
``BS``
|
||||
Global batch size.
|
||||
|
||||
``TP``
|
||||
Tensor parallel (``1``, ``2``, ``4``, ``8``).
|
||||
|
||||
``SEQ_LENGTH``
|
||||
Input sequence length.
|
||||
|
||||
.. tab-item:: DeepSeek V2
|
||||
:sync: deepseek
|
||||
|
||||
``PR``
|
||||
Precision for training. ``bf16`` for BF16 (default) or ``fp8`` for FP8 GEMMs.
|
||||
|
||||
``GEMM_TUNING``
|
||||
``1`` to enable GEMM tuning, which boosts performance by using the best GEMM kernels.
|
||||
|
||||
``TOTAL_ITERS``
|
||||
The total number of iterations -- ``10`` by default.
|
||||
|
||||
``MOCK_DATA``
|
||||
``1`` to use mock data or ``0`` to use real data provided by you.
|
||||
|
||||
``MBS``
|
||||
Micro batch size.
|
||||
|
||||
``GBS``
|
||||
Global batch size.
|
||||
|
||||
Benchmarking examples
|
||||
---------------------
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Llama
|
||||
:sync: llama
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Single node training
|
||||
:sync: single-node
|
||||
|
||||
Use this command to run training with Llama 2 7B model on a single node. You can specify MBS, BS, FP,
|
||||
datatype, and so on.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
TEE_OUTPUT=1 MBS=5 BS=120 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 \
SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
|
||||
|
||||
You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup>`.
|
||||
|
||||
See the sample output:
|
||||
|
||||
.. image:: ../../../../data/how-to/rocm-for-ai/llama2-7b-training-log-sample.png
|
||||
:width: 800
|
||||
|
||||
.. tab-item:: Multi-node training
|
||||
:sync: multi-node
|
||||
|
||||
Launch the Docker container on each node.
|
||||
|
||||
In this example, run training with Llama 2 7B model on 2 nodes with specific MBS, BS, FP, datatype, and
|
||||
so on.
|
||||
|
||||
On the master node:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 \
SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
|
||||
|
||||
On the worker node:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 \
SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
|
||||
|
||||
You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup>`.
|
||||
|
||||
Sample output for 2-node training:
|
||||
|
||||
Master node:
|
||||
|
||||
.. image:: ../../../../data/how-to/rocm-for-ai/2-node-training-master.png
|
||||
:width: 800
|
||||
|
||||
Worker node:
|
||||
|
||||
.. image:: ../../../../data/how-to/rocm-for-ai/2-node-training-worker.png
|
||||
:width: 800
|
||||
|
||||
Previous versions
|
||||
=================
|
||||
|
||||
This table lists previous versions of the ROCm Megatron-LM Docker image for training
|
||||
performance validation. For detailed information about available models for
|
||||
benchmarking, see the version-specific documentation.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:stub-columns: 1
|
||||
|
||||
* - ROCm version
|
||||
- Megatron-LM version
|
||||
- PyTorch version
|
||||
- Resources
|
||||
|
||||
* - 6.1
|
||||
- 24.12-dev
|
||||
- 2.4.0
|
||||
-
|
||||
* `Documentation <https://rocm.docs.amd.com/en/docs-6.3.0/how-to/rocm-for-ai/train-a-model.html>`_
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/megatron-lm/24.12-dev/images/sha256-5818c50334ce3d69deeeb8f589d83ec29003817da34158ebc9e2d112b929bf2e>`_
|
||||
@@ -0,0 +1,341 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: How to train a model using PyTorch for ROCm.
|
||||
:keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
|
||||
|
||||
**************************************
|
||||
Training a model with PyTorch for ROCm
|
||||
**************************************
|
||||
|
||||
PyTorch is an open-source machine learning framework that is widely used for
|
||||
model training with GPU-optimized components for transformer-based models.
|
||||
|
||||
The PyTorch for ROCm training Docker image (``rocm/pytorch-training:v25.3``)
|
||||
provides a prebuilt optimized environment for fine-tuning and pretraining a
|
||||
model on AMD Instinct MI325X and MI300X accelerators. It includes the following
|
||||
software components to accelerate training workloads:
|
||||
|
||||
+--------------------------+--------------------------------+
|
||||
| Software component | Version |
|
||||
+==========================+================================+
|
||||
| ROCm | 6.3.0 |
|
||||
+--------------------------+--------------------------------+
|
||||
| PyTorch | 2.7.0a0+git637433 |
|
||||
+--------------------------+--------------------------------+
|
||||
| Python | 3.10 |
|
||||
+--------------------------+--------------------------------+
|
||||
| Transformer Engine | 1.11 |
|
||||
+--------------------------+--------------------------------+
|
||||
| Flash Attention | 3.0.0 |
|
||||
+--------------------------+--------------------------------+
|
||||
| hipBLASLt | git258a2162 |
|
||||
+--------------------------+--------------------------------+
|
||||
| Triton | 3.1 |
|
||||
+--------------------------+--------------------------------+
|
||||
|
||||
.. _amd-pytorch-training-model-support:
|
||||
|
||||
Supported models
|
||||
================
|
||||
|
||||
The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.
|
||||
|
||||
* Llama 3.1 8B
|
||||
|
||||
* Llama 3.1 70B
|
||||
|
||||
* FLUX.1-dev
|
||||
|
||||
.. note::
|
||||
|
||||
Only these models are supported in the following steps.
|
||||
|
||||
Some models, such as Llama 3, require an external license agreement through
|
||||
a third party (for example, Meta).
|
||||
|
||||
System validation
|
||||
=================
|
||||
|
||||
If you have already validated your system settings, skip this step. Otherwise,
|
||||
complete the :ref:`system validation and optimization steps <train-a-model-system-validation>`
|
||||
to set up your system before starting training.
|
||||
|
||||
Disable NUMA auto-balancing
|
||||
---------------------------
|
||||
|
||||
Generally, application performance can benefit from disabling NUMA auto-balancing. However,
|
||||
it might be detrimental to performance with certain types of workloads.
|
||||
|
||||
Run the command ``cat /proc/sys/kernel/numa_balancing`` to check your current NUMA (Non-Uniform
|
||||
Memory Access) settings. Output ``0`` indicates this setting is disabled. If there is no output or
|
||||
the output is ``1``, run the following command to disable NUMA auto-balancing.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
|
||||
|
||||
See :ref:`mi300x-disable-numa` for more information.
|
||||
|
||||
Environment setup
|
||||
=================
|
||||
|
||||
This Docker image is optimized for specific model configurations outlined
|
||||
below. Performance can vary for other training workloads, as AMD
|
||||
doesn’t validate configurations and run conditions outside those described.
|
||||
|
||||
Download the Docker image
|
||||
-------------------------
|
||||
|
||||
1. Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull rocm/pytorch-training:v25.3
|
||||
|
||||
2. Run the Docker container.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 64G --name training_env rocm/pytorch-training:v25.3
|
||||
|
||||
3. Use these commands if you exit the ``training_env`` container and need to return to it.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker start training_env
|
||||
docker exec -it training_env bash
|
||||
|
||||
4. In the Docker container, clone the `<https://github.com/ROCm/MAD>`__ repository and navigate to the benchmark scripts directory.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD/scripts/pytorch-train
|
||||
|
||||
Prepare training datasets and dependencies
|
||||
------------------------------------------
|
||||
|
||||
The following benchmarking examples may require downloading models and datasets
|
||||
from Hugging Face. To ensure successful access to gated repos, set your
|
||||
``HF_TOKEN``.
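
For example, you can export the token in your shell before running the setup script. The value
shown is a placeholder; substitute your own Hugging Face access token.

.. code-block:: shell

export HF_TOKEN=<your Hugging Face access token>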
|
||||
|
||||
Run the setup script to install libraries and datasets needed for benchmarking.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_setup.sh
|
||||
|
||||
``pytorch_benchmark_setup.sh`` installs the following libraries:
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Library
|
||||
- Benchmark model
|
||||
- Reference
|
||||
|
||||
* - ``accelerate``
|
||||
- Llama 3.1 8B, FLUX
|
||||
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
|
||||
|
||||
* - ``datasets``
|
||||
- Llama 3.1 8B, 70B, FLUX
|
||||
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
|
||||
|
||||
* - ``torchdata``
|
||||
- Llama 3.1 70B
|
||||
- `TorchData <https://pytorch.org/data/beta/index.html>`_
|
||||
|
||||
* - ``tomli``
|
||||
- Llama 3.1 70B
|
||||
- `Tomli <https://pypi.org/project/tomli/>`_
|
||||
|
||||
* - ``tiktoken``
|
||||
- Llama 3.1 70B
|
||||
- `tiktoken <https://github.com/openai/tiktoken>`_
|
||||
|
||||
* - ``blobfile``
|
||||
- Llama 3.1 70B
|
||||
- `blobfile <https://pypi.org/project/blobfile/>`_
|
||||
|
||||
* - ``tabulate``
|
||||
- Llama 3.1 70B
|
||||
- `tabulate <https://pypi.org/project/tabulate/>`_
|
||||
|
||||
* - ``wandb``
|
||||
- Llama 3.1 70B
|
||||
- `Weights & Biases <https://github.com/wandb/wandb>`_
|
||||
|
||||
* - ``sentencepiece``
|
||||
- Llama 3.1 70B, FLUX
|
||||
- `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
|
||||
|
||||
* - ``tensorboard``
|
||||
- Llama 3.1 70B, FLUX
|
||||
- `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
|
||||
|
||||
* - ``csvkit``
|
||||
- FLUX
|
||||
- `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1
|
||||
|
||||
* - ``deepspeed``
|
||||
- FLUX
|
||||
- `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2
|
||||
|
||||
* - ``diffusers``
|
||||
- FLUX
|
||||
- `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0
|
||||
|
||||
* - ``GitPython``
|
||||
- FLUX
|
||||
- `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44
|
||||
|
||||
* - ``opencv-python-headless``
|
||||
- FLUX
|
||||
- `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84
|
||||
|
||||
* - ``peft``
|
||||
- FLUX
|
||||
- `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0
|
||||
|
||||
* - ``protobuf``
|
||||
- FLUX
|
||||
- `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2
|
||||
|
||||
* - ``pytest``
|
||||
- FLUX
|
||||
- `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4
|
||||
|
||||
* - ``python-dotenv``
|
||||
- FLUX
|
||||
- `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1
|
||||
|
||||
* - ``seaborn``
|
||||
- FLUX
|
||||
- `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2
|
||||
|
||||
* - ``transformers``
|
||||
- FLUX
|
||||
- `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0
|
||||
|
||||
``pytorch_benchmark_setup.sh`` downloads the following models from Hugging Face:
|
||||
|
||||
* `meta-llama/Llama-3.1-70B-Instruct <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_
|
||||
|
||||
* `black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
|
||||
|
||||
Along with the following datasets:
|
||||
|
||||
* `WikiText <https://huggingface.co/datasets/Salesforce/wikitext>`_
|
||||
|
||||
* `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_
|
||||
|
||||
Start training on AMD Instinct accelerators
|
||||
===========================================
|
||||
|
||||
The prebuilt PyTorch with ROCm training environment allows users to quickly validate
|
||||
system performance, conduct training benchmarks, and achieve superior
|
||||
performance for models like Llama 3.1 and Llama 2. This container should not be
|
||||
expected to provide generalized performance across all training workloads. You
|
||||
can expect the container to perform in the model configurations described in
|
||||
the following section, but other configurations are not validated by AMD.
|
||||
|
||||
Use the following instructions to set up the environment, configure the script
|
||||
to train models, and reproduce the benchmark results on MI300X series
|
||||
accelerators with the AMD PyTorch training Docker image.
|
||||
|
||||
Once your environment is set up, use the following commands and examples to start benchmarking.
|
||||
|
||||
Pretraining
|
||||
-----------
|
||||
|
||||
To start the pretraining benchmark, use the following command with the
|
||||
appropriate options. See the following list of options and their descriptions.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -t $training_mode -m $model_repo -p $datatype -s $sequence_length
|
||||
|
||||
Options and available models
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Name
|
||||
- Options
|
||||
- Description
|
||||
|
||||
* - ``$training_mode``
|
||||
- ``pretrain``
|
||||
- Benchmark pretraining
|
||||
|
||||
* -
|
||||
- ``finetune_fw``
|
||||
- Benchmark full weight fine-tuning (Llama 3.1 70B with BF16)
|
||||
|
||||
* -
|
||||
- ``finetune_lora``
|
||||
- Benchmark LoRA fine-tuning (Llama 3.1 70B with BF16)
|
||||
|
||||
* - ``$datatype``
|
||||
- FP8 or BF16
|
||||
- Only Llama 3.1 8B supports FP8 precision.
|
||||
|
||||
* - ``$model_repo``
|
||||
- Llama-3.1-8B
|
||||
- `Llama 3.1 8B <https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct>`_
|
||||
|
||||
* -
|
||||
- Llama-3.1-70B
|
||||
- `Llama 3.1 70B <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_
|
||||
|
||||
* -
|
||||
- Flux
|
||||
- `FLUX.1 [dev] <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
|
||||
|
||||
Fine-tuning
|
||||
-----------
|
||||
|
||||
To start the fine-tuning benchmark, use the following command. It runs the benchmarking example of Llama 3.1 70B
|
||||
with the WikiText dataset using the AMD fork of `torchtune <https://github.com/AMD-AIG-AIMA/torchtune>`_.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -t {finetune_fw, finetune_lora} -p BF16 -m Llama-3.1-70B
|
||||
|
||||
Benchmarking examples
|
||||
---------------------
|
||||
|
||||
Here are some examples of how to use the command.
|
||||
|
||||
* Example 1: Llama 3.1 70B with BF16 precision using `torchtitan <https://github.com/ROCm/torchtitan>`_.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -t pretrain -p BF16 -m Llama-3.1-70B -s 8192
|
||||
|
||||
* Example 2: Llama 3.1 8B with FP8 precision using Transformer Engine (TE) and Hugging Face Accelerator.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -t pretrain -p FP8 -m Llama-3.1-8B -s 8192
|
||||
|
||||
* Example 3: FLUX.1-dev with BF16 precision using FluxBenchmark.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -t pretrain -p BF16 -m Flux
|
||||
|
||||
* Example 4: Torchtune full weight fine-tuning with Llama 3.1 70B
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -t finetune_fw -p BF16 -m Llama-3.1-70B
|
||||
|
||||
* Example 5: Torchtune LoRA fine-tuning with Llama 3.1 70B
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -t finetune_lora -p BF16 -m Llama-3.1-70B
|
||||
@@ -19,6 +19,10 @@ training, fine-tuning, and inference. It leverages popular machine learning fram
|
||||
|
||||
In this guide, you'll learn about:
|
||||
|
||||
- :doc:`Training a model <train-a-model>`
|
||||
- Training a model
|
||||
|
||||
- :doc:`Scale model training <scale-model-training>`
|
||||
- :doc:`Train a model with Megatron-LM <benchmark-docker/megatron-lm>`
|
||||
|
||||
- :doc:`Train a model with PyTorch <benchmark-docker/pytorch-training>`
|
||||
|
||||
- :doc:`Scaling model training <scale-model-training>`
|
||||
|
||||
@@ -0,0 +1,130 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: Prerequisite system validation before using ROCm for AI.
|
||||
:keywords: ROCm, AI, LLM, train, megatron, Llama, tutorial, docker, torch, pytorch, jax
|
||||
|
||||
.. _train-a-model-system-validation:
|
||||
|
||||
**********************************************
|
||||
Prerequisite system validation before training
|
||||
**********************************************
|
||||
|
||||
Complete the following system validation and optimization steps to set up your system before starting training.
|
||||
|
||||
Disable NUMA auto-balancing
|
||||
---------------------------
|
||||
|
||||
Generally, application performance can benefit from disabling NUMA auto-balancing. However,
|
||||
it might be detrimental to performance with certain types of workloads.
|
||||
|
||||
Run the command ``cat /proc/sys/kernel/numa_balancing`` to check your current NUMA (Non-Uniform
|
||||
Memory Access) settings. Output ``0`` indicates this setting is disabled. If there is no output or
|
||||
the output is ``1``, run the following command to disable NUMA auto-balancing.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
|
||||
|
||||
See :ref:`mi300x-disable-numa` for more information.
|
||||
|
||||
Hardware verification with ROCm
|
||||
-------------------------------
|
||||
|
||||
Use the command ``rocm-smi --setperfdeterminism 1900`` to set the max clock speed up to 1900 MHz
|
||||
instead of the default 2100 MHz. This can reduce the chance of a PCC event lowering the attainable
|
||||
GPU clocks. This setting will not be required for new IFWI releases with the production PRC feature.
|
||||
You can restore this setting to its default value with the ``rocm-smi -r`` command.
|
||||
|
||||
Run the command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
rocm-smi --setperfdeterminism 1900
|
||||
|
||||
See :ref:`mi300x-hardware-verification-with-rocm` for more information.
|
||||
|
||||
RCCL Bandwidth Test for multi-node setups
|
||||
-----------------------------------------
|
||||
|
||||
ROCm Collective Communications Library (RCCL) is a standalone library of standard collective communication
|
||||
routines for GPUs. See the :doc:`RCCL documentation <rccl:index>` for more information. Before starting
|
||||
pretraining, running a RCCL bandwidth test helps ensure that the multi-GPU or multi-node setup is optimized
|
||||
for efficient distributed training.
|
||||
|
||||
Running the RCCL bandwidth test helps verify that:
|
||||
|
||||
- The GPUs can communicate across nodes or within a single node.
|
||||
|
||||
- The interconnect (such as InfiniBand, Ethernet, or Infinity Fabric) is functioning as expected and
|
||||
provides adequate bandwidth for communication.
|
||||
|
||||
- There are no hardware setup or cabling issues that could affect communication between GPUs.
|
||||
|
||||
Tuning and optimizing hyperparameters
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
In distributed training, specific hyperparameters related to distributed communication can be tuned based on
|
||||
the results of the RCCL bandwidth test. These variables are already set in the Docker image:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# force all RCCL streams to be high priority
|
||||
export TORCH_NCCL_HIGH_PRIORITY=1
|
||||
|
||||
# specify which RDMA interfaces to use for communication
|
||||
export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
|
||||
|
||||
# define the Global ID index used in RoCE mode
|
||||
export NCCL_IB_GID_INDEX=3
|
||||
|
||||
# avoid data corruption/mismatch issue that existed in past releases
|
||||
export RCCL_MSCCL_ENABLE=0
|
||||
|
||||
Running the RCCL Bandwidth Test
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
It's recommended that you run the RCCL bandwidth test before launching training to confirm that
system communication performance is sufficient. RCCL is not included in the AMD Megatron-LM Docker
|
||||
image; follow the instructions in `<https://github.com/ROCm/rccl-tests>`__ to get started.
|
||||
See :ref:`mi300x-rccl` for more information.
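
As a minimal sketch of building the tests (this assumes ROCm is installed under ``/opt/rocm``;
see the rccl-tests README for the authoritative build steps and MPI options):

.. code-block:: shell

git clone https://github.com/ROCm/rccl-tests.git
cd rccl-tests
make   # binaries, including all_reduce_perf, are placed under ./build
# for the MPI-launched runs below, build with: make MPI=1 MPI_HOME=<path to your MPI installation>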
|
||||
|
||||
Run on 8 GPUs (``-g 8``), scanning from 8 bytes to 10 GB:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./build/all_reduce_perf -b 8 -e 10G -f 2 -g 8
|
||||
|
||||
.. image:: ../../../data/how-to/rocm-for-ai/rccl-tests-8-gpu.png
|
||||
:width: 800
|
||||
|
||||
Using one MPI process per GPU and ``-g 1`` for performance-oriented runs on both single-node and multi-node is
|
||||
recommended. So, a run on 8 GPUs looks something like:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
mpirun -np 8 --bind-to numa ./build/all_reduce_perf -b 8 -e 10G -f 2 -g 1
|
||||
|
||||
.. image:: ../../../data/how-to/rocm-for-ai/rccl-tests-1-mpi-process-per-gpu.png
|
||||
:width: 800
|
||||
|
||||
Running with one MPI process per GPU ensures a one-to-one mapping for CPUs and GPUs, which can be beneficial
|
||||
for smaller message sizes. This better represents the real-world use of RCCL in deep learning frameworks like
|
||||
PyTorch and TensorFlow.
|
||||
|
||||
Use the following script to run the RCCL test for four MI300X GPU nodes. Modify paths and node addresses as needed.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
/home/$USER/ompi_for_gpu/ompi/bin/mpirun -np 32 -H tw022:8,tw024:8,tw010:8,tw015:8 \
|
||||
--mca pml ucx \
|
||||
--mca btl ^openib \
|
||||
-x NCCL_SOCKET_IFNAME=ens50f0np0 \
|
||||
-x NCCL_IB_HCA=rdma0:1,rdma1:1,rdma2:1,rdma3:1,rdma4:1,rdma5:1,rdma6:1,rdma7:1 \
|
||||
-x NCCL_IB_GID_INDEX=3 \
|
||||
-x NCCL_MIN_NCHANNELS=40 \
|
||||
-x NCCL_DEBUG=version \
|
||||
$HOME/rccl-tests/build/all_reduce_perf -b 8 -e 8g -f 2 -g 1
|
||||
|
||||
.. image:: ../../../data/how-to/rocm-for-ai/rccl-tests-4-mi300x-gpu-nodes.png
|
||||
:width: 800
|
||||
@@ -1,503 +0,0 @@
|
||||
.. meta::
|
||||
:description: How to train a model using ROCm Megatron-LM
|
||||
:keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch
|
||||
|
||||
**************************************
|
||||
Training a model with ROCm Megatron-LM
|
||||
**************************************
|
||||
|
||||
.. _amd-megatron-lm:
|
||||
|
||||
The ROCm Megatron-LM framework is a specialized fork of the robust Megatron-LM, designed to
|
||||
enable efficient training of large-scale language models on AMD GPUs. By leveraging AMD Instinct™ MI300X
|
||||
accelerators, AMD Megatron-LM delivers enhanced scalability, performance, and resource utilization for AI
|
||||
workloads. It is purpose-built to :ref:`support models <amd-megatron-lm-model-support>`
|
||||
like Meta's Llama 2, Llama 3, and Llama 3.1, enabling developers to train next-generation AI models with greater
|
||||
efficiency. See the GitHub repository at `<https://github.com/ROCm/Megatron-LM>`__.
|
||||
|
||||
For ease of use, AMD provides a ready-to-use Docker image for MI300X accelerators containing essential
|
||||
components, including PyTorch, PyTorch Lightning, ROCm libraries, and Megatron-LM utilities. It contains the
|
||||
following software to accelerate training workloads:
|
||||
|
||||
+--------------------------+--------------------------------+
|
||||
| Software component | Version |
|
||||
+==========================+================================+
|
||||
| ROCm | 6.1 |
|
||||
+--------------------------+--------------------------------+
|
||||
| PyTorch | 2.4.0 |
|
||||
+--------------------------+--------------------------------+
|
||||
| PyTorch Lightning | 2.4.0 |
|
||||
+--------------------------+--------------------------------+
|
||||
| Megatron Core | 0.9.0 |
|
||||
+--------------------------+--------------------------------+
|
||||
| Transformer Engine | 1.5.0 |
|
||||
+--------------------------+--------------------------------+
|
||||
| Flash Attention | v2.6 |
|
||||
+--------------------------+--------------------------------+
|
||||
| Transformers | 4.44.0 |
|
||||
+--------------------------+--------------------------------+
|
||||
|
||||
Supported features and models
|
||||
=============================
|
||||
|
||||
Megatron-LM provides the following key features to train large language models efficiently:
|
||||
|
||||
- Transformer Engine (TE)
|
||||
|
||||
- APEX
|
||||
|
||||
- GEMM tuning
|
||||
|
||||
- Torch.compile
|
||||
|
||||
- 3D parallelism: TP + SP + CP
|
||||
|
||||
- Distributed optimizer
|
||||
|
||||
- Flash Attention (FA) 2
|
||||
|
||||
- Fused kernels
|
||||
|
||||
- Pre-training
|
||||
|
||||
.. _amd-megatron-lm-model-support:
|
||||
|
||||
The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.
|
||||
|
||||
* Llama 2 7B
|
||||
|
||||
* Llama 2 70B
|
||||
|
||||
* Llama 3 8B
|
||||
|
||||
* Llama 3 70B
|
||||
|
||||
* Llama 3.1 8B
|
||||
|
||||
* Llama 3.1 70B
|
||||
|
||||
Prerequisite system validation steps
|
||||
====================================
|
||||
|
||||
Complete the following system validation and optimization steps to set up your system before starting training.
|
||||
|
||||
Disable NUMA auto-balancing
|
||||
---------------------------
|
||||
|
||||
Generally, application performance can benefit from disabling NUMA auto-balancing. However,
|
||||
it might be detrimental to performance with certain types of workloads.
|
||||
|
||||
Run the command ``cat /proc/sys/kernel/numa_balancing`` to check your current NUMA (Non-Uniform
|
||||
Memory Access) settings. Output ``0`` indicates this setting is disabled. If there is no output or
|
||||
the output is ``1``, run the following command to disable NUMA auto-balancing.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
|
||||
|
||||
See :ref:`mi300x-disable-numa` for more information.
|
||||
|
||||
Hardware verification with ROCm
|
||||
-------------------------------
|
||||
|
||||
Use the command ``rocm-smi --setperfdeterminism 1900`` to set the max clock speed up to 1900 MHz
|
||||
instead of the default 2100 MHz. This can reduce the chance of a PCC event lowering the attainable
|
||||
GPU clocks. This setting will not be required for new IFWI releases with the production PRC feature.
|
||||
You can restore this setting to its default value with the ``rocm-smi -r`` command.
|
||||
|
||||
Run the command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
rocm-smi --setperfdeterminism 1900
|
||||
|
||||
See :ref:`mi300x-hardware-verification-with-rocm` for more information.
|
||||
|
||||
RCCL Bandwidth Test
|
||||
-------------------
|
||||
|
||||
ROCm Collective Communications Library (RCCL) is a standalone library of standard collective communication
|
||||
routines for GPUs. See the :doc:`RCCL documentation <rccl:index>` for more information. Before starting
|
||||
pre-training, running a RCCL bandwidth test helps ensure that the multi-GPU or multi-node setup is optimized
|
||||
for efficient distributed training.
|
||||
|
||||
Running the RCCL bandwidth test helps verify that:
|
||||
|
||||
- The GPUs can communicate across nodes or within a single node.
|
||||
|
||||
- The interconnect (such as InfiniBand, Ethernet, or Infinite fabric) is functioning as expected and
|
||||
provides adequate bandwidth for communication.
|
||||
|
||||
- No hardware setup or cabling issues could affect the communication between GPUs
|
||||
|
||||
Tuning and optimizing hyperparameters
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
In distributed training, specific hyperparameters related to distributed communication can be tuned based on
|
||||
the results of the RCCL bandwidth test. These variables are already set in the Docker image:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# force all RCCL streams to be high priority
|
||||
export TORCH_NCCL_HIGH_PRIORITY=1
|
||||
|
||||
# specify which RDMA interfaces to use for communication
|
||||
export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
|
||||
|
||||
# define the Global ID index used in RoCE mode
|
||||
export NCCL_IB_GID_INDEX=3
|
||||
|
||||
# avoid data corruption/mismatch issue that existed in past releases
|
||||
export RCCL_MSCCL_ENABLE=0
|
||||
|
||||
Running the RCCL Bandwidth Test
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
It's recommended you run the RCCL bandwidth test before launching training. It ensures system
|
||||
performance is sufficient to launch training. RCCL is not included in the AMD Megatron-LM Docker
|
||||
image; follow the instructions in `<https://github.com/ROCm/rccl-tests>`__ to get started.
|
||||
See :ref:`mi300x-rccl` for more information.
|
||||
|
||||
Run on 8 GPUs (``-g 8``), scanning from 8 bytes to 10 GB:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./build/all_reduce_perf -b 8 -e 10G -f 2 -g 8
|
||||
|
||||
.. image:: ../../../data/how-to/rocm-for-ai/rccl-tests-8-gpu.png
|
||||
:width: 800
|
||||
|
||||
Using one MPI process per GPU and ``-g 1`` for performance-oriented runs on both single-node and multi-node is
|
||||
recommended. So, a run on 8 GPUs looks something like:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
mpirun -np 8 --bind-to numa ./build/all_reduce_perf -b 8 -e 10G -f 2 -g 1
|
||||
|
||||
.. image:: ../../../data/how-to/rocm-for-ai/rccl-tests-1-mpi-process-per-gpu.png
|
||||
:width: 800
|
||||
|
||||
Running with one MPI process per GPU ensures a one-to-one mapping for CPUs and GPUs, which can be beneficial
|
||||
for smaller message sizes. This better represents the real-world use of RCCL in deep learning frameworks like
|
||||
PyTorch and TensorFlow.
|
||||
|
||||
Use the following script to run the RCCL test for four MI300X GPU nodes. Modify paths and node addresses as needed.
|
||||
|
||||
.. code-block::
|
||||
|
||||
/home/$USER/ompi_for_gpu/ompi/bin/mpirun -np 32 -H tw022:8,tw024:8,tw010:8, tw015:8 \
|
||||
--mca pml ucx \
|
||||
--mca btl ^openib \
|
||||
-x NCCL_SOCKET_IFNAME=ens50f0np0 \
|
||||
-x NCCL_IB_HCA=rdma0:1,rdma1:1,rdma2:1,rdma3:1,rdma4:1,rdma5:1,rdma6:1,rdma7:1 \
|
||||
-x NCCL_IB_GID_INDEX=3 \
|
||||
-x NCCL_MIN_NCHANNELS=40 \
|
||||
-x NCCL_DEBUG=version \
|
||||
$HOME/rccl-tests/build/all_reduce_perf -b 8 -e 8g -f 2 -g 1
|
||||
|
||||
.. image:: ../../../data/how-to/rocm-for-ai/rccl-tests-4-mi300x-gpu-nodes.png
|
||||
:width: 800
|
||||
|
||||
.. _mi300x-amd-megatron-lm-training:
|
||||
|
||||
Start training on MI300X accelerators
|
||||
=====================================
|
||||
|
||||
The pre-built ROCm Megatron-LM environment allows users to quickly validate system performance, conduct
|
||||
training benchmarks, and achieve superior performance for models like Llama 2 and Llama 3.1.
|
||||
|
||||
Use the following instructions to set up the environment, configure the script to train models, and
|
||||
reproduce the benchmark results on the MI300X accelerators with the AMD Megatron-LM Docker
|
||||
image.
|
||||
|
||||
.. _amd-megatron-lm-requirements:
|
||||
|
||||
Download the Docker image and required packages
|
||||
-----------------------------------------------
|
||||
|
||||
1. Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull rocm/megatron-lm:24.12-dev
|
||||
|
||||
2. Launch the Docker container.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $CACHE_DIR:/root/.cache --name megatron-dev-env rocm/megatron-lm:24.12-dev /bin/bash
|
||||
|
||||
3. Clone the ROCm Megatron-LM repository to a local directory and install the required packages on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/Megatron-LM
|
||||
cd Megatron-LM
|
||||
|
||||
.. note::
|
||||
|
||||
This release is validated with ``ROCm/Megatron-LM`` commit `bb93ccb <https://github.com/ROCm/Megatron-LM/tree/bb93ccbfeae6363c67b361a97a27c74ab86e7e92>`_.
|
||||
Checking out this specific commit is recommended for a stable and reproducible environment.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git checkout bb93ccbfeae6363c67b361a97a27c74ab86e7e92
|
||||
|
||||
Prepare training datasets
|
||||
-------------------------
|
||||
|
||||
If you already have the preprocessed data, you can skip this section.
|
||||
|
||||
Use the following command to process datasets. We use GPT data as an example. You may change the merge table, use an
|
||||
end-of-document token, remove sentence splitting, and use the tokenizer type.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
python tools/preprocess_data.py \
|
||||
--input my-corpus.json \
|
||||
--output-prefix my-gpt2 \
|
||||
--vocab-file gpt2-vocab.json \
|
||||
--tokenizer-type GPT2BPETokenizer \
|
||||
--merge-file gpt2-merges.txt \
|
||||
--append-eod
|
||||
|
||||
In this case, the automatically generated output files are named ``my-gpt2_text_document.bin`` and
|
||||
``my-gpt2_text_document.idx``.
|
||||
|
||||
.. image:: ../../../data/how-to/rocm-for-ai/prep-training-datasets-my-gpt2-text-document.png
|
||||
:width: 800
|
||||
|
||||
.. _amd-megatron-lm-environment-setup:
|
||||
|
||||
Environment setup
|
||||
-----------------
|
||||
|
||||
In the ``examples/llama`` directory of Megatron-LM, if you're working with Llama 2 7B or Llama 2 70 B, use the
|
||||
``train_llama2.sh`` configuration script. Likewise, if you're working with Llama 3 or Llama 3.1, then use
|
||||
``train_llama3.sh`` and update the configuration script accordingly.
|
||||
|
||||
Network interface
|
||||
^^^^^^^^^^^^^^^^^
|
||||
|
||||
To avoid connectivity issues, ensure the correct network interface is set in your training scripts.
|
||||
|
||||
1. Run the following command to find the active network interface on your system.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
ip a
|
||||
|
||||
2. Update the ``NCCL_SOCKET_IFNAME`` and ``GLOO_SOCKET_IFNAME`` variables with your system’s network interface. For
|
||||
example:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export NCCL_SOCKET_IFNAME=ens50f0np0
|
||||
|
||||
export GLOO_SOCKET_IFNAME=ens50f0np0
|
||||
|
||||
Dataset options
|
||||
^^^^^^^^^^^^^^^
|
||||
|
||||
You can use either mock data or real data for training.
|
||||
|
||||
* If you're using a real dataset, update the ``DATA_PATH`` variable to point to the location of your dataset.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
DATA_DIR="/root/.cache/data" # Change to where your dataset is stored
|
||||
|
||||
DATA_PATH=${DATA_DIR}/bookcorpus_text_sentence
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
--data-path $DATA_PATH
|
||||
|
||||
Ensure that the files are accessible inside the Docker container.
|
||||
|
||||
* Mock data can be useful for testing and validation. If you're using mock data, replace ``--data-path $DATA_PATH`` with the ``--mock-data`` option.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
--mock-data
|
||||
|
||||
Tokenizer
|
||||
^^^^^^^^^
|
||||
|
||||
Tokenization is the process of converting raw text into tokens that can be processed by the model. For Llama
|
||||
models, this typically involves sub-word tokenization, where words are broken down into smaller units based on
|
||||
a fixed vocabulary. The tokenizer is trained along with the model on a large corpus of text, and it learns a
|
||||
fixed vocabulary that can represent a wide range of text from different domains. This allows Llama models to
|
||||
handle a variety of input sequences, including unseen words or domain-specific terms.
|
||||
|
||||
To train any of the Llama 2 models that this Docker image supports, use the ``Llama2Tokenizer``.
|
||||
|
||||
To train any of Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``.
|
||||
Set the Hugging Face model link in the ``TOKENIZER_MODEL`` variable.
|
||||
|
||||
For example, if you're using the Llama 3.1 8B model:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
TOKENIZER_MODEL=meta-llama/Llama-3.1-8B
|
||||
|
||||
Run benchmark tests
|
||||
-------------------
|
||||
|
||||
.. note::
|
||||
|
||||
If you're running **multi node training**, update the following environment variables. They can
|
||||
also be passed as command line arguments.
|
||||
|
||||
* Change ``localhost`` to the master node's hostname:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
MASTER_ADDR="${MASTER_ADDR:-localhost}"
|
||||
|
||||
* Set the number of nodes you want to train on (for instance, ``2``, ``4``, ``8``):
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
NNODES="${NNODES:-1}"
|
||||
|
||||
* Set the rank of each node (0 for master, 1 for the first worker node, and so on):
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
NODE_RANK="${NODE_RANK:-0}"
|
||||
|
||||
* Use this command to run a performance benchmark test of any of the Llama 2 models that this Docker image supports (see :ref:`variables <amd-megatron-lm-benchmark-test-vars>`).
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
{variables} bash examples/llama/train_llama2.sh
|
||||
|
||||
* Use this command to run a performance benchmark test of any of the Llama 3 and Llama 3.1 models that this Docker image supports (see :ref:`variables <amd-megatron-lm-benchmark-test-vars>`).
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
{variables} bash examples/llama/train_llama3.sh
|
||||
|
||||
.. _amd-megatron-lm-benchmark-test-vars:
|
||||
|
||||
The benchmark tests support the same set of variables:
|
||||
|
||||
+--------------------------+-----------------------+-----------------------+
|
||||
| Name | Options | Description |
|
||||
+==========================+=======================+=======================+
|
||||
| ``TEE_OUTPUT`` | 0 or 1 | 0: disable training |
|
||||
| | | log |
|
||||
| | | |
|
||||
| | | 1: enable training |
|
||||
| | | log |
|
||||
+--------------------------+-----------------------+-----------------------+
|
||||
| ``MBS`` | | Micro batch size |
|
||||
+--------------------------+-----------------------+-----------------------+
|
||||
| ``BS`` | | Batch size |
|
||||
+--------------------------+-----------------------+-----------------------+
|
||||
| ``TP`` | 1, 2, 4, 8 | Tensor parallel |
|
||||
+--------------------------+-----------------------+-----------------------+
|
||||
| ``TE_FP8`` | 0 or 1 | Datatype. |
|
||||
| | | If it is set to 1, |
|
||||
| | | FP8. |
|
||||
| | | |
|
||||
| | | If it is set to 0. |
|
||||
| | | BP16 |
|
||||
+--------------------------+-----------------------+-----------------------+
|
||||
| ``NO_TORCH_COMPILE`` | 0 or 1 | If it is set to 1, |
|
||||
| | | enable torch.compile. |
|
||||
| | | |
|
||||
| | | If it is set to 0. |
|
||||
| | | Disable torch.compile |
|
||||
| | | (default) |
|
||||
+--------------------------+-----------------------+-----------------------+
|
||||
| ``SEQ_LENGTH`` | | Input sequence length |
|
||||
+--------------------------+-----------------------+-----------------------+
|
||||
| ``GEMM_TUNING`` | 0 or 1 | If it is set to 1, |
|
||||
| | | enable gemm tuning. |
|
||||
| | | |
|
||||
| | | If it is set to 0, |
|
||||
| | | disable gemm tuning |
|
||||
+--------------------------+-----------------------+-----------------------+
|
||||
| ``USE_FLASH_ATTN`` | 0 or 1 | 0: disable flash |
|
||||
| | | attention |
|
||||
| | | |
|
||||
| | | 1: enable flash |
|
||||
| | | attention |
|
||||
+--------------------------+-----------------------+-----------------------+
|
||||
| ``ENABLE_PROFILING`` | 0 or 1 | 0: disable torch |
|
||||
| | | profiling |
|
||||
| | | |
|
||||
| | | 1: enable torch |
|
||||
| | | profiling |
|
||||
+--------------------------+-----------------------+-----------------------+
|
||||
| ``MODEL_SIZE`` | | The size of the mode: |
|
||||
| | | 7B/70B, etc. |
|
||||
+--------------------------+-----------------------+-----------------------+
|
||||
| ``TOTAL_ITERS`` | | Total number of |
|
||||
| | | iterations |
|
||||
+--------------------------+-----------------------+-----------------------+
|
||||
| ``transformer-impl`` | transformer_engine or | Enable transformer |
|
||||
| | local | engine by default |
|
||||
+--------------------------+-----------------------+-----------------------+
|
||||
|
||||
Benchmarking examples
|
||||
^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Single node training
|
||||
:sync: single
|
||||
|
||||
Use this command to run training with Llama 2 7B model on a single node. You can specify MBS, BS, FP,
|
||||
datatype, and so on.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
TEE_OUTPUT=1 MBS=5 BS=120 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1
|
||||
SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
|
||||
|
||||
You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup>`.
|
||||
|
||||
See the sample output:
|
||||
|
||||
.. image:: ../../../data/how-to/rocm-for-ai/llama2-7b-training-log-sample.png
|
||||
:width: 800
|
||||
|
||||
   .. tab-item:: Multi node training
      :sync: multi

      Launch the Docker container on each node.

      In this example, training runs with the Llama 2 7B model on two nodes, again with specific MBS,
      BS, TP, datatype, and so on.

      On the master node:

      .. code-block:: bash

         TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 \
         SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh

      On the worker node:

      .. code-block:: bash

         TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 \
         SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh

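      Each node also has to be able to reach the master node. How the nodes are addressed depends on
      the version of the launch script; as a rough sketch only, assuming it follows the usual PyTorch
      distributed convention (these variable names are an assumption, not taken from this guide), the
      per-node settings might look like:

      .. code-block:: bash

         # Assumed PyTorch-style rendezvous settings -- verify against train_llama2.sh.
         export MASTER_ADDR=<master-node-address>  # hypothetical placeholder
         export MASTER_PORT=23731                  # hypothetical placeholder
         export NNODES=2                           # total number of nodes
         export NODE_RANK=0                        # 0 on the master node, 1 on the worker node
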
      You can find the training logs at the location defined in ``$TRAIN_LOG`` in the
      :ref:`configuration script <amd-megatron-lm-environment-setup>`.

      Sample output for 2-node training:

      Master node:

      .. image:: ../../../data/how-to/rocm-for-ai/2-node-training-master.png
         :width: 800

      Worker node:

      .. image:: ../../../data/how-to/rocm-for-ai/2-node-training-worker.png
         :width: 800

@@ -21,8 +21,6 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
  - Model
  - Architecture
  - LLVM target name
  - Device Major version
  - Device Minor version
  - VRAM (GiB)
  - Compute Units
  - Wavefront Size
@@ -34,12 +32,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
  - L1 Instruction Cache (KiB)
  - VGPR File (KiB)
  - SGPR File (KiB)
  - GFXIP Major version
  - GFXIP Minor version
* - MI325X
  - CDNA3
  - gfx942
  - 9
  - 4
  - 256
  - 304 (38 per XCD)
  - 64
@@ -51,12 +49,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
  - 64 per 2 CUs
  - 512
  - 12.5
  - 9
  - 4
* - MI300X
  - CDNA3
  - gfx942
  - 9
  - 4
  - 192
  - 304 (38 per XCD)
  - 64
@@ -68,12 +66,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
  - 64 per 2 CUs
  - 512
  - 12.5
  - 9
  - 4
* - MI300A
  - CDNA3
  - gfx942
  - 9
  - 4
  - 128
  - 228 (38 per XCD)
  - 64
@@ -85,12 +83,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
  - 64 per 2 CUs
  - 512
  - 12.5
  - 9
  - 4
* - MI250X
  - CDNA2
  - gfx90a
  - 9
  - 0
  - 128
  - 220 (110 per GCD)
  - 64
@@ -102,12 +100,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
  - 32 per 2 CUs
  - 512
  - 12.5
  - 9
  - 0
* - MI250
  - CDNA2
  - gfx90a
  - 9
  - 0
  - 128
  - 208 (104 per GCD)
  - 64
@@ -119,12 +117,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
  - 32 per 2 CUs
  - 512
  - 12.5
  - 9
  - 0
* - MI210
  - CDNA2
  - gfx90a
  - 9
  - 0
  - 64
  - 104
  - 64
@@ -136,12 +134,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
  - 32 per 2 CUs
  - 512
  - 12.5
  - 9
  - 0
* - MI100
  - CDNA
  - gfx908
  - 9
  - 0
  - 32
  - 120
  - 64
@@ -153,12 +151,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
  - 32 per 3 CUs
  - 256 VGPR and 256 AccVGPR
  - 12.5
  - 9
  - 0
* - MI60
  - GCN5.1
  - gfx906
  - 9
  - 0
  - 32
  - 64
  - 64
@@ -170,12 +168,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
  - 32 per 3 CUs
  - 256
  - 12.5
  - 9
  - 0
* - MI50 (32GB)
  - GCN5.1
  - gfx906
  - 9
  - 0
  - 32
  - 60
  - 64
@@ -187,12 +185,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
  - 32 per 3 CUs
  - 256
  - 12.5
  - 9
  - 0
* - MI50 (16GB)
  - GCN5.1
  - gfx906
  - 9
  - 0
  - 16
  - 60
  - 64
@@ -204,12 +202,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
  - 32 per 3 CUs
  - 256
  - 12.5
  - 9
  - 0
* - MI25
  - GCN5.0
  - gfx900
  - 9
  - 0
  - 16
  - 64
  - 64
@@ -221,12 +219,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
  - 32 per 3 CUs
  - 256
  - 12.5
  - 9
  - 0
* - MI8
  - GCN3.0
  - gfx803
  - 8
  - 0
  - 4
  - 64
  - 64
@@ -238,12 +236,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
  - 32 per 4 CUs
  - 256
  - 12.5
  - 8
  - 0
* - MI6
  - GCN4.0
  - gfx803
  - 8
  - 0
  - 16
  - 36
  - 64
@@ -255,6 +253,8 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
  - 32 per 4 CUs
  - 256
  - 12.5
  - 8
  - 0

.. tab-item:: AMD Radeon PRO GPUs

@@ -266,8 +266,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
  - Model
  - Architecture
  - LLVM target name
  - Device Major version
  - Device Minor version
  - VRAM (GiB)
  - Compute Units
  - Wavefront Size
@@ -280,12 +279,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
  - L0 Instruction Cache (KiB)
  - VGPR File (KiB)
  - SGPR File (KiB)
  - GFXIP Major version
  - GFXIP Minor version
* - Radeon PRO V710
  - RDNA3
  - gfx1101
  - 11
  - 0
  - 28
  - 54
  - 32
@@ -298,12 +297,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
  - 32
  - 768
  - 16
  - 11
  - 0
* - Radeon PRO W7900 Dual Slot
  - RDNA3
  - gfx1100
  - 11
  - 0
  - 48
  - 96
  - 32
@@ -316,12 +315,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
  - 32
  - 768
  - 16
  - 11
  - 0
* - Radeon PRO W7900
  - RDNA3
  - gfx1100
  - 11
  - 0
  - 48
  - 96
  - 32
@@ -334,12 +333,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
  - 32
  - 768
  - 16
  - 11
  - 0
* - Radeon PRO W7800
  - RDNA3
  - gfx1100
  - 11
  - 0
  - 32
  - 70
  - 32
@@ -352,12 +351,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
  - 32
  - 768
  - 16
  - 11
  - 0
* - Radeon PRO W7700
  - RDNA3
  - gfx1101
  - 11
  - 0
  - 16
  - 48
  - 32
@@ -370,12 +369,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
  - 32
  - 768
  - 16
  - 11
  - 0
* - Radeon PRO W6800
  - RDNA2
  - gfx1030
  - 10
  - 3
  - 32
  - 60
  - 32
@@ -388,12 +387,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
  - 32
  - 512
  - 16
  - 10
  - 3
* - Radeon PRO W6600
  - RDNA2
  - gfx1032
  - 10
  - 3
  - 8
  - 28
  - 32
@@ -406,12 +405,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
  - 32
  - 512
  - 16
  - 10
  - 3
* - Radeon PRO V620
  - RDNA2
  - gfx1030
  - 10
  - 3
  - 32
  - 72
  - 32
@@ -424,12 +423,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
  - 32
  - 512
  - 16
  - 10
  - 3
* - Radeon Pro W5500
  - RDNA
  - gfx1012
  - 10
  - 1
  - 8
  - 22
  - 32
@@ -442,12 +441,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
  - 32
  - 512
  - 20
  - 10
  - 1
* - Radeon Pro VII
  - GCN5.1
  - gfx906
  - 9
  - 0
  - 16
  - 60
  - 64
@@ -460,6 +459,8 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
  - 32 per 3 CUs
  - 256
  - 12.5
  - 9
  - 0

.. tab-item:: AMD Radeon GPUs

@@ -471,8 +472,6 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
  - Model
  - Architecture
  - LLVM target name
  - Device Major version
  - Device Minor version
  - VRAM (GiB)
  - Compute Units
  - Wavefront Size
@@ -485,12 +484,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
  - L0 Instruction Cache (KiB)
  - VGPR File (KiB)
  - SGPR File (KiB)
  - GFXIP Major version
  - GFXIP Minor version
* - Radeon RX 7900 XTX
  - RDNA3
  - gfx1100
  - 11
  - 0
  - 24
  - 96
  - 32
@@ -503,12 +502,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
  - 32
  - 768
  - 16
  - 11
  - 0
* - Radeon RX 7900 XT
  - RDNA3
  - gfx1100
  - 11
  - 0
  - 20
  - 84
  - 32
@@ -521,12 +520,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
  - 32
  - 768
  - 16
  - 11
  - 0
* - Radeon RX 7900 GRE
  - RDNA3
  - gfx1100
  - 11
  - 0
  - 16
  - 80
  - 32
@@ -539,12 +538,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
  - 32
  - 768
  - 16
  - 11
  - 0
* - Radeon RX 7800 XT
  - RDNA3
  - gfx1101
  - 11
  - 0
  - 16
  - 60
  - 32
@@ -557,12 +556,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
  - 32
  - 768
  - 16
  - 11
  - 0
* - Radeon RX 7700 XT
  - RDNA3
  - gfx1101
  - 11
  - 0
  - 12
  - 54
  - 32
@@ -575,12 +574,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
  - 32
  - 768
  - 16
  - 11
  - 0
* - Radeon RX 7600
  - RDNA3
  - gfx1102
  - 11
  - 0
  - 8
  - 32
  - 32
@@ -593,12 +592,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
  - 32
  - 512
  - 16
  - 11
  - 0
* - Radeon RX 6950 XT
  - RDNA2
  - gfx1030
  - 10
  - 3
  - 16
  - 80
  - 32
@@ -611,12 +610,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
  - 32
  - 512
  - 16
  - 10
  - 3
* - Radeon RX 6900 XT
  - RDNA2
  - gfx1030
  - 10
  - 3
  - 16
  - 80
  - 32
@@ -629,12 +628,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
  - 32
  - 512
  - 16
  - 10
  - 3
* - Radeon RX 6800 XT
  - RDNA2
  - gfx1030
  - 10
  - 3
  - 16
  - 72
  - 32
@@ -647,12 +646,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
  - 32
  - 512
  - 16
  - 10
  - 3
* - Radeon RX 6800
  - RDNA2
  - gfx1030
  - 10
  - 3
  - 16
  - 60
  - 32
@@ -665,12 +664,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
  - 32
  - 512
  - 16
  - 10
  - 3
* - Radeon RX 6750 XT
  - RDNA2
  - gfx1031
  - 10
  - 3
  - 12
  - 40
  - 32
@@ -683,12 +682,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
  - 32
  - 512
  - 16
  - 10
  - 3
* - Radeon RX 6700 XT
  - RDNA2
  - gfx1031
  - 10
  - 3
  - 12
  - 40
  - 32
@@ -701,13 +700,13 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
  - 32
  - 512
  - 16
  - 10
  - 3
* - Radeon RX 6700
  - RDNA2
  - gfx1031
  - 10
  - 3
  - 10
  - 36
  - 32
  - 128
@@ -719,12 +718,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
  - 32
  - 512
  - 16
  - 10
  - 3
* - Radeon RX 6650 XT
  - RDNA2
  - gfx1032
  - 10
  - 3
  - 8
  - 32
  - 32
@@ -737,12 +736,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
  - 32
  - 512
  - 16
  - 10
  - 3
* - Radeon RX 6600 XT
  - RDNA2
  - gfx1032
  - 10
  - 3
  - 8
  - 32
  - 32
@@ -755,12 +754,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
  - 32
  - 512
  - 16
  - 10
  - 3
* - Radeon RX 6600
  - RDNA2
  - gfx1032
  - 10
  - 3
  - 8
  - 28
  - 32
@@ -773,12 +772,12 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
  - 32
  - 512
  - 16
  - 10
  - 3
* - Radeon VII
  - GCN5.1
  - gfx906
  - 9
  - 0
  - 16
  - 60
  - 64
@@ -791,6 +790,8 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
  - 32 per 3 CUs
  - 256
  - 12.5
  - 9
  - 0

Glossary
========

@@ -804,18 +805,6 @@ For more information about the terms used, see the

Argument to pass to clang in ``--offload-arch`` to compile code for the given
architecture.

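For example, to build HIP code for one of the targets listed above (a minimal illustration;
``saxpy.cpp`` is a placeholder source file):

.. code-block:: bash

   # Compile a HIP source file for the gfx942 (CDNA3) target.
   hipcc --offload-arch=gfx942 saxpy.cpp -o saxpy
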
**Device major version**

Indicates the core instruction set of the GPU architecture. For example, a value
of 11 would correspond to Navi III (RDNA3).

**Device minor version**

Indicates a particular configuration, feature set, or variation within the group
represented by the device compute version. For example, different models within
the same major version might have varying levels of support for certain features
or optimizations.

**VRAM**

Amount of memory available on the GPU.

@@ -898,6 +887,26 @@ Purpose Vector Registers, used specifically in matrix instructions.

Size of the Scalar General Purpose Register (SGPR) file. Holds data used in
scalar instructions.

**GFXIP**

GFXIP (Graphics IP) is a versioning system used by AMD to identify the GPU
architecture and its instruction set. It helps categorize different generations
of GPUs and their feature sets.

**GFXIP major version**

Defines the GPU's core instruction set and architecture, which determines
compatibility with software stacks such as HIP and OpenCL. For example, a GFXIP
11 major version corresponds to the RDNA 3 (Navi 3x) architecture, influencing
driver support and available compute features.

**GFXIP minor version**

Represents specific variations within a GFXIP major version and affects feature sets,
optimizations, and driver behavior in software stacks such as HIP and OpenCL. Different
GPU models within the same major version can have unique capabilities, impacting
performance and supported instructions.

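To check which LLVM target name the installed GPUs report (and therefore which GFXIP generation
they belong to), the ROCm command-line tools can be queried; the exact output format varies by ROCm
version:

.. code-block:: bash

   # Print the LLVM target name (for example, gfx942) of each visible GPU agent.
   rocm_agent_enumerator

   # Or search the full agent dump for the gfx name.
   rocminfo | grep -i gfx
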
**GCD**

Graphics Compute Die.

@@ -40,11 +40,13 @@ subtrees:
  title: Training
  subtrees:
- entries:
- file: how-to/rocm-for-ai/training/train-a-model.rst
  title: Train a model
- file: how-to/rocm-for-ai/training/benchmark-docker/megatron-lm
  title: Train a model with Megatron-LM
- file: how-to/rocm-for-ai/training/benchmark-docker/pytorch-training
  title: Train a model with PyTorch
- file: how-to/rocm-for-ai/training/scale-model-training.rst
  title: Scale model training

- file: how-to/rocm-for-ai/fine-tuning/index.rst
  title: Fine-tuning LLMs
  subtrees:
@@ -152,7 +154,7 @@ subtrees:
- entries:
- url: https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf
  title: AMD Instinct MI200/CDNA2 ISA
- url: https://www.amd.com/system/files/documents/amd-cdna2-white-paper.pdf
- url: https://www.amd.com/content/dam/amd/en/documents/instinct-business-docs/white-papers/amd-cdna2-white-paper.pdf
  title: White paper
- file: conceptual/gpu-arch/mi100.md
  title: MI100 microarchitecture
@@ -160,7 +162,7 @@ subtrees:
- entries:
- url: https://www.amd.com/system/files/TechDocs/instinct-mi100-cdna1-shader-instruction-set-architecture%C2%A0.pdf
  title: AMD Instinct MI100/CDNA1 ISA
- url: https://www.amd.com/system/files/documents/amd-cdna-whitepaper.pdf
- url: https://www.amd.com/content/dam/amd/en/documents/instinct-business-docs/white-papers/amd-cdna-white-paper.pdf
  title: White paper
- file: conceptual/iommu.rst
  title: Input-Output Memory Management Unit (IOMMU)

@@ -1,3 +1,4 @@
rocm-docs-core==1.15.0
rocm-docs-core==1.17.0
sphinx-reredirects
sphinx-sitemap
sphinxcontrib.datatemplates==0.11.0

@@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.10
# by the following command:
#
# pip-compile requirements.in
# pip-compile docs/sphinx/requirements.in
#
accessible-pygments==0.0.5
# via pydata-sphinx-theme
@@ -43,6 +43,8 @@ debugpy==1.8.12
# via ipykernel
decorator==5.1.1
# via ipython
defusedxml==0.7.1
# via sphinxcontrib-datatemplates
deprecated==1.2.15
# via pygithub
docutils==0.21.2
@@ -175,6 +177,7 @@ pyyaml==6.0.2
# myst-parser
# rocm-docs-core
# sphinx-external-toc
# sphinxcontrib-datatemplates
pyzmq==26.2.0
# via
# ipykernel
@@ -187,7 +190,7 @@ requests==2.32.3
# via
# pygithub
# sphinx
rocm-docs-core==1.15.0
rocm-docs-core==1.17.0
# via -r requirements.in
rpds-py==0.22.3
# via
@@ -215,6 +218,8 @@ sphinx==8.1.3
# sphinx-notfound-page
# sphinx-reredirects
# sphinx-sitemap
# sphinxcontrib-datatemplates
# sphinxcontrib-runcmd
sphinx-book-theme==1.1.3
# via rocm-docs-core
sphinx-copybutton==0.5.2
@@ -226,11 +231,13 @@ sphinx-external-toc==1.0.1
sphinx-notfound-page==1.0.4
# via rocm-docs-core
sphinx-reredirects==0.1.5
# via -r requirements.in
# via -r docs/sphinx/requirements.in
sphinx-sitemap==2.6.0
# via -r requirements.in
# via -r docs/sphinx/requirements.in
sphinxcontrib-applehelp==2.0.0
# via sphinx
sphinxcontrib-datatemplates==0.11.0
# via -r docs/sphinx/requirements.in
sphinxcontrib-devhelp==2.0.0
# via sphinx
sphinxcontrib-htmlhelp==2.1.0
@@ -239,6 +246,8 @@ sphinxcontrib-jsmath==1.0.1
# via sphinx
sphinxcontrib-qthelp==2.0.0
# via sphinx
sphinxcontrib-runcmd==0.2.0
# via sphinxcontrib-datatemplates
sphinxcontrib-serializinghtml==2.0.0
# via sphinx
sqlalchemy==2.0.37

docs/sphinx/static/css/vllm-benchmark.css (new file, 102 lines)
@@ -0,0 +1,102 @@
/* ------------------ Compatibility options grid ------------------ */
html {
  --compat-border-radius: 2px;
  --compat-accent-color: var(--pst-color-primary);
  --compat-bg-color: var(--pst-color-on-background);
  --compat-fg-color: var(--pst-color-primary-text);
  --compat-head-color: var(--pst-color-surface);
  --compat-param-hover-color: var(--pst-color-link-hover);
  --compat-param-selected-color: var(--pst-color-primary);
}

html[data-theme="light"] {
  --compat-border-color: var(--pst-gray-500);
  --compat-param-disabled-color: var(--pst-gray-300);
}

html[data-theme="dark"] {
  --compat-border-color: var(--pst-gray-600);
  --compat-param-disabled-color: var(--pst-gray-600);
}

div#vllm-benchmark-ud-params-picker.container-fluid {
  padding: 0 0 1rem 0;
}

div[data-param-k="model"] {
  background-color: var(--compat-bg-color);
  padding: 2px;
  border: solid 1px var(--compat-border-color);
  font-weight: 500;
  cursor: pointer;
}

div[data-param-k="model"][data-param-state="selected"] {
  background-color: var(--compat-param-selected-color);
  color: var(--compat-fg-color);
}

div[data-param-k="model"][data-param-state="latest-version"] {
  background-color: var(--compat-param-selected-color);
  color: var(--compat-fg-color);
}

div[data-param-k="model"][data-param-state="disabled"] {
  background-color: var(--compat-param-disabled-color);
  text-decoration: line-through;
  /* text-decoration-color: var(--pst-color-danger); */
  cursor: auto;
}

div[data-param-k="model"]:not([data-param-state]):hover {
  background-color: var(--compat-param-hover-color);
}

div[data-param-k="model-group"] {
  background-color: var(--compat-bg-color);
  padding: 2px;
  border: solid 1px var(--compat-border-color);
  font-weight: 500;
  cursor: pointer;
}

div[data-param-k="model-group"][data-param-state="selected"] {
  background-color: var(--compat-param-selected-color);
  color: var(--compat-fg-color);
}

div[data-param-k="model-group"][data-param-state="latest-version"] {
  background-color: var(--compat-param-selected-color);
  color: var(--compat-fg-color);
}

div[data-param-k="model-group"][data-param-state="disabled"] {
  background-color: var(--compat-param-disabled-color);
  text-decoration: line-through;
  /* text-decoration-color: var(--pst-color-danger); */
  cursor: auto;
}

div[data-param-k="model-group"]:not([data-param-state]):hover {
  background-color: var(--compat-param-hover-color);
}

.model-param-head {
  background-color: var(--compat-head-color);
  padding: 0.15rem 0.15rem 0.15rem 0.67rem;
  /* margin: 2px; */
  border-right: solid 2px var(--compat-accent-color);
  font-weight: 600;
}

.model-param {
  /* padding: 2px; */
  /* margin: 0 2px 0 2px; */
  /* margin: 2px; */
  border: solid 1px var(--compat-border-color);
  font-weight: 500;
}

.hidden {
  display: none !important;
}