Compare commits: docs/7.0.0...docs/7.9.0 (172 commits)
Commit SHA1s:
e229b10a06, b2c4ea3c76, c92c9d2b57, 1a2e4f1cc0, e76255db98, cdbcad6930, dc19d266b4, 8584fa3b25,
163d46ca5c, 56f93e72de, f004891485, 3c61d4fb05, 39783dcee0, c965b7e98e, 70acbcb27a, b0fc5959da,
70fc9d8fb8, a32210fa7e, 78258e0f85, c79d9f74ef, fb1b78c6f0, 3a70d75f5e, 61e1f088a1, 1f6e5c5e04,
e8a0769842, 6f9579d052, 245d53a021, 35dbbb22bc, 03dc8cee00, 323e5fd27a, b11fd7b492, 5e2efa05a6,
29a90f0271, c06242bb89, 68e8453ca5, 503b8bcc86, e3d97d339a, 978c58d196, a366048b64, 4c3e33c291,
89758e67d8, 5d0f201b4d, e3677d89a6, f20edab8fc, 6f84d50011, 57dd082f28, eeea0d2180, 93c6d17922,
f91c2b9b4a, 5e6b66ca39, 6b8b359d03, 38e659e5f0, 0894547f5a, aca31170c4, d21ec9eea5, 189c269350,
774cb7a1b3, 024cb4db76, 945fb286f7, ee93101541, e31841312b, 41b5298659, 58790154b2, 6f7f73ac0b,
b2e3bc8565, 52979e2fdb, 0ea5216ace, 2e1b4dd5ee, 5c7b993c0c, 2d79b3c4bd, fd59b5fbac, 0a643f4686,
d9e5744f7a, ccb849ec02, 42d4867964, 375359a5dd, e92745f1ff, 0fa72358d3, 6fec268a4e, ff14cd1ff5,
8f65688653, 33d1493adb, 4b6c7776a2, af811daa1b, d6c045e482, 78b24cad39, 753a94c0bb, 6ecad57c62,
977554809a, 7b00f4493b, 95c439a272, 94e04fbdc0, 7ab59de8af, 175c817563, 25516d312e, 30c345629a,
210dc94bbb, a54023ccb8, 17e3362dc7, 0f9c0d884d, c890de4b16, 4ea77ab515, c0512612f4, 1c81ac3747,
4bafa42e52, 493801e670, 1a5152b7b3, 874c881012, bdcaeea74c, b02669acf7, 844f10b2b1, d6c14920b4,
4affe10a7c, 81341ef435, abacd328f9, 80b2fb6e26, b53e8decfc, 5fcc2eafde, 2eb0d77bc6, d84b41908f,
986f8284d1, d92d9268dc, 1629d3f0ea, 6cf6b34b2e, c35a0a121a, 412e383654, 39f6fc187d, 05b480fb28,
4fa44d90db, c9ef13d823, f02172050b, 154dbe297a, 993a0a4fd4, c03662f410, 442d7e4750, a09a8f517e,
0bbaab645d, 4b80405e2e, d92e5b6c12, 91fce2e134, 27d53cf082, bc084246be, 9827ba7ff2, bafda50153,
cae65c6c43, 6a66167486, 0f3543d6e8, 678691c3d7, 5cb3debed9, dd5d710727, eca1ecde92, ed1e414710,
20c90fc406, 6e39614b22, f7873ac74e, a86fba556b, 7603fed080, 9932cd4ac2, e8d104124f, 26f708da87,
5a5e4dbb6e, 1c3dae75e1, bab853a0d3, 5c7ccb3c26, f216b371a0, 37faf170b1, 8c40d14d7e, d5101532f7,
ef4e7ca1fe, be68246824, 1626ee4d8b, 7316031fe6
@@ -79,7 +79,7 @@ jobs:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
- task: Bash@3
displayName: Add lit to PATH
inputs:

@@ -131,7 +131,7 @@ jobs:
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:

@@ -212,7 +212,7 @@ jobs:
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
.azuredevops/components/aqlprofile.yml (new file, 174 lines added)
@@ -0,0 +1,174 @@
parameters:
- name: componentName
type: string
default: aqlprofile
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
# monorepo related parameters
- name: sparseCheckoutDir
type: string
default: ''
- name: triggerDownstreamJobs
type: boolean
default: false
- name: downstreamAggregateNames
type: string
default: ''
- name: buildDependsOn
type: object
default: null
- name: unifiedBuild
type: boolean
default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
type: boolean
default: false
- name: aptPackages
type: object
default:
- cmake
- git
- ninja-build
- python3-pip
- name: rocmDependencies
type: object
default:
- clr
- llvm-project
- ROCR-Runtime
- name: rocmTestDependencies
type: object
default:
- clr
- llvm-project
- ROCR-Runtime
- rocprofiler-register

- name: jobMatrix
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
testJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }

jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ variables.MEDIUM_BUILD_POOL }}
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
parameters:
dependencyList:
- gtest
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
consolidateBuildAndInstall: true
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/aqlprofile/cmake_modules
-DAQLPROFILE_BUILD_TESTS=ON
-DGPU_TARGETS=${{ job.target }}
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- ${{ if eq(job.os, 'ubuntu2204') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
gpuTarget: ${{ job.target }}

- ${{ if eq(parameters.unifiedBuild, False) }}:
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- checkout: none
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
preTargetFilter: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: ${{ parameters.componentName }}
testDir: $(Agent.BuildDirectory)/rocm/share/hsa-amd-aqlprofile/
testExecutable: ./run_tests.sh
testParameters: ''
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
environment: test
gpuTarget: ${{ job.target }}
@@ -77,6 +77,7 @@ parameters:
- clr
- hipBLAS-common
- llvm-project
- rocm-cmake
- rocminfo
- rocm_smi_lib
- rocprofiler-register

@@ -144,7 +145,7 @@ jobs:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
@@ -40,6 +40,7 @@ parameters:
- gfortran
- libgfortran5
- libopenblas-dev
- liblapack-dev
- name: pipModules
type: object
default:

@@ -53,6 +54,7 @@ parameters:
- hipSPARSE
- llvm-project
- rocBLAS
- rocm-cmake
- rocm_smi_lib
- rocminfo
- rocprofiler-register

@@ -66,6 +68,7 @@ parameters:
- llvm-project
- hipBLAS-common
- hipBLASLt
- rocm-cmake
- rocBLAS
- rocminfo
- rocprofiler-register

@@ -109,7 +112,7 @@ jobs:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:

@@ -125,10 +128,13 @@ jobs:
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
# NOTE: content between `---` is for transition support between old/new build systems
# and should be removed once transition is complete.
# -----------------------------
# Build and install gtest and lapack
# $(Pipeline.Workspace)/deps is a temporary folder for the build process
# $(Pipeline.Workspace)/s/deps is part of the hipSPARSELt repo
- script: mkdir $(Pipeline.Workspace)/deps
- script: mkdir -p $(Pipeline.Workspace)/deps
displayName: Create temp folder for external dependencies
# hipSPARSELt already has a CMake script for external deps, so we can just run that
# https://github.com/ROCm/hipSPARSELt/blob/develop/deps/CMakeLists.txt

@@ -144,22 +150,35 @@ jobs:
- script: sudo make install
displayName: Install hipSPARSELt external dependencies
workingDirectory: $(Pipeline.Workspace)/deps
# -----------------------------
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
# NOTE: the following options are old build only
# and can be removed after full transition to new build
# -DAMDGPU_TARGETS=${{ job.target }}
# -DCMAKE_Fortran_COMPILER=f95
# -DTensile_LOGIC=
# -DTensile_CPU_THREADS=
# -DTensile_LIBRARY_FORMAT=msgpack
# -DROCM_PATH=$(Agent.BuildDirectory)/rocm
# -DBUILD_CLIENTS_TESTS=ON
# -DBUILD_USE_LOCAL_TENSILE=OFF
extraBuildFlags: >-
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-DCMAKE_Fortran_COMPILER=f95
-DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm"
-DGPU_TARGETS=${{ job.target }}
-DAMDGPU_TARGETS=${{ job.target }}
-DCMAKE_Fortran_COMPILER=f95
-DTensile_LOGIC=
-DTensile_CPU_THREADS=
-DTensile_LIBRARY_FORMAT=msgpack
-DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm"
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
-DBUILD_CLIENTS_TESTS=ON
-DBUILD_USE_LOCAL_TENSILE=OFF
-DHIPSPARSELT_ENABLE_FETCH=ON
-GNinja
${{ if ne(parameters.sparseCheckoutDir, '') }}:
cmakeSourceDir: $(Build.SourcesDirectory)/projects/hipsparselt
@@ -77,6 +77,7 @@ jobs:
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/rocm/llvm
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_BUILD_TYPE=Release
-DHIPTENSOR_BUILD_TESTS=ON
@@ -71,7 +71,7 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
@@ -39,6 +39,9 @@ parameters:
- python3
- python3-dev
- python3-pip
- libgtest-dev
- libboost-filesystem-dev
- libboost-program-options-dev
- name: pipModules
type: object
default:

@@ -107,8 +110,12 @@ jobs:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
parameters:
dependencyList:
- gtest
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}

@@ -125,7 +132,7 @@ jobs:
parameters:
os: ${{ job.os }}
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DORIGAMI_BUILD_SHARED_LIBS=ON
-DORIGAMI_ENABLE_PYTHON=ON

@@ -206,7 +213,15 @@ jobs:
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
testDir: '$(Agent.BuildDirectory)/rocm/bin'
testExecutable: './origami-tests'
testParameters: '--yaml origami-tests.yaml --gtest_output=xml:./test_output.xml --gtest_color=yes'
- script: |
set -e
export PYTHONPATH=$(Agent.BuildDirectory)/s/build/python:$PYTHONPATH

echo "--- Running origami_test.py ---"
@@ -83,7 +83,7 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
@@ -1,10 +1,29 @@
parameters:
- name: componentName
type: string
default: rdc
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
# monorepo related parameters
- name: sparseCheckoutDir
type: string
default: ''
- name: triggerDownstreamJobs
type: boolean
default: false
- name: downstreamAggregateNames
type: string
default: ''
- name: buildDependsOn
type: object
default: null
- name: unifiedBuild
type: boolean
default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline

@@ -33,6 +52,7 @@ parameters:
- clr
- hipBLAS-common
- hipBLASLt
- hipRAND
- llvm-project
- rocBLAS
- rocm-cmake

@@ -43,6 +63,7 @@ parameters:
- rocprofiler
- rocprofiler-register
- rocprofiler-sdk
- rocRAND
- ROCR-Runtime
- name: rocmTestDependencies
type: object

@@ -74,7 +95,11 @@ parameters:

jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: rdc_build_${{ job.target }}
- job: ${{ parameters.componentName }}_build_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml

@@ -85,16 +110,22 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
parameters:
cmakeVersion: '3.25.0'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
gpuTarget: ${{ job.target }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
# Build grpc
- task: Bash@3
displayName: 'git clone grpc'

@@ -104,6 +135,7 @@ jobs:
workingDirectory: $(Build.SourcesDirectory)
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: ${{ parameters.componentName }}
cmakeBuildDir: $(Build.SourcesDirectory)/grpc/build
cmakeSourceDir: $(Build.SourcesDirectory)/grpc
installDir: $(Build.SourcesDirectory)/bin

@@ -117,6 +149,7 @@ jobs:
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: ${{ parameters.componentName }}
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DGRPC_ROOT="$(Build.SourcesDirectory)/bin"

@@ -126,9 +159,12 @@ jobs:
-DAMDGPU_TARGETS=${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml

@@ -136,60 +172,64 @@ jobs:
aptPackages: ${{ parameters.aptPackages }}
gpuTarget: ${{ job.target }}

- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: rdc_test_${{ job.target }}
dependsOn: rdc_build_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: ROCM_PATH
value: $(Agent.BuildDirectory)/rocm
- name: ROCM_DIR
value: $(Agent.BuildDirectory)/rocm
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
- task: Bash@3
displayName: Setup test environment
inputs:
targetType: inline
script: |
sudo ln -s $(Agent.BuildDirectory)/rocm/bin/rdcd /usr/sbin/rdcd
echo $(Agent.BuildDirectory)/rocm/lib/rdc/grpc/lib | sudo tee /etc/ld.so.conf.d/grpc.conf
sudo ldconfig -v
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- task: Bash@3
displayName: Test rdc
inputs:
targetType: inline
script: >-
$(Agent.BuildDirectory)/rocm/share/rdc/rdctst_tests/rdctst
--batch_mode
--start_rdcd
--unauth_comm
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
environment: test
gpuTarget: ${{ job.target }}
extraPaths: /home/user/workspace/rocm/bin
- ${{ if eq(parameters.unifiedBuild, False) }}:
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ${{ parameters.componentName }}_test_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: ROCM_PATH
value: $(Agent.BuildDirectory)/rocm
- name: ROCM_DIR
value: $(Agent.BuildDirectory)/rocm
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- checkout: none
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- task: Bash@3
displayName: Setup test environment
inputs:
targetType: inline
script: |
sudo ln -s $(Agent.BuildDirectory)/rocm/bin/rdcd /usr/sbin/rdcd
echo $(Agent.BuildDirectory)/rocm/lib/rdc/grpc/lib | sudo tee /etc/ld.so.conf.d/grpc.conf
sudo ldconfig -v
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- task: Bash@3
displayName: Test rdc
inputs:
targetType: inline
script: >-
$(Agent.BuildDirectory)/rocm/share/rdc/rdctst_tests/rdctst
--batch_mode
--start_rdcd
--unauth_comm
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
environment: test
gpuTarget: ${{ job.target }}
extraPaths: /home/user/workspace/rocm/bin
@@ -70,6 +70,7 @@ parameters:
- hipBLAS-common
- hipBLASLt
- llvm-project
- rocm-cmake
- rocminfo
- rocprofiler-register
- rocm_smi_lib

@@ -154,7 +155,7 @@ jobs:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
@@ -210,7 +210,7 @@ jobs:
parameters:
componentName: ${{ parameters.componentName }}
testDir: '$(Agent.BuildDirectory)/rocm/bin/rocprim'
extraTestParameters: '-I ${{ job.shard }},,${{ job.shardCount }} -E device_merge_inplace'
extraTestParameters: '-I ${{ job.shard }},,${{ job.shardCount }}'
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
@@ -14,9 +14,11 @@ parameters:
type: object
default:
- cmake
- libdw-dev
- libglfw3-dev
- libmsgpack-dev
- libtbb-dev
- libva-amdgpu-dev
- ninja-build
- python3-pip
- name: rocmDependencies

@@ -33,16 +35,21 @@ parameters:
- hipRAND
- hipSOLVER
- hipSPARSE
- hipTensor
- llvm-project
- MIOpen
- rocBLAS
- rocFFT
- rocJPEG
- rocPRIM
- rocprofiler-register
- rocprofiler-sdk
- ROCR-Runtime
- rocRAND
- rocSOLVER
- rocSPARSE
- rocThrust
- rocWMMA
- name: rocmTestDependencies
type: object
default:

@@ -57,18 +64,22 @@ parameters:
- hipRAND
- hipSOLVER
- hipSPARSE
- hipTensor
- llvm-project
- rocBLAS
- rocFFT
- rocminfo
- rocPRIM
- rocJPEG
- rocprofiler-register
- rocprofiler-sdk
- ROCR-Runtime
- rocRAND
- rocSOLVER
- rocSPARSE
- rocThrust
- roctracer
- rocWMMA

- name: jobMatrix
type: object

@@ -97,6 +108,10 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
registerROCmPackages: true
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
parameters:
cmakeVersion: '3.25.0'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:

@@ -158,6 +173,10 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
registerROCmPackages: true
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
parameters:
cmakeVersion: '3.25.0'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
@@ -102,7 +102,7 @@ jobs:
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
@@ -213,6 +213,7 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: ${{ parameters.componentName }}
testDir: $(Agent.BuildDirectory)/s/build
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
@@ -226,8 +226,11 @@ jobs:
echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/llvm/bin"
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
cmakeSourceDir: $(Agent.BuildDirectory)/s/projects/rocprofiler-systems
# build flags reference: https://rocm.docs.amd.com/projects/omnitrace/en/latest/install/install.html
extraBuildFlags: >-
-DCMAKE_INSTALL_PREFIX=$(Agent.BuildDirectory)/rocprofiler-systems
-DROCPROFSYS_USE_PYTHON=ON
-DROCPROFSYS_BUILD_TESTING=ON
-DROCPROFSYS_BUILD_DYNINST=ON
-DROCPROFSYS_BUILD_LIBUNWIND=ON

@@ -245,11 +248,13 @@ jobs:
displayName: Set up rocprofiler-systems env
inputs:
targetType: inline
script: source share/rocprofiler-systems/setup-env.sh
workingDirectory: build
script: source $(Agent.BuildDirectory)/rocprofiler-systems/share/rocprofiler-systems/setup-env.sh
workingDirectory: $(Agent.BuildDirectory)/rocprofiler-systems/share/rocprofiler-systems
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: ${{ parameters.componentName }}
testDir: $(Agent.BuildDirectory)/s/build/tests/
testParameters: '--output-on-failure'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
gpuTarget: ${{ job.target }}
@@ -1,10 +1,15 @@
parameters:
- name: cmakeVersion
type: string
default: '3.31.0'

steps:
- task: Bash@3
displayName: Install CMake 3.31
displayName: Install CMake ${{ parameters.cmakeVersion }}
inputs:
targetType: inline
script: |
CMAKE_VERSION=3.31.0
CMAKE_VERSION=${{ parameters.cmakeVersion }}
CMAKE_ROOT="$(Pipeline.Workspace)/cmake"

echo "Downloading CMake $CMAKE_VERSION..."
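The hunk above parameterizes the shared CMake install step: `dependencies-cmake-custom.yml` now accepts a `cmakeVersion` string (default `'3.31.0'`) instead of hard-coding 3.31. A minimal sketch of how a component pipeline pins an older release through this parameter, mirroring the `cmakeVersion: '3.25.0'` overrides that appear in the rdc and test-job hunks elsewhere in this diff (the surrounding job skeleton is abbreviated for illustration):

```yaml
# Sketch: pin a specific CMake release via the shared step template.
steps:
  # Install CMake 3.25.0 rather than the template's 3.31.0 default.
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
    parameters:
      cmakeVersion: '3.25.0'
```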
@@ -46,6 +46,10 @@ parameters:
pipelineId: 115
developBranch: aomp-dev
hasGpuTarget: false
aqlprofile:
pipelineId: 365
developBranch: develop
hasGpuTarget: false
clr:
pipelineId: 335
developBranch: develop

@@ -126,13 +130,17 @@ parameters:
pipelineId: 80
developBranch: develop
hasGpuTarget: true
origami:
pipelineId: 364
developBranch: develop
hasGpuTarget: true
rccl:
pipelineId: 107
developBranch: develop
hasGpuTarget: true
rdc:
pipelineId: 100
developBranch: amd-staging
pipelineId: 360
developBranch: develop
hasGpuTarget: false
rocAL:
pipelineId: 151

@@ -219,8 +227,8 @@ parameters:
developBranch: develop
hasGpuTarget: true
rocprofiler-systems:
pipelineId: 255
developBranch: amd-staging
pipelineId: 345
developBranch: develop
hasGpuTarget: true
rocPyDecode:
pipelineId: 239
@@ -36,6 +36,7 @@ Arb
Autocast
BARs
BatchNorm
BKC
BLAS
BMC
BabelStream

@@ -43,6 +44,7 @@ Blit
Blockwise
Bluefield
Bootloader
Broadcom
CAS
CCD
CDNA

@@ -72,11 +74,13 @@ CU
CUDA
CUs
CXX
CX
Cavium
CentOS
ChatGPT
CoRR
Codespaces
ComfyUI
Commitizen
CommonMark
Concretized

@@ -118,6 +122,8 @@ Dependabot
Deprecations
DevCap
DirectX
Disaggregated
disaggregated
Dockerfile
Dockerized
Doxygen

@@ -127,6 +133,7 @@ ENDPGM
EPYC
ESXi
EoS
etcd
fas
FBGEMM
FIFOs

@@ -142,6 +149,8 @@ Filesystem
FindDb
Flang
FlashAttention
FlashInfer’s
FlashInfer
FluxBenchmark
Fortran
Fuyu

@@ -170,6 +179,7 @@ GLXT
Gloo
GMI
GPG
GPGPU
GPR
GPT
GPU

@@ -178,6 +188,7 @@ GPUs
Graphbolt
GraphSage
GRBM
GRE
GenAI
GenZ
GitHub

@@ -205,6 +216,7 @@ Higgs
href
Hyperparameters
Huggingface
HunyuanVideo
IB
ICD
ICT

@@ -287,6 +299,7 @@ MVAPICH
MVFFR
Makefile
Makefiles
ManyLinux
Matplotlib
Matrox
MaxText

@@ -301,9 +314,11 @@ MirroredStrategy
Mixtral
MosaicML
MoEs
Mooncake
Mpops
Multicore
Multithreaded
MXFP
MyEnvironment
MyST
NANOO

@@ -365,6 +380,7 @@ perf
PEQT
PIL
PILImage
PLDM
POR
PRNG
PRs

@@ -429,6 +445,7 @@ SBIOS
SCA
ScaledGEMM
SDK
SDKs
SDMA
SDPA
SDRAM

@@ -445,6 +462,7 @@ SKU
SKUs
SLES
SLURM
Slurm
SMEM
SMFMA
SMI

@@ -473,6 +491,8 @@ TCI
TCIU
TCP
TCR
TVM
TheRock
THREADGROUPS
threadgroups
TensorRT

@@ -615,6 +635,7 @@ coalescable
codename
collater
comgr
compat
completers
composable
concretization

@@ -662,6 +683,7 @@ detections
dev
devicelibs
devsel
dgl
dimensionality
disambiguates
distro

@@ -701,6 +723,7 @@ githooks
github
globals
gnupg
gpu
grayscale
gx
gzip

@@ -755,6 +778,7 @@ invariants
invocating
ipo
jax
json
kdb
kfd
kv

@@ -776,6 +800,7 @@ lossy
macOS
matchers
maxtext
megablocks
megatron
microarchitecture
migraphx

@@ -860,6 +885,7 @@ radeon
rccl
rdc
rdma
redhat
reStructuredText
redirections
refactorization

@@ -912,6 +938,7 @@ roctracer
rst
runtime
runtimes
ryzen
ResNet
sL
scalability

@@ -934,6 +961,7 @@ softmax
spack
spmm
src
stanford
stochastically
strided
subcommand

@@ -953,6 +981,7 @@ tabindex
targetContainer
td
tensorfloat
tf
th
tokenization
tokenize

@@ -965,11 +994,13 @@ toolset
toolsets
torchtitan
torchvision
tp
tqdm
tracebacks
txt
TopK
uarch
ubuntu
uncached
uncacheable
uncorrectable

@@ -1016,6 +1047,8 @@ writebacks
wrreq
wzo
xargs
xDiT
xdit
xGMI
xPacked
xz
9169 CHANGELOG.md
3122 RELEASE.md
@@ -1,7 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<manifest>
<remote name="rocm-org" fetch="https://github.com/ROCm/" />
<default revision="refs/tags/rocm-7.0.0"
<default revision="refs/tags/rocm-7.0.2"
remote="rocm-org"
sync-c="true"
sync-j="4" />

@@ -41,7 +41,6 @@
<project groups="mathlibs" name="MIVisionX" />
<project groups="mathlibs" name="ROCmValidationSuite" />
<project groups="mathlibs" name="composable_kernel" />
<project groups="mathlibs" name="hipSOLVER" />
<project groups="mathlibs" name="hipTensor" />
<project groups="mathlibs" name="hipfort" />
<project groups="mathlibs" name="rccl" />

@@ -57,7 +56,6 @@
<project groups="mathlibs" name="rocm-libraries" />
<project groups="mathlibs" name="rocPyDecode" />
<project groups="mathlibs" name="rocSHMEM" />
<project groups="mathlibs" name="rocSOLVER" />
<project groups="mathlibs" name="rocWMMA" />
<project groups="mathlibs" name="rocm-cmake" />
<project groups="mathlibs" name="rpp" />
@@ -1,134 +0,0 @@
ROCm Version,7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0, 6.1.5, 6.1.2, 6.1.1, 6.1.0, 6.0.2, 6.0.0
:ref:`Operating systems & kernels <OS-kernel-versions>`,Ubuntu 24.04.3,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,"Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04",Ubuntu 24.04,,,,,,
,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,"Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3, 22.04.2","Ubuntu 22.04.4, 22.04.3, 22.04.2"
,,,,,,,,,,,,,,"Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5"
,"RHEL 9.6, 9.4","RHEL 9.6, 9.4","RHEL 9.6, 9.4","RHEL 9.6, 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.3, 9.2","RHEL 9.3, 9.2"
,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,"RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8"
,SLES 15 SP7,"SLES 15 SP7, SP6","SLES 15 SP7, SP6",SLES 15 SP6,SLES 15 SP6,"SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4"
,,,,,,,,,,,,,,,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9
,"Oracle Linux 9, 8 [#ol-700-mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_",Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,,,
,Debian 12,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,,,,,,,,,,,
,Azure Linux 3.0 [#az-mi300x-past-60]_,Azure Linux 3.0 [#az-mi300x-past-60]_,Azure Linux 3.0 [#az-mi300x-past-60]_,Azure Linux 3.0 [#az-mi300x-past-60]_,Azure Linux 3.0 [#az-mi300x-past-60]_,Azure Linux 3.0 [#az-mi300x-630-past-60]_,Azure Linux 3.0 [#az-mi300x-630-past-60]_,,,,,,,,,,,,
,Rocky Linux 9,,,,,,,,,,,,,,,,,,
,.. _architecture-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,
:doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA4,,,,,,,,,,,,,,,,,,
,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3
,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2
,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA
,RDNA4,RDNA4,RDNA4,RDNA4,,,,,,,,,,,,,,,
,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3
,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2
,.. _gpu-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,
:doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>`,gfx950,,,,,,,,,,,,,,,,,,
,gfx1201 [#RDNA-OS-past-60]_,gfx1201 [#RDNA-OS-past-60]_,gfx1201 [#RDNA-OS-past-60]_,gfx1201 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
,gfx1200 [#RDNA-OS-past-60]_,gfx1200 [#RDNA-OS-past-60]_,gfx1200 [#RDNA-OS-past-60]_,gfx1200 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
,gfx1101 [#RDNA-OS-past-60]_ [#7700XT-OS-past-60]_,gfx1101 [#RDNA-OS-past-60]_ [#7700XT-OS-past-60]_,gfx1101 [#RDNA-OS-past-60]_ [#7700XT-OS-past-60]_,gfx1101 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100
,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030
,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942 [#mi300_624-past-60]_,gfx942 [#mi300_622-past-60]_,gfx942 [#mi300_621-past-60]_,gfx942 [#mi300_620-past-60]_, gfx942 [#mi300_612-past-60]_, gfx942 [#mi300_612-past-60]_, gfx942 [#mi300_611-past-60]_, gfx942 [#mi300_610-past-60]_, gfx942 [#mi300_602-past-60]_, gfx942 [#mi300_600-past-60]_
,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a
,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908
,,,,,,,,,,,,,,,,,,,
FRAMEWORK SUPPORT,.. _framework-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,
:doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.7, 2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13"
:doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.19.1, 2.18.1","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.14.0, 2.13.1, 2.12.1","2.14.0, 2.13.1, 2.12.1"
:doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.6.0,0.4.35,0.4.35,0.4.35,0.4.35,0.4.31,0.4.31,0.4.31,0.4.31,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26
:doc:`verl <../compatibility/ml-compatibility/verl-compatibility>` [#verl_compat]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.3.0.post0,N/A,N/A,N/A,N/A,N/A,
:doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>`,N/A,N/A,N/A,N/A,N/A,85f95ae,85f95ae,85f95ae,85f95ae,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,
:doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat]_,N/A,N/A,N/A,N/A,2.4.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,
:doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>`,N/A,N/A,N/A,N/A,N/A,0.7.0,0.7.0,0.7.0,0.7.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,
:doc:`Taichi <../compatibility/ml-compatibility/taichi-compatibility>` [#taichi_compat]_,N/A,N/A,N/A,N/A,N/A,N/A,1.8.0b1,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,
`ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.22.0,1.20.0,1.20.0,1.20.0,1.20.0,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.14.1,1.14.1
,,,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,,,,
THIRD PARTY COMMS,.. _thirdpartycomms-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,
`UCC <https://github.com/ROCm/ucc>`_,>=1.4.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.2.0,>=1.2.0
`UCX <https://github.com/ROCm/ucx>`_,>=1.17.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1
,,,,,,,,,,,,,,,,,,,
THIRD PARTY ALGORITHM,.. _thirdpartyalgorithm-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,
Thrust,2.6.0,2.5.0,2.5.0,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
CUB,2.6.0,2.5.0,2.5.0,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
,,,,,,,,,,,,,,,,,,,
KMD & USER SPACE [#kfd_support-past-60]_,.. _kfd-userspace-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,
:doc:`KMD versions <rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`,"30.10, 6.4.x, 6.3.x, 6.2.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x"
,,,,,,,,,,,,,,,,,,,
ML & COMPUTER VISION,.. _mllibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,
:doc:`Composable Kernel <composable_kernel:index>`,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0
:doc:`MIGraphX <amdmigraphx:index>`,2.13.0,2.12.0,2.12.0,2.12.0,2.12.0,2.11.0,2.11.0,2.11.0,2.11.0,2.10.0,2.10.0,2.10.0,2.10.0,2.9.0,2.9.0,2.9.0,2.9.0,2.8.0,2.8.0
:doc:`MIOpen <miopen:index>`,3.5.0,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
:doc:`MIVisionX <mivisionx:index>`,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0
:doc:`rocAL <rocal:index>`,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0,2.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
:doc:`rocDecode <rocdecode:index>`,1.0.0,0.10.0,0.10.0,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,N/A,N/A
:doc:`rocJPEG <rocjpeg:index>`,1.1.0,0.8.0,0.8.0,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`rocPyDecode <rocpydecode:index>`,0.6.0,0.3.1,0.3.1,0.3.1,0.3.1,0.2.0,0.2.0,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`RPP <rpp:index>`,2.0.0,1.9.10,1.9.10,1.9.10,1.9.10,1.9.1,1.9.1,1.9.1,1.9.1,1.8.0,1.8.0,1.8.0,1.8.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0
,,,,,,,,,,,,,,,,,,,
COMMUNICATION,.. _commlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,
:doc:`RCCL <rccl:index>`,2.26.6,2.22.3,2.22.3,2.22.3,2.22.3,2.21.5,2.21.5,2.21.5,2.21.5,2.20.5,2.20.5,2.20.5,2.20.5,2.18.6,2.18.6,2.18.6,2.18.6,2.18.3,2.18.3
:doc:`rocSHMEM <rocshmem:index>`,3.0.0,2.0.1,2.0.1,2.0.0,2.0.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
,,,,,,,,,,,,,,,,,,,
MATH LIBS,.. _mathlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,
`half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0
:doc:`hipBLAS <hipblas:index>`,3.0.0,2.4.0,2.4.0,2.4.0,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0
:doc:`hipBLASLt <hipblaslt:index>`,1.0.0,0.12.1,0.12.1,0.12.1,0.12.0,0.10.0,0.10.0,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.7.0,0.7.0,0.7.0,0.7.0,0.6.0,0.6.0
:doc:`hipFFT <hipfft:index>`,1.0.20,1.0.18,1.0.18,1.0.18,1.0.18,1.0.17,1.0.17,1.0.17,1.0.17,1.0.16,1.0.15,1.0.15,1.0.14,1.0.14,1.0.14,1.0.14,1.0.14,1.0.13,1.0.13
:doc:`hipfort <hipfort:index>`,0.7.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.1,0.5.1,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0
:doc:`hipRAND <hiprand:index>`,3.0.0,2.12.0,2.12.0,2.12.0,2.12.0,2.11.1,2.11.1,2.11.1,2.11.0,2.11.1,2.11.0,2.11.0,2.11.0,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16
:doc:`hipSOLVER <hipsolver:index>`,3.0.0,2.4.0,2.4.0,2.4.0,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.1,2.1.1,2.1.1,2.1.0,2.0.0,2.0.0
:doc:`hipSPARSE <hipsparse:index>`,4.0.1,3.2.0,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.1.1,3.1.1,3.1.1,3.1.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
:doc:`hipSPARSELt <hipsparselt:index>`,0.2.4,0.2.3,0.2.3,0.2.3,0.2.3,0.2.2,0.2.2,0.2.2,0.2.2,0.2.1,0.2.1,0.2.1,0.2.1,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0
:doc:`rocALUTION <rocalution:index>`,4.0.0,3.2.3,3.2.3,3.2.3,3.2.2,3.2.1,3.2.1,3.2.1,3.2.1,3.2.1,3.2.0,3.2.0,3.2.0,3.1.1,3.1.1,3.1.1,3.1.1,3.0.3,3.0.3
:doc:`rocBLAS <rocblas:index>`,5.0.0,4.4.1,4.4.1,4.4.0,4.4.0,4.3.0,4.3.0,4.3.0,4.3.0,4.2.4,4.2.1,4.2.1,4.2.0,4.1.2,4.1.2,4.1.0,4.1.0,4.0.0,4.0.0
:doc:`rocFFT <rocfft:index>`,1.0.34,1.0.32,1.0.32,1.0.32,1.0.32,1.0.31,1.0.31,1.0.31,1.0.31,1.0.30,1.0.29,1.0.29,1.0.28,1.0.27,1.0.27,1.0.27,1.0.26,1.0.25,1.0.23
:doc:`rocRAND <rocrand:index>`,4.0.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.1,3.1.0,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,2.10.17
:doc:`rocSOLVER <rocsolver:index>`,3.30.0,3.28.2,3.28.2,3.28.0,3.28.0,3.27.0,3.27.0,3.27.0,3.27.0,3.26.2,3.26.0,3.26.0,3.26.0,3.25.0,3.25.0,3.25.0,3.25.0,3.24.0,3.24.0
:doc:`rocSPARSE <rocsparse:index>`,4.0.2,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.0.2,3.0.2
:doc:`rocWMMA <rocwmma:index>`,2.0.0,1.7.0,1.7.0,1.7.0,1.7.0,1.6.0,1.6.0,1.6.0,1.6.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0
:doc:`Tensile <tensile:src/index>`,4.44.0,4.43.0,4.43.0,4.43.0,4.43.0,4.42.0,4.42.0,4.42.0,4.42.0,4.41.0,4.41.0,4.41.0,4.41.0,4.40.0,4.40.0,4.40.0,4.40.0,4.39.0,4.39.0
,,,,,,,,,,,,,,,,,,,
PRIMITIVES,.. _primitivelibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,
:doc:`hipCUB <hipcub:index>`,4.0.0,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
:doc:`hipTensor <hiptensor:index>`,2.0.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0,1.3.0,1.3.0,1.2.0,1.2.0,1.2.0,1.2.0,1.1.0,1.1.0
:doc:`rocPRIM <rocprim:index>`,4.0.0,3.4.1,3.4.1,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.2,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
:doc:`rocThrust <rocthrust:index>`,4.0.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.1.1,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
,,,,,,,,,,,,,,,,,,,
SUPPORT LIBS,,,,,,,,,,,,,,,,,,,
|
||||
`hipother <https://github.com/ROCm/hipother>`_,7.0.51830,6.4.43483,6.4.43483,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
|
||||
`rocm-core <https://github.com/ROCm/rocm-core>`_,7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0,6.1.5,6.1.2,6.1.1,6.1.0,6.0.2,6.0.0
|
||||
`ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,20240607.5.7,20240607.5.7,20240607.4.05,20240607.1.4246,20240125.5.08,20240125.5.08,20240125.5.08,20240125.3.30,20231016.2.245,20231016.2.245
|
||||
,,,,,,,,,,,,,,,,,,,
|
||||
SYSTEM MGMT TOOLS,.. _tools-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,
|
||||
:doc:`AMD SMI <amdsmi:index>`,26.0.0,25.5.1,25.5.1,25.4.2,25.3.0,24.7.1,24.7.1,24.7.1,24.7.1,24.6.3,24.6.3,24.6.3,24.6.2,24.5.1,24.5.1,24.5.1,24.4.1,23.4.2,23.4.2
|
||||
:doc:`ROCm Data Center Tool <rdc:index>`,1.1.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0
|
||||
:doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
|
||||
:doc:`ROCm SMI <rocm_smi_lib:index>`,7.8.0,7.7.0,7.5.0,7.5.0,7.5.0,7.4.0,7.4.0,7.4.0,7.4.0,7.3.0,7.3.0,7.3.0,7.3.0,7.2.0,7.2.0,7.0.0,7.0.0,6.0.2,6.0.0
|
||||
:doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.2.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.0.60204,1.0.60202,1.0.60201,1.0.60200,1.0.60105,1.0.60102,1.0.60101,1.0.60100,1.0.60002,1.0.60000
|
||||
,,,,,,,,,,,,,,,,,,,
|
||||
PERFORMANCE TOOLS,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,2.6.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0
|
||||
:doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.2.3,3.1.1,3.1.1,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.0.1,2.0.1,2.0.1,2.0.1,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.1.0,1.0.2,1.0.2,1.0.1,1.0.0,0.1.2,0.1.1,0.1.0,0.1.0,1.11.2,1.11.2,1.11.2,1.11.2,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`ROCProfiler <rocprofiler:index>`,2.0.70000,2.0.60403,2.0.60402,2.0.60401,2.0.60400,2.0.60303,2.0.60302,2.0.60301,2.0.60300,2.0.60204,2.0.60202,2.0.60201,2.0.60200,2.0.60105,2.0.60102,2.0.60101,2.0.60100,2.0.60002,2.0.60000
|
||||
:doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,1.0.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`ROCTracer <roctracer:index>`,4.1.70000,4.1.60403,4.1.60402,4.1.60401,4.1.60400,4.1.60303,4.1.60302,4.1.60301,4.1.60300,4.1.60204,4.1.60202,4.1.60201,4.1.60200,4.1.60105,4.1.60102,4.1.60101,4.1.60100,4.1.60002,4.1.60000
|
||||
,,,,,,,,,,,,,,,,,,,
|
||||
DEVELOPMENT TOOLS,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`HIPIFY <hipify:index>`,20.0.0,19.0.0,19.0.0,19.0.0,19.0.0,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
|
||||
:doc:`ROCm CMake <rocmcmakebuildtools:index>`,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.13.0,0.13.0,0.13.0,0.13.0,0.12.0,0.12.0,0.12.0,0.12.0,0.11.0,0.11.0
|
||||
:doc:`ROCdbgapi <rocdbgapi:index>`,0.77.3,0.77.2,0.77.2,0.77.2,0.77.2,0.77.0,0.77.0,0.77.0,0.77.0,0.76.0,0.76.0,0.76.0,0.76.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0
|
||||
:doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,16.3.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,14.2.0,14.2.0,14.2.0,14.2.0,14.1.0,14.1.0,14.1.0,14.1.0,13.2.0,13.2.0
|
||||
`rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.3.0,0.3.0,0.3.0,0.3.0,N/A,N/A
|
||||
:doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.1.0,2.0.4,2.0.4,2.0.4,2.0.4,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3
|
||||
,,,,,,,,,,,,,,,,,,,
|
||||
COMPILERS,.. _compilers-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,
|
||||
`clang-ocl <https://github.com/ROCm/clang-ocl>`_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0
|
||||
:doc:`hipCC <hipcc:index>`,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
|
||||
`Flang <https://github.com/ROCm/flang>`_,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
|
||||
:doc:`llvm-project <llvm-project:index>`,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
|
||||
`OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
|
||||
,,,,,,,,,,,,,,,,,,,
|
||||
RUNTIMES,.. _runtime-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,
|
||||
:doc:`AMD CLR <hip:understand/amd_clr>`,7.0.51830,6.4.43484,6.4.43484,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
|
||||
:doc:`HIP <hip:index>`,7.0.51830,6.4.43484,6.4.43484,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
|
||||
`OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0
|
||||
:doc:`ROCr Runtime <rocr-runtime:index>`,1.18.0,1.15.0,1.15.0,1.15.0,1.15.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.13.0,1.13.0,1.13.0,1.13.0,1.13.0,1.12.0,1.12.0
|
||||
docs/compatibility/compatibility-matrix.md
@@ -1,268 +0,0 @@
|
||||
.. meta::
|
||||
:description: ROCm compatibility matrix
|
||||
:keywords: GPU, architecture, hardware, compatibility, system, requirements, components, libraries
|
||||
|
||||
**************************************************************************************
|
||||
Compatibility matrix
|
||||
**************************************************************************************
|
||||
|
||||
Use this matrix to view the ROCm compatibility and system requirements across successive major and minor releases.
|
||||
|
||||
You can also refer to the :ref:`past versions of ROCm compatibility matrix<past-rocm-compatibility-matrix>`.
|
||||
|
||||
Accelerators and GPUs listed in the following table support compute workloads (no display
|
||||
information or graphics). If you’re using ROCm with AMD Radeon or Radeon Pro GPUs for graphics
|
||||
workloads, see the `Use ROCm on Radeon GPU documentation
|
||||
<https://rocm.docs.amd.com/projects/radeon/en/latest/docs/compatibility.html>`_ to verify
|
||||
compatibility and system requirements.
|
||||
|
||||
.. |br| raw:: html
|
||||
|
||||
<br/>
|
||||
|
||||
.. container:: format-big-table
|
||||
|
||||
.. csv-table::
|
||||
:header: "ROCm Version", "7.0.0", "6.4.3", "6.3.0"
|
||||
:stub-columns: 1
|
||||
|
||||
:ref:`Operating systems & kernels <OS-kernel-versions>`,Ubuntu 24.04.3,Ubuntu 24.04.2,Ubuntu 24.04.2
|
||||
,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5
|
||||
,"RHEL 9.6, 9.4","RHEL 9.6, 9.4","RHEL 9.5, 9.4"
|
||||
,RHEL 8.10 [#rhel-700]_,RHEL 8.10 [#rhel-700]_,RHEL 8.10 [#rhel-700]_
|
||||
,SLES 15 SP7,"SLES 15 SP7, SP6","SLES 15 SP6, SP5"
|
||||
,"Oracle Linux 9, 8 [#ol-700-mi300x]_","Oracle Linux 9, 8 [#ol-mi300x]_",Oracle Linux 8.10 [#ol-mi300x]_
|
||||
,Debian 12,Debian 12 [#single-node]_,
|
||||
,Azure Linux 3.0 [#az-mi300x]_,Azure Linux 3.0 [#az-mi300x]_,
|
||||
,Rocky Linux 9 [#rl-700]_,,
|
||||
,.. _architecture-support-compatibility-matrix:,,
|
||||
:doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA4,,
|
||||
,CDNA3,CDNA3,CDNA3
|
||||
,CDNA2,CDNA2,CDNA2
|
||||
,CDNA,CDNA,CDNA
|
||||
,RDNA4,RDNA4,
|
||||
,RDNA3,RDNA3,RDNA3
|
||||
,RDNA2,RDNA2,RDNA2
|
||||
,.. _gpu-support-compatibility-matrix:,,
|
||||
:doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>`,gfx950,,
|
||||
,gfx1201 [#RDNA-OS]_,gfx1201 [#RDNA-OS]_,
|
||||
,gfx1200 [#RDNA-OS]_,gfx1200 [#RDNA-OS]_,
|
||||
,gfx1101 [#RDNA-OS]_ [#7700XT-OS]_,gfx1101 [#RDNA-OS]_ [#7700XT-OS]_,
|
||||
,gfx1100,gfx1100,gfx1100
|
||||
,gfx1030,gfx1030,gfx1030
|
||||
,gfx942,gfx942,gfx942
|
||||
,gfx90a,gfx90a,gfx90a
|
||||
,gfx908,gfx908,gfx908
|
||||
,,,
|
||||
FRAMEWORK SUPPORT,.. _framework-support-compatibility-matrix:,,
|
||||
:doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.7, 2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.4, 2.3, 2.2, 2.1, 2.0, 1.13"
|
||||
:doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.19.1, 2.18.1","2.18.1, 2.17.1, 2.16.2","2.17.0, 2.16.2, 2.15.1"
|
||||
:doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.6.0,0.4.35,0.4.31
|
||||
:doc:`verl <../compatibility/ml-compatibility/verl-compatibility>` [#verl_compat]_,N/A,N/A,N/A
|
||||
:doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>`,N/A,N/A,85f95ae
|
||||
:doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat]_,N/A,N/A,N/A
|
||||
:doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>`,N/A,N/A,0.7.0
|
||||
:doc:`Taichi <../compatibility/ml-compatibility/taichi-compatibility>` [#taichi_compat]_,N/A,N/A,N/A
|
||||
`ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.22.0,1.20.0,1.17.3
|
||||
,,,
|
||||
THIRD PARTY COMMS,.. _thirdpartycomms-support-compatibility-matrix:,,
|
||||
`UCC <https://github.com/ROCm/ucc>`_,>=1.4.0,>=1.3.0,>=1.3.0
|
||||
`UCX <https://github.com/ROCm/ucx>`_,>=1.17.0,>=1.15.0,>=1.15.0
|
||||
,,,
|
||||
THIRD PARTY ALGORITHM,.. _thirdpartyalgorithm-support-compatibility-matrix:,,
|
||||
Thrust,2.6.0,2.5.0,2.3.2
|
||||
CUB,2.6.0,2.5.0,2.3.2
|
||||
,,,
|
||||
KMD & USER SPACE [#kfd_support]_,.. _kfd-userspace-support-compatibility-matrix:,,
|
||||
:doc:`KMD versions <rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`,"30.10, 6.4.x, 6.3.x, 6.2.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x"
|
||||
,,,
|
||||
ML & COMPUTER VISION,.. _mllibs-support-compatibility-matrix:,,
|
||||
:doc:`Composable Kernel <composable_kernel:index>`,1.1.0,1.1.0,1.1.0
|
||||
:doc:`MIGraphX <amdmigraphx:index>`,2.13.0,2.12.0,2.11.0
|
||||
:doc:`MIOpen <miopen:index>`,3.5.0,3.4.0,3.3.0
|
||||
:doc:`MIVisionX <mivisionx:index>`,3.3.0,3.2.0,3.1.0
|
||||
:doc:`rocAL <rocal:index>`,2.3.0,2.2.0,2.1.0
|
||||
:doc:`rocDecode <rocdecode:index>`,1.0.0,0.10.0,0.8.0
|
||||
:doc:`rocJPEG <rocjpeg:index>`,1.1.0,0.8.0,0.6.0
|
||||
:doc:`rocPyDecode <rocpydecode:index>`,0.6.0,0.3.1,0.2.0
|
||||
:doc:`RPP <rpp:index>`,2.0.0,1.9.10,1.9.1
|
||||
,,,
|
||||
COMMUNICATION,.. _commlibs-support-compatibility-matrix:,,
|
||||
:doc:`RCCL <rccl:index>`,2.26.6,2.22.3,2.21.5
|
||||
:doc:`rocSHMEM <rocshmem:index>`,3.0.0,2.0.1,N/A
|
||||
,,,
|
||||
MATH LIBS,.. _mathlibs-support-compatibility-matrix:,,
|
||||
`half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0
|
||||
:doc:`hipBLAS <hipblas:index>`,3.0.0,2.4.0,2.3.0
|
||||
:doc:`hipBLASLt <hipblaslt:index>`,1.0.0,0.12.1,0.10.0
|
||||
:doc:`hipFFT <hipfft:index>`,1.0.20,1.0.18,1.0.17
|
||||
:doc:`hipfort <hipfort:index>`,0.7.0,0.6.0,0.5.0
|
||||
:doc:`hipRAND <hiprand:index>`,3.0.0,2.12.0,2.11.0
|
||||
:doc:`hipSOLVER <hipsolver:index>`,3.0.0,2.4.0,2.3.0
|
||||
:doc:`hipSPARSE <hipsparse:index>`,4.0.1,3.2.0,3.1.2
|
||||
:doc:`hipSPARSELt <hipsparselt:index>`,0.2.4,0.2.3,0.2.2
|
||||
:doc:`rocALUTION <rocalution:index>`,4.0.0,3.2.3,3.2.1
|
||||
:doc:`rocBLAS <rocblas:index>`,5.0.0,4.4.1,4.3.0
|
||||
:doc:`rocFFT <rocfft:index>`,1.0.34,1.0.32,1.0.31
|
||||
:doc:`rocRAND <rocrand:index>`,4.0.0,3.3.0,3.2.0
|
||||
:doc:`rocSOLVER <rocsolver:index>`,3.30.0,3.28.2,3.27.0
|
||||
:doc:`rocSPARSE <rocsparse:index>`,4.0.2,3.4.0,3.3.0
|
||||
:doc:`rocWMMA <rocwmma:index>`,2.0.0,1.7.0,1.6.0
|
||||
:doc:`Tensile <tensile:src/index>`,4.44.0,4.43.0,4.42.0
|
||||
,,,
|
||||
PRIMITIVES,.. _primitivelibs-support-compatibility-matrix:,,
|
||||
:doc:`hipCUB <hipcub:index>`,4.0.0,3.4.0,3.3.0
|
||||
:doc:`hipTensor <hiptensor:index>`,2.0.0,1.5.0,1.4.0
|
||||
:doc:`rocPRIM <rocprim:index>`,4.0.0,3.4.1,3.3.0
|
||||
:doc:`rocThrust <rocthrust:index>`,4.0.0,3.3.0,3.3.0
|
||||
,,,
|
||||
SUPPORT LIBS,,,
|
||||
`hipother <https://github.com/ROCm/hipother>`_,7.0.51830,6.4.43483,6.3.42131
|
||||
`rocm-core <https://github.com/ROCm/rocm-core>`_,7.0.0,6.4.3,6.3.0
|
||||
`ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr]_,N/A [#ROCT-rocr]_,N/A [#ROCT-rocr]_
|
||||
,,,
|
||||
SYSTEM MGMT TOOLS,.. _tools-support-compatibility-matrix:,,
|
||||
:doc:`AMD SMI <amdsmi:index>`,26.0.0,25.5.1,24.7.1
|
||||
:doc:`ROCm Data Center Tool <rdc:index>`,1.1.0,0.3.0,0.3.0
|
||||
:doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0
|
||||
:doc:`ROCm SMI <rocm_smi_lib:index>`,7.8.0,7.7.0,7.4.0
|
||||
:doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.2.0,1.1.0,1.1.0
|
||||
,,,
|
||||
PERFORMANCE TOOLS,,,
|
||||
:doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,2.6.0,1.4.0,1.4.0
|
||||
:doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.2.3,3.1.1,3.0.0
|
||||
:doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.1.0,1.0.2,0.1.0
|
||||
:doc:`ROCProfiler <rocprofiler:index>`,2.0.70000,2.0.60403,2.0.60300
|
||||
:doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,1.0.0,0.6.0,0.5.0
|
||||
:doc:`ROCTracer <roctracer:index>`,4.1.70000,4.1.60403,4.1.60300
|
||||
,,,
|
||||
DEVELOPMENT TOOLS,,,
|
||||
:doc:`HIPIFY <hipify:index>`,20.0.0,19.0.0,18.0.0.24455
|
||||
:doc:`ROCm CMake <rocmcmakebuildtools:index>`,0.14.0,0.14.0,0.14.0
|
||||
:doc:`ROCdbgapi <rocdbgapi:index>`,0.77.3,0.77.2,0.77.0
|
||||
:doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,16.3.0,15.2.0,15.2.0
|
||||
`rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.5.0,0.4.0,0.4.0
|
||||
:doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.1.0,2.0.4,2.0.3
|
||||
,,,
|
||||
COMPILERS,.. _compilers-support-compatibility-matrix:,,
|
||||
:doc:`hipCC <hipcc:index>`,1.1.1,1.1.1,1.1.1
|
||||
`Flang <https://github.com/ROCm/flang>`_,20.0.0.25314,19.0.0.25224,18.0.0.24455
|
||||
:doc:`llvm-project <llvm-project:index>`,20.0.0.25314,19.0.0.25224,18.0.0.24491
|
||||
`OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,20.0.0.25314,19.0.0.25224,18.0.0.24491
|
||||
,,,
|
||||
RUNTIMES,.. _runtime-support-compatibility-matrix:,,
|
||||
:doc:`AMD CLR <hip:understand/amd_clr>`,7.0.51830,6.4.43484,6.3.42131
|
||||
:doc:`HIP <hip:index>`,7.0.51830,6.4.43484,6.3.42131
|
||||
`OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0,2.0.0
|
||||
:doc:`ROCr Runtime <rocr-runtime:index>`,1.18.0,1.15.0,1.14.0
|
||||
|
||||
.. rubric:: Footnotes
|
||||
|
||||
.. [#rhel-700] RHEL 8.10 is only supported on AMD Instinct MI300X, MI300A, MI250X, MI250, MI210, and MI100 GPUs.
|
||||
.. [#ol-700-mi300x] **For ROCm 7.0** - Oracle Linux 9 is supported only on AMD Instinct MI300X, MI350X, and MI355X. Oracle Linux 8 is only supported on AMD Instinct MI300X.
|
||||
.. [#ol-mi300x] **Prior ROCm 7.0** - Oracle Linux is supported only on AMD Instinct MI300X.
|
||||
.. [#sles-db-700] SLES 15 SP7 and Debian 12 are only supported on AMD Instinct MI300X, MI300A, MI250X, MI250, and MI210 GPUs.
|
||||
.. [#az-mi300x] Starting ROCm 6.4.0, Azure Linux 3.0 is supported only on AMD Instinct MI300X and AMD Radeon PRO V710.
|
||||
.. [#rl-700] Rocky Linux 9 is only supported on AMD Instinct MI300X and MI300A GPUs.
|
||||
.. [#single-node] **Prior to ROCm 7.0.0** - Debian 12 is supported only on AMD Instinct MI300X for single-node functionality.
|
||||
.. [#RDNA-OS] Radeon AI PRO R9700, Radeon RX 9070 XT (gfx1201), Radeon RX 9060 XT (gfx1200), Radeon PRO W7700 (gfx1101), and Radeon RX 7800 XT (gfx1101) are supported only on Ubuntu 24.04.2, Ubuntu 22.04.5, RHEL 9.6, and RHEL 9.4.
|
||||
.. [#7700XT-OS] Radeon RX 7700 XT (gfx1101) is supported only on Ubuntu 24.04.2 and RHEL 9.6.
|
||||
.. [#kfd_support] As of ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The supported user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and kernel-space support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
|
||||
.. [#ROCT-rocr] Starting from ROCm 6.3.0, the ROCT Thunk Interface is included as part of the ROCr runtime package.
|
||||
|
||||
|
||||
.. _OS-kernel-versions:
|
||||
|
||||
Operating systems, kernel and Glibc versions
|
||||
*********************************************
|
||||
|
||||
Use this lookup table to confirm which operating system and kernel versions are supported with ROCm.
|
||||
|
||||
.. csv-table::
|
||||
:header: "OS", "Version", "Kernel", "Glibc"
|
||||
:widths: 40, 20, 30, 20
|
||||
:stub-columns: 1
|
||||
|
||||
`Ubuntu <https://ubuntu.com/about/release-cycle#ubuntu-kernel-release-cycle>`_, 24.04.3, "6.8 [GA], 6.14 [HWE]", 2.39
|
||||
,,
|
||||
`Ubuntu <https://ubuntu.com/about/release-cycle#ubuntu-kernel-release-cycle>`_, 24.04.2, "6.8 [GA], 6.11 [HWE]", 2.39
|
||||
,,
|
||||
`Ubuntu <https://ubuntu.com/about/release-cycle#ubuntu-kernel-release-cycle>`_, 22.04.5, "5.15 [GA], 6.8 [HWE]", 2.35
|
||||
,,
|
||||
`Red Hat Enterprise Linux (RHEL 9) <https://access.redhat.com/articles/3078#RHEL9>`_, 9.6, 5.14.0-570, 2.34
|
||||
,9.5, 5.14+, 2.34
|
||||
,9.4, 5.14.0-427, 2.34
|
||||
,,
|
||||
`Red Hat Enterprise Linux (RHEL 8) <https://access.redhat.com/articles/3078#RHEL8>`_, 8.10, 4.18.0-553, 2.28
|
||||
,,
|
||||
`SUSE Linux Enterprise Server (SLES) <https://www.suse.com/support/kb/doc/?id=000019587#SLE15SP4>`_, 15 SP7, 6.4.0-150700.51, 2.38
|
||||
,15 SP6, "6.5.0+, 6.4.0", 2.38
|
||||
,15 SP5, 5.14.21, 2.31
|
||||
,,
|
||||
`Rocky Linux <https://wiki.rockylinux.org/rocky/version/>`_, 9, 5.14.0-570, 2.34
|
||||
,,
|
||||
`Oracle Linux <https://blogs.oracle.com/scoter/post/oracle-linux-and-unbreakable-enterprise-kernel-uek-releases>`_, 9, 6.12.0 (UEK), 2.34
|
||||
,8, 5.15.0 (UEK), 2.28
|
||||
,,
|
||||
`Debian <https://www.debian.org/download>`_,12, 6.1.0, 2.36
|
||||
,,
|
||||
`Azure Linux <https://techcommunity.microsoft.com/blog/linuxandopensourceblog/azure-linux-3-0-now-in-preview-on-azure-kubernetes-service-v1-31/4287229>`_,3.0, 6.6.92, 2.38
|
||||
,,
|
||||
|
||||
.. note::
|
||||
|
||||
* See `Red Hat Enterprise Linux Release Dates <https://access.redhat.com/articles/3078>`_ to learn about the specific kernel versions supported on Red Hat Enterprise Linux (RHEL).
|
||||
* See `List of SUSE Linux Enterprise Server kernel <https://www.suse.com/support/kb/doc/?id=000019587>`_ to learn about the specific kernel version supported on SUSE Linux Enterprise Server (SLES).
|
||||
..
Footnote labels and ref anchors in the historical tables below should have "-past-60" appended, to
differentiate them from the footnote references in the latest compatibility matrix above and to make
find-and-replace straightforward.
An easy way to work is to download the historical .csv file and open it in Excel. When the content is
ready, delete the columns you don't need to build the current compatibility matrix used in the table
above, then find and replace all instances of "-past-60" to make it ready for that table.
|
||||
|
||||
|
||||
.. _past-rocm-compatibility-matrix:
|
||||
|
||||
Past versions of ROCm compatibility matrix
|
||||
***************************************************
|
||||
|
||||
Expand for a full historical view of:
|
||||
|
||||
.. dropdown:: ROCm 6.0 - Present
|
||||
|
||||
You can `download the entire .csv <../downloads/compatibility-matrix-historical-6.0.csv>`_ for offline reference.
|
||||
|
||||
.. csv-table::
|
||||
:file: compatibility-matrix-historical-6.0.csv
|
||||
:header-rows: 1
|
||||
:stub-columns: 1
|
||||
|
||||
.. rubric:: Footnotes
|
||||
|
||||
.. [#ol-700-mi300x-past-60] **For ROCm 7.0.0** - Oracle Linux 9 is supported only on AMD Instinct MI300X, MI350X, and MI355X. Oracle Linux 8 is only supported on AMD Instinct MI300X.
|
||||
.. [#mi300x-past-60] **Prior to ROCm 7.0.0** - Oracle Linux is supported only on AMD Instinct MI300X.
|
||||
.. [#single-node-past-60] **Prior to ROCm 7.0.0** - Debian 12 is supported only on AMD Instinct MI300X for single-node functionality.
|
||||
.. [#az-mi300x-past-60] Starting from ROCm 6.4.0, Azure Linux 3.0 is supported only on AMD Instinct MI300X and AMD Radeon PRO V710.
|
||||
.. [#az-mi300x-630-past-60] **Prior to ROCm 6.4.0** - Azure Linux 3.0 is supported only on AMD Instinct MI300X.
|
||||
.. [#RDNA-OS-past-60] Radeon AI PRO R9700, Radeon RX 9070 XT (gfx1201), Radeon RX 9060 XT (gfx1200), Radeon PRO W7700 (gfx1101), and Radeon RX 7800 XT (gfx1101) are supported only on Ubuntu 24.04.2, Ubuntu 22.04.5, RHEL 9.6, and RHEL 9.4.
|
||||
.. [#7700XT-OS-past-60] Radeon RX 7700 XT (gfx1101) is supported only on Ubuntu 24.04.2 and RHEL 9.6.
|
||||
.. [#mi300_624-past-60] **For ROCm 6.2.4** - MI300X (gfx942) is supported on listed operating systems *except* Ubuntu 22.04.5 [6.8 HWE] and Ubuntu 22.04.4 [6.5 HWE].
|
||||
.. [#mi300_622-past-60] **For ROCm 6.2.2** - MI300X (gfx942) is supported on listed operating systems *except* Ubuntu 22.04.5 [6.8 HWE] and Ubuntu 22.04.4 [6.5 HWE].
|
||||
.. [#mi300_621-past-60] **For ROCm 6.2.1** - MI300X (gfx942) is supported on listed operating systems *except* Ubuntu 22.04.5 [6.8 HWE] and Ubuntu 22.04.4 [6.5 HWE].
|
||||
.. [#mi300_620-past-60] **For ROCm 6.2.0** - MI300X (gfx942) is supported on listed operating systems *except* Ubuntu 22.04.5 [6.8 HWE] and Ubuntu 22.04.4 [6.5 HWE].
|
||||
.. [#mi300_612-past-60] **For ROCm 6.1.2** - MI300A (gfx942) is supported on Ubuntu 22.04.4, RHEL 9.4, RHEL 9.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.4 and Oracle Linux.
|
||||
.. [#mi300_611-past-60] **For ROCm 6.1.1** - MI300A (gfx942) is supported on Ubuntu 22.04.4, RHEL 9.4, RHEL 9.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.4 and Oracle Linux.
|
||||
.. [#mi300_610-past-60] **For ROCm 6.1.0** - MI300A (gfx942) is supported on Ubuntu 22.04.4, RHEL 9.4, RHEL 9.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.4.
|
||||
.. [#mi300_602-past-60] **For ROCm 6.0.2** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.3.
|
||||
.. [#mi300_600-past-60] **For ROCm 6.0.0** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.3.
|
||||
.. [#verl_compat] verl is only supported on ROCm 6.2.0.
|
||||
.. [#stanford-megatron-lm_compat] Stanford Megatron-LM is only supported on ROCm 6.3.0.
|
||||
.. [#dgl_compat] DGL is only supported on ROCm 6.4.0.
|
||||
.. [#megablocks_compat] Megablocks is only supported on ROCm 6.3.0.
|
||||
.. [#taichi_compat] Taichi is only supported on ROCm 6.3.2.
|
||||
.. [#ray_compat] Ray is only supported on ROCm 6.4.1.
|
||||
.. [#llama-cpp_compat] llama.cpp is only supported on ROCm 6.4.0.
|
||||
.. [#kfd_support-past-60] As of ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The tested user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and kernel-space support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
|
||||
.. [#ROCT-rocr-past-60] Starting from ROCm 6.3.0, the ROCT Thunk Interface is included as part of the ROCr runtime package.
|
||||
|
||||
@@ -1,255 +0,0 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: Deep Graph Library (DGL) compatibility
|
||||
:keywords: GPU, DGL compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
********************************************************************************
|
||||
DGL compatibility
|
||||
********************************************************************************
|
||||
|
||||
Deep Graph Library `(DGL) <https://www.dgl.ai/>`_ is an easy-to-use, high-performance, and scalable
Python package for deep learning on graphs. DGL is framework agnostic, meaning
that if a deep graph model is a component of an end-to-end application, the rest of
the logic can be implemented in PyTorch.
|
||||
|
||||
* ROCm support for DGL is hosted in the `https://github.com/ROCm/dgl <https://github.com/ROCm/dgl>`_ repository.
|
||||
* Due to independent compatibility considerations, this location differs from the `https://github.com/dmlc/dgl <https://github.com/dmlc/dgl>`_ upstream repository.
|
||||
* Use the prebuilt :ref:`Docker images <dgl-docker-compat>` with DGL, PyTorch, and ROCm preinstalled.
|
||||
* See the :doc:`ROCm DGL installation guide <rocm-install-on-linux:install/3rd-party/dgl-install>`
|
||||
to install and get started.
|
||||
|
||||
|
||||
Supported devices
|
||||
================================================================================
|
||||
|
||||
- **Officially Supported**: TF32 with AMD Instinct MI300X (through hipBLASLt)
|
||||
- **Partially Supported**: TF32 with AMD Instinct MI250X
|
||||
|
||||
|
||||
.. _dgl-recommendations:
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
DGL can be used for graph learning and for building popular graph models such as
GAT, GCN, and GraphSAGE (a minimal example follows the list below). These models support a variety of use cases, such as:
|
||||
|
||||
- Recommender systems
|
||||
- Network Optimization and Analysis
|
||||
- 1D (Temporal) and 2D (Image) Classification
|
||||
- Drug Discovery
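As a small illustration of the kind of model building listed above, the following is a minimal sketch
of a single ``GraphConv`` layer on a toy graph. The graph, feature sizes, and layer width are
hypothetical, and the PyTorch backend is assumed.

.. code-block:: python

   import dgl
   import torch
   from dgl.nn import GraphConv

   # 4-node toy graph with a few directed edges
   g = dgl.graph((torch.tensor([0, 1, 2, 3]), torch.tensor([1, 2, 3, 0])))
   g = dgl.add_self_loop(g)  # GraphConv requires no zero in-degree nodes

   feat = torch.randn(g.num_nodes(), 8)  # 8 hypothetical input features per node
   conv = GraphConv(8, 4)                # project to 4 output features
   out = conv(g, feat)
   print(out.shape)                      # torch.Size([4, 4])

A full GCN or GAT model stacks layers like this and trains them with a standard PyTorch loop.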
|
||||
|
||||
Multiple DGL use cases have been tested and verified.
One recommended example is a drug discovery pipeline using the ``SE3Transformer``.
Refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`_,
where you can search for DGL examples and best practices to optimize your training workflows on AMD GPUs.
|
||||
|
||||
Coverage includes:
|
||||
|
||||
- Single-GPU training/inference
|
||||
- Multi-GPU training
|
||||
|
||||
|
||||
.. _dgl-docker-compat:
|
||||
|
||||
Docker image compatibility
|
||||
================================================================================
|
||||
|
||||
.. |docker-icon| raw:: html
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes `DGL images <https://hub.docker.com/r/rocm/dgl>`_
with ROCm and PyTorch backends on Docker Hub. The following Docker image tags and associated
inventories were tested on `ROCm 6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`_.
Click the |docker-icon| icon to view the image on Docker Hub.
|
||||
|
||||
.. list-table:: DGL Docker image components
|
||||
:header-rows: 1
|
||||
:class: docker-image-compatibility
|
||||
|
||||
* - Docker
|
||||
- DGL
|
||||
- PyTorch
|
||||
- Ubuntu
|
||||
- Python
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu24.04_py3.12_pytorch_release_2.6.0/images/sha256-8ce2c3bcfaa137ab94a75f9e2ea711894748980f57417739138402a542dd5564"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`_
|
||||
- `2.6.0 <https://github.com/ROCm/pytorch/tree/release/2.6>`_
|
||||
- 24.04
|
||||
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu24.04_py3.12_pytorch_release_2.4.1/images/sha256-cf1683283b8eeda867b690229c8091c5bbf1edb9f52e8fb3da437c49a612ebe4"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`_
|
||||
- `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
|
||||
- 24.04
|
||||
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
|
||||
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.4.1/images/sha256-4834f178c3614e2d09e89e32041db8984c456d45dfd20286e377ca8635686554"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`_
|
||||
- `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
|
||||
- 22.04
|
||||
- `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
|
||||
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.3.0/images/sha256-88740a2c8ab4084b42b10c3c6ba984cab33dd3a044f479c6d7618e2b2cb05e69"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`_
|
||||
- `2.3.0 <https://github.com/ROCm/pytorch/tree/release/2.3>`_
|
||||
- 22.04
|
||||
- `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
|
||||
|
||||
|
||||
Key ROCm libraries for DGL
|
||||
================================================================================
|
||||
|
||||
DGL on ROCm depends on specific ROCm libraries that affect its features and performance.
Using the DGL Docker container, or building DGL with the provided Dockerfile or a ROCm base image, is recommended.
If you prefer to build it yourself, ensure the following dependencies are installed:
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - ROCm library
|
||||
- Version
|
||||
- Purpose
|
||||
* - `Composable Kernel <https://github.com/ROCm/composable_kernel>`_
|
||||
- :version-ref:`"Composable Kernel" rocm_version`
|
||||
- Enables faster execution of core operations like matrix multiplication
|
||||
(GEMM), convolutions and transformations.
|
||||
* - `hipBLAS <https://github.com/ROCm/hipBLAS>`_
|
||||
- :version-ref:`hipBLAS rocm_version`
|
||||
- Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
|
||||
matrix and vector operations.
|
||||
* - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`_
|
||||
- :version-ref:`hipBLASLt rocm_version`
|
||||
- hipBLASLt is an extension of the hipBLAS library, providing additional
|
||||
features like epilogues fused into the matrix multiplication kernel or
|
||||
use of integer tensor cores.
|
||||
* - `hipCUB <https://github.com/ROCm/hipCUB>`_
|
||||
- :version-ref:`hipCUB rocm_version`
|
||||
- Provides a C++ template library for parallel algorithms for reduction,
|
||||
scan, sort and select.
|
||||
* - `hipFFT <https://github.com/ROCm/hipFFT>`_
|
||||
- :version-ref:`hipFFT rocm_version`
|
||||
- Provides GPU-accelerated Fast Fourier Transform (FFT) operations.
|
||||
* - `hipRAND <https://github.com/ROCm/hipRAND>`_
|
||||
- :version-ref:`hipRAND rocm_version`
|
||||
- Provides fast random number generation for GPUs.
|
||||
* - `hipSOLVER <https://github.com/ROCm/hipSOLVER>`_
|
||||
- :version-ref:`hipSOLVER rocm_version`
|
||||
- Provides GPU-accelerated solvers for linear systems, eigenvalues, and
|
||||
singular value decompositions (SVD).
|
||||
* - `hipSPARSE <https://github.com/ROCm/hipSPARSE>`_
|
||||
- :version-ref:`hipSPARSE rocm_version`
|
||||
- Accelerates operations on sparse matrices, such as sparse matrix-vector
|
||||
or matrix-matrix products.
|
||||
* - `hipSPARSELt <https://github.com/ROCm/hipSPARSELt>`_
|
||||
- :version-ref:`hipSPARSELt rocm_version`
|
||||
- Accelerates operations on sparse matrices, such as sparse matrix-vector
|
||||
or matrix-matrix products.
|
||||
* - `hipTensor <https://github.com/ROCm/hipTensor>`_
|
||||
- :version-ref:`hipTensor rocm_version`
|
||||
- Optimizes for high-performance tensor operations, such as contractions.
|
||||
* - `MIOpen <https://github.com/ROCm/MIOpen>`_
|
||||
- :version-ref:`MIOpen rocm_version`
|
||||
- Optimizes deep learning primitives such as convolutions, pooling,
|
||||
normalization, and activation functions.
|
||||
* - `MIGraphX <https://github.com/ROCm/AMDMIGraphX>`_
|
||||
- :version-ref:`MIGraphX rocm_version`
|
||||
- Adds graph-level optimizations, support for ONNX models and mixed precision,
and enables Ahead-of-Time (AOT) compilation.
|
||||
* - `MIVisionX <https://github.com/ROCm/MIVisionX>`_
|
||||
- :version-ref:`MIVisionX rocm_version`
|
||||
- Optimizes acceleration for computer vision and AI workloads like
|
||||
preprocessing, augmentation, and inferencing.
|
||||
* - `rocAL <https://github.com/ROCm/rocAL>`_
|
||||
- :version-ref:`rocAL rocm_version`
|
||||
- Accelerates the data pipeline by offloading intensive preprocessing and
|
||||
augmentation tasks. rocAL is part of MIVisionX.
|
||||
* - `RCCL <https://github.com/ROCm/rccl>`_
|
||||
- :version-ref:`RCCL rocm_version`
|
||||
- Optimizes for multi-GPU communication for operations like AllReduce and
|
||||
Broadcast.
|
||||
* - `rocDecode <https://github.com/ROCm/rocDecode>`_
|
||||
- :version-ref:`rocDecode rocm_version`
|
||||
- Provides hardware-accelerated data decoding capabilities, particularly
|
||||
for image, video, and other dataset formats.
|
||||
* - `rocJPEG <https://github.com/ROCm/rocJPEG>`_
|
||||
- :version-ref:`rocJPEG rocm_version`
|
||||
- Provides hardware-accelerated JPEG image decoding and encoding.
|
||||
* - `RPP <https://github.com/ROCm/RPP>`_
|
||||
- :version-ref:`RPP rocm_version`
|
||||
- Speeds up data augmentation, transformation, and other preprocessing steps.
|
||||
* - `rocThrust <https://github.com/ROCm/rocThrust>`_
|
||||
- :version-ref:`rocThrust rocm_version`
|
||||
- Provides a C++ template library for parallel algorithms like sorting,
|
||||
reduction, and scanning.
|
||||
* - `rocWMMA <https://github.com/ROCm/rocWMMA>`_
|
||||
- :version-ref:`rocWMMA rocm_version`
|
||||
- Accelerates warp-level matrix-multiply and matrix-accumulate to speed up matrix
|
||||
multiplication (GEMM) and accumulation operations with mixed precision
|
||||
support.
|
||||
|
||||
|
||||
Supported features
|
||||
================================================================================
|
||||
|
||||
Many functions and methods available in upstream DGL are also supported in DGL on ROCm.
Instead of listing them all, support is grouped into the following categories to provide a general overview.
|
||||
|
||||
* DGL Base
|
||||
* DGL Backend
|
||||
* DGL Data
|
||||
* DGL Dataloading
|
||||
* DGL DGLGraph
|
||||
* DGL Function
|
||||
* DGL Ops
|
||||
* DGL Sampling
|
||||
* DGL Transforms
|
||||
* DGL Utils
|
||||
* DGL Distributed
|
||||
* DGL Geometry
|
||||
* DGL Mpops
|
||||
* DGL NN
|
||||
* DGL Optim
|
||||
* DGL Sparse
|
||||
|
||||
|
||||
Unsupported features
|
||||
================================================================================
|
||||
|
||||
* Graphbolt
|
||||
* Partial TF32 support (MI250X only)
* Kineto/ROCTracer integration
|
||||
|
||||
|
||||
Unsupported functions
|
||||
================================================================================
|
||||
|
||||
* ``more_nnz``
|
||||
* ``format``
|
||||
* ``multiprocess_sparse_adam_state_dict``
|
||||
* ``record_stream_ndarray``
|
||||
* ``half_spmm``
|
||||
* ``segment_mm``
|
||||
* ``gather_mm_idx_b``
|
||||
* ``pgexplainer``
|
||||
* ``sample_labors_prob``
|
||||
* ``sample_labors_noprob``
|
||||
@@ -1,363 +0,0 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: JAX compatibility
|
||||
:keywords: GPU, JAX compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
*******************************************************************************
|
||||
JAX compatibility
|
||||
*******************************************************************************
|
||||
|
||||
JAX provides a NumPy-like API, which combines automatic differentiation and the
|
||||
Accelerated Linear Algebra (XLA) compiler to achieve high-performance machine
|
||||
learning at scale.
|
||||
|
||||
JAX uses composable transformations of Python and NumPy through just-in-time
|
||||
(JIT) compilation, automatic vectorization, and parallelization. To learn about
|
||||
JAX, including profiling and optimizations, see the official `JAX documentation
|
||||
<https://jax.readthedocs.io/en/latest/notebooks/quickstart.html>`_.
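The following is a minimal sketch of these composable transformations, using a toy quadratic loss
(the function and shapes are illustrative only); the same code should run unchanged on a ROCm-enabled
``jaxlib``.

.. code-block:: python

   import jax
   import jax.numpy as jnp

   def loss(w, x):
       # toy quadratic "model" kept small so jit/grad/vmap stay readable
       return jnp.sum((x @ w) ** 2)

   grad_loss = jax.jit(jax.grad(loss))          # JIT-compile the gradient with XLA
   batched = jax.vmap(loss, in_axes=(None, 0))  # vectorize the loss over a batch

   w = jnp.ones((3,))
   x = jnp.arange(12.0).reshape(4, 3)
   print(grad_loss(w, x[0]))
   print(batched(w, x))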
|
||||
|
||||
ROCm support for JAX is upstreamed, and users can build the official source code
|
||||
with ROCm support:
|
||||
|
||||
- ROCm JAX release:
|
||||
|
||||
- Offers AMD-validated and community :ref:`Docker images <jax-docker-compat>`
|
||||
with ROCm and JAX preinstalled.
|
||||
|
||||
- ROCm JAX repository: `ROCm/rocm-jax <https://github.com/ROCm/rocm-jax>`_
|
||||
|
||||
- See the :doc:`ROCm JAX installation guide <rocm-install-on-linux:install/3rd-party/jax-install>`
|
||||
to get started.
|
||||
|
||||
- Official JAX release:
|
||||
|
||||
- Official JAX repository: `jax-ml/jax <https://github.com/jax-ml/jax>`_
|
||||
|
||||
- See the `AMD GPU (Linux) installation section
|
||||
<https://jax.readthedocs.io/en/latest/installation.html#amd-gpu-linux>`_ in
|
||||
the JAX documentation.
|
||||
|
||||
.. note::
|
||||
|
||||
AMD releases official `ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax>`_
|
||||
quarterly alongside new ROCm releases. These images undergo full AMD testing.
|
||||
`Community ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax-community>`_
|
||||
follow upstream JAX releases and use the latest available ROCm version.
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
* The `nanoGPT in JAX <https://rocm.blogs.amd.com/artificial-intelligence/nanoGPT-JAX/README.html>`_
blog explores the implementation and training of a Generative Pre-trained
Transformer (GPT) model in JAX, inspired by Andrej Karpathy's PyTorch-based
nanoGPT. It compares how essential GPT components, such as self-attention
mechanisms and optimizers, are realized in PyTorch and JAX, and highlights
JAX's unique features.
|
||||
|
||||
* The `Optimize GPT Training: Enabling Mixed Precision Training in JAX using
|
||||
ROCm on AMD GPUs <https://rocm.blogs.amd.com/artificial-intelligence/jax-mixed-precision/README.html>`_
|
||||
blog post provides a comprehensive guide on enhancing the training efficiency
|
||||
of GPT models by implementing mixed precision techniques in JAX, specifically
|
||||
tailored for AMD GPUs utilizing the ROCm platform.
|
||||
|
||||
* The `Supercharging JAX with Triton Kernels on AMD GPUs <https://rocm.blogs.amd.com/artificial-intelligence/jax-triton/README.html>`_
|
||||
blog demonstrates how to develop a custom fused dropout-activation kernel for
|
||||
matrices using Triton, integrate it with JAX, and benchmark its performance
|
||||
using ROCm.
|
||||
|
||||
* The `Distributed fine-tuning with JAX on AMD GPUs <https://rocm.blogs.amd.com/artificial-intelligence/distributed-sft-jax/README.html>`_
blog post outlines the process of fine-tuning a Bidirectional Encoder Representations
from Transformers (BERT)-based large language model (LLM) using JAX for a text
classification task. The blog post discusses techniques for parallelizing the
fine-tuning across multiple AMD GPUs and assesses the model's performance on a
holdout dataset. During the fine-tuning, a BERT-base-cased transformer model
and the General Language Understanding Evaluation (GLUE) benchmark dataset were
used on a multi-GPU setup.
|
||||
|
||||
* The `MI300X workload optimization guide <https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html>`_
|
||||
provides detailed guidance on optimizing workloads for the AMD Instinct MI300X
|
||||
accelerator using ROCm. The page is aimed at helping users achieve optimal
|
||||
performance for deep learning and other high-performance computing tasks on
|
||||
the MI300X GPU.
|
||||
|
||||
For more use cases and recommendations, see `ROCm JAX blog posts <https://rocm.blogs.amd.com/blog/tag/jax.html>`_.
|
||||
|
||||
.. _jax-docker-compat:
|
||||
|
||||
Docker image compatibility
|
||||
================================================================================
|
||||
|
||||
.. |docker-icon| raw:: html
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes ready-made `ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax>`_
|
||||
with ROCm backends on Docker Hub. The following Docker image tags and
|
||||
associated inventories represent the latest JAX version from the official Docker Hub and are validated for
|
||||
`ROCm 6.4.2 <https://repo.radeon.com/rocm/apt/6.4.2/>`_. Click the |docker-icon|
|
||||
icon to view the image on Docker Hub.
|
||||
|
||||
.. list-table:: JAX Docker image components
|
||||
:header-rows: 1
|
||||
|
||||
* - Docker image
|
||||
- JAX
|
||||
- Linux
|
||||
- Python
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/jax/rocm6.4.2-jax0.4.35-py3.12/images/sha256-8918fa806a172c1a10eb2f57131eb31b5d7c8fa1656b8729fe7d3d736112de83"><i class="fab fa-docker fa-lg"></i> rocm/jax</a>
|
||||
|
||||
- `0.4.35 <https://github.com/ROCm/jax/releases/tag/rocm-jax-v0.4.35>`_
|
||||
- Ubuntu 24.04
|
||||
- `3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/jax/rocm6.4.2-jax0.4.35-py3.10/images/sha256-a394be13c67b7fc602216abee51233afd4b6cb7adaa57ca97e688fba82f9ad79"><i class="fab fa-docker fa-lg"></i> rocm/jax</a>
|
||||
|
||||
- `0.4.35 <https://github.com/ROCm/jax/releases/tag/rocm-jax-v0.4.35>`_
|
||||
- Ubuntu 22.04
|
||||
- `3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
|
||||
|
||||
AMD publishes `Community ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax-community>`_
|
||||
with ROCm backends on Docker Hub. The following Docker image tags and
|
||||
associated inventories are tested for `ROCm 6.3.2 <https://repo.radeon.com/rocm/apt/6.3.2/>`_.
|
||||
|
||||
.. list-table:: JAX community Docker image components
|
||||
:header-rows: 1
|
||||
|
||||
* - Docker image
|
||||
- JAX
|
||||
- Linux
|
||||
- Python
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/jax-community/rocm6.3.2-jax0.5.0-py3.12.8/images/sha256-25dfaa0183e274bd0a3554a309af3249c6f16a1793226cb5373f418e39d3146a"><i class="fab fa-docker fa-lg"></i> rocm/jax-community</a>
|
||||
|
||||
- `0.5.0 <https://github.com/ROCm/jax/releases/tag/rocm-jax-v0.5.0>`_
|
||||
- Ubuntu 22.04
|
||||
- `3.12.8 <https://www.python.org/downloads/release/python-3128/>`_
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/jax-community/rocm6.3.2-jax0.5.0-py3.11.11/images/sha256-ff9baeca9067d13e6c279c911e5a9e5beed0817d24fafd424367cc3d5bd381d7"><i class="fab fa-docker fa-lg"></i> rocm/jax-community</a>
|
||||
|
||||
- `0.5.0 <https://github.com/ROCm/jax/releases/tag/rocm-jax-v0.5.0>`_
|
||||
- Ubuntu 22.04
|
||||
- `3.11.11 <https://www.python.org/downloads/release/python-31111/>`_
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/jax-community/rocm6.3.2-jax0.5.0-py3.10.16/images/sha256-8bab484be1713655f74da51a191ed824bb9d03db1104fd63530a1ac3c37cf7b1"><i class="fab fa-docker fa-lg"></i> rocm/jax-community</a>
|
||||
|
||||
- `0.5.0 <https://github.com/ROCm/jax/releases/tag/rocm-jax-v0.5.0>`_
|
||||
- Ubuntu 22.04
|
||||
- `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
|
||||
|
||||
.. _key_rocm_libraries:
|
||||
|
||||
Key ROCm libraries for JAX
|
||||
================================================================================
|
||||
|
||||
The following ROCm libraries represent potential targets that could be utilized
|
||||
by JAX on ROCm for various computational tasks. The actual libraries used will
|
||||
depend on the specific implementation and operations performed.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - ROCm library
|
||||
- Version
|
||||
- Purpose
|
||||
* - `hipBLAS <https://github.com/ROCm/hipBLAS>`_
|
||||
- :version-ref:`hipBLAS rocm_version`
|
||||
- Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
|
||||
matrix and vector operations.
|
||||
* - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`_
|
||||
- :version-ref:`hipBLASLt rocm_version`
|
||||
- hipBLASLt is an extension of hipBLAS, providing additional
|
||||
features like epilogues fused into the matrix multiplication kernel or
|
||||
use of integer tensor cores.
|
||||
* - `hipCUB <https://github.com/ROCm/hipCUB>`_
|
||||
- :version-ref:`hipCUB rocm_version`
|
||||
- Provides a C++ template library for parallel algorithms for reduction,
|
||||
scan, sort and select.
|
||||
* - `hipFFT <https://github.com/ROCm/hipFFT>`_
|
||||
- :version-ref:`hipFFT rocm_version`
|
||||
- Provides GPU-accelerated Fast Fourier Transform (FFT) operations.
|
||||
* - `hipRAND <https://github.com/ROCm/hipRAND>`_
|
||||
- :version-ref:`hipRAND rocm_version`
|
||||
- Provides fast random number generation for GPUs.
|
||||
* - `hipSOLVER <https://github.com/ROCm/hipSOLVER>`_
|
||||
- :version-ref:`hipSOLVER rocm_version`
|
||||
- Provides GPU-accelerated solvers for linear systems, eigenvalues, and
|
||||
singular value decompositions (SVD).
|
||||
* - `hipSPARSE <https://github.com/ROCm/hipSPARSE>`_
|
||||
- :version-ref:`hipSPARSE rocm_version`
|
||||
- Accelerates operations on sparse matrices, such as sparse matrix-vector
|
||||
or matrix-matrix products.
|
||||
* - `hipSPARSELt <https://github.com/ROCm/hipSPARSELt>`_
|
||||
- :version-ref:`hipSPARSELt rocm_version`
|
||||
- Accelerates operations on sparse matrices, such as sparse matrix-vector
|
||||
or matrix-matrix products.
|
||||
* - `MIOpen <https://github.com/ROCm/MIOpen>`_
|
||||
- :version-ref:`MIOpen rocm_version`
|
||||
- Optimized for deep learning primitives such as convolutions, pooling,
|
||||
normalization, and activation functions.
|
||||
* - `RCCL <https://github.com/ROCm/rccl>`_
|
||||
- :version-ref:`RCCL rocm_version`
|
||||
- Optimized for multi-GPU communication for operations like all-reduce,
|
||||
broadcast, and scatter.
|
||||
* - `rocThrust <https://github.com/ROCm/rocThrust>`_
|
||||
- :version-ref:`rocThrust rocm_version`
|
||||
- Provides a C++ template library for parallel algorithms like sorting,
|
||||
reduction, and scanning.
|
||||
|
||||
.. note::
|
||||
|
||||
This table shows ROCm libraries that could potentially be utilized by JAX. Not
|
||||
all libraries may be used in every configuration, and the actual library usage
|
||||
will depend on the specific operations and implementation details.
|
||||
|
||||
Supported data types and modules
|
||||
===============================================================================
|
||||
|
||||
The following tables list the supported public JAX API data types and modules.
|
||||
|
||||
Supported data types
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
ROCm supports all the JAX data types of the `jax.dtypes <https://docs.jax.dev/en/latest/jax.dtypes.html>`_
module, `jax.numpy.dtype <https://docs.jax.dev/en/latest/_autosummary/jax.numpy.dtype.html>`_,
and the `default dtypes <https://docs.jax.dev/en/latest/default_dtypes.html>`_.
The JAX data types supported by ROCm are collected in the following table, and a short example of
enabling the 64-bit types follows it.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Data type
|
||||
- Description
|
||||
|
||||
* - ``bfloat16``
|
||||
- 16-bit bfloat (brain floating point).
|
||||
|
||||
* - ``bool``
|
||||
- Boolean.
|
||||
|
||||
* - ``complex128``
|
||||
- 128-bit complex.
|
||||
|
||||
* - ``complex64``
|
||||
- 64-bit complex.
|
||||
|
||||
* - ``float16``
|
||||
- 16-bit (half precision) floating-point.
|
||||
|
||||
* - ``float32``
|
||||
- 32-bit (single precision) floating-point.
|
||||
|
||||
* - ``float64``
|
||||
- 64-bit (double precision) floating-point.
|
||||
|
||||
* - ``half``
|
||||
- 16-bit (half precision) floating-point.
|
||||
|
||||
* - ``int16``
|
||||
- Signed 16-bit integer.
|
||||
|
||||
* - ``int32``
|
||||
- Signed 32-bit integer.
|
||||
|
||||
* - ``int64``
|
||||
- Signed 64-bit integer.
|
||||
|
||||
* - ``int8``
|
||||
- Signed 8-bit integer.
|
||||
|
||||
* - ``uint16``
|
||||
- Unsigned 16-bit (word) integer.
|
||||
|
||||
* - ``uint32``
|
||||
- Unsigned 32-bit (dword) integer.
|
||||
|
||||
* - ``uint64``
|
||||
- Unsigned 64-bit (qword) integer.
|
||||
|
||||
* - ``uint8``
|
||||
- Unsigned 8-bit (byte) integer.
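As a practical note on the 64-bit entries above, JAX defaults to 32-bit values, so ``float64`` and
``int64`` have to be enabled explicitly; a minimal sketch:

.. code-block:: python

   import jax
   import jax.numpy as jnp

   jax.config.update("jax_enable_x64", True)  # enable 64-bit types; set before creating arrays
   x = jnp.arange(4, dtype=jnp.float64)
   print(x.dtype)  # float64 (without the flag this would silently fall back to float32)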
|
||||
|
||||
.. note::
|
||||
|
||||
JAX data type support is affected by the :ref:`key_rocm_libraries` and is
summarized on the :doc:`ROCm data types and precision support <rocm:reference/precision-support>`
page.
|
||||
|
||||
Supported modules
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
For a complete and up-to-date list of JAX public modules (for example, ``jax.numpy``,
|
||||
``jax.scipy``, ``jax.lax``), their descriptions, and usage, please refer directly to the
|
||||
`official JAX API documentation <https://jax.readthedocs.io/en/latest/jax.html>`_.
|
||||
|
||||
.. note::
|
||||
|
||||
Since version 0.1.56, JAX has full support for ROCm, and the
:ref:`Known issues and important notes <jax_comp_known_issues>` section
contains details about limitations specific to the ROCm backend. The list of
JAX API modules is maintained by the JAX project and is subject to change.
Refer to the official JAX documentation for the most up-to-date information.
|
||||
|
||||
Key features and enhancements for ROCm 7.0
|
||||
===============================================================================
|
||||
|
||||
- Upgraded XLA backend: Integrates a newer XLA version, enabling better
|
||||
optimizations, broader operator support, and potential performance gains.
|
||||
|
||||
- RNN support: Native RNN support (including LSTMs via ``jax.experimental.rnn``)
|
||||
now available on ROCm, aiding sequence model development.
|
||||
|
||||
- Comprehensive linear algebra capabilities: Offers robust ``jax.linalg``
|
||||
operations, essential for scientific and machine learning tasks.
|
||||
|
||||
- Expanded AMD GPU architecture support: Provides ongoing support for gfx1101
|
||||
GPUs and introduces support for gfx950 and gfx12xx GPUs.
|
||||
|
||||
- Mixed FP8 precision support: Enables ``lax.dot_general`` operations with mixed FP8
types, offering pathways for memory and compute efficiency (see the sketch after this list).
|
||||
|
||||
- Streamlined PyPi packaging: Provides reliable PyPi wheels for JAX on ROCm,
|
||||
simplifying the installation process.
|
||||
|
||||
- Pallas experimental kernel development: Continued Pallas framework
|
||||
enhancements for custom GPU kernels, including new intrinsics (specific
|
||||
kernel behaviors under review).
|
||||
|
||||
- Improved build system and CI: Enhanced ROCm build system and CI for greater
|
||||
reliability and maintainability.
|
||||
|
||||
- Enhanced distributed computing setup: Improved JAX setup in multi-GPU
|
||||
distributed environments.
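The following is a minimal sketch of the mixed-FP8 ``lax.dot_general`` usage mentioned above. The
shapes and FP8 dtype pairing are illustrative only; whether a particular pairing is accelerated
depends on the installed JAX, XLA, and ROCm versions.

.. code-block:: python

   import jax
   import jax.numpy as jnp

   key = jax.random.PRNGKey(0)
   lhs = jax.random.normal(key, (128, 64)).astype(jnp.float8_e4m3fn)
   rhs = jax.random.normal(key, (64, 32)).astype(jnp.float8_e5m2)

   out = jax.lax.dot_general(
       lhs, rhs,
       dimension_numbers=(((1,), (0,)), ((), ())),  # ordinary matrix multiply
       preferred_element_type=jnp.float32,          # accumulate in FP32
   )
   print(out.dtype, out.shape)  # float32 (128, 32)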
|
||||
|
||||
.. _jax_comp_known_issues:
|
||||
|
||||
Known issues and notes for ROCm 7.0
|
||||
===============================================================================
|
||||
|
||||
- ``nn.dot_product_attention``: Certain configurations of ``jax.nn.dot_product_attention``
|
||||
may cause segmentation faults, though the majority of use cases work correctly.
|
||||
|
||||
- SVD with dynamic shapes: SVD on inputs with dynamic/symbolic shapes might result in an error.
|
||||
SVD with static shapes is unaffected.
|
||||
|
||||
- QR decomposition with symbolic shapes: QR decomposition operations may fail when using
|
||||
symbolic/dynamic shapes in shape polymorphic contexts.
|
||||
|
||||
- Pallas kernels: Specific advanced Pallas kernels may exhibit variations in
|
||||
numerical output or resource usage. These are actively reviewed as part of
|
||||
Pallas's experimental development.
|
||||
@@ -1,156 +0,0 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: llama.cpp deep learning framework compatibility
|
||||
:keywords: GPU, GGML, llama.cpp compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
********************************************************************************
|
||||
llama.cpp compatibility
|
||||
********************************************************************************
|
||||
|
||||
`llama.cpp <https://github.com/ggml-org/llama.cpp>`__ is an open-source framework
|
||||
for Large Language Model (LLM) inference that runs on both central processing units
|
||||
(CPUs) and graphics processing units (GPUs). It is written in plain C/C++, providing
|
||||
a simple, dependency-free setup.
|
||||
|
||||
The framework supports multiple quantization options, from 1.5-bit to 8-bit integers,
|
||||
to speed up inference and reduce memory usage. Originally built as a CPU-first library,
|
||||
llama.cpp is easy to integrate with other programming environments and is widely
|
||||
adopted across diverse platforms, including consumer devices.
|
||||
|
||||
ROCm support for llama.cpp is upstreamed, and you can build the official source code
|
||||
with ROCm support:
|
||||
|
||||
- ROCm support for llama.cpp is hosted in the official `https://github.com/ROCm/llama.cpp
|
||||
<https://github.com/ROCm/llama.cpp>`_ repository.
|
||||
|
||||
- Due to independent compatibility considerations, this location differs from the
|
||||
`https://github.com/ggml-org/llama.cpp <https://github.com/ggml-org/llama.cpp>`_ upstream repository.
|
||||
|
||||
- To install llama.cpp, use the prebuilt :ref:`Docker image <llama-cpp-docker-compat>`,
|
||||
which includes ROCm, llama.cpp, and all required dependencies.
|
||||
|
||||
- See the :doc:`ROCm llama.cpp installation guide <rocm-install-on-linux:install/3rd-party/llama-cpp-install>`
|
||||
to install and get started.
|
||||
|
||||
- See the `Installation guide <https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md#hip>`__
|
||||
in the upstream llama.cpp documentation.
|
||||
|
||||
.. note::
|
||||
|
||||
llama.cpp is supported on ROCm 6.4.0.
|
||||
|
||||
Supported devices
|
||||
================================================================================
|
||||
|
||||
**Officially Supported**: AMD Instinct™ MI300X, MI210
|
||||
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
llama.cpp can be applied in a variety of scenarios, particularly when you need to meet one or more of the following requirements:
|
||||
|
||||
- Plain C/C++ implementation with no external dependencies
|
||||
- Support for 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory usage
|
||||
- Custom HIP (Heterogeneous-compute Interface for Portability) kernels for running large language models (LLMs) on AMD GPUs (graphics processing units)
|
||||
- CPU (central processing unit) + GPU (graphics processing unit) hybrid inference for partially accelerating models larger than the total available VRAM (video random-access memory)
|
||||
|
||||
llama.cpp is also used in a range of real-world applications, including:
|
||||
|
||||
- Games such as `Lucy's Labyrinth <https://github.com/MorganRO8/Lucys_Labyrinth>`__:
|
||||
A simple maze game where AI-controlled agents attempt to trick the player.
|
||||
- Tools such as `Styled Lines <https://marketplace.unity.com/packages/tools/ai-ml-integration/style-text-webgl-ios-stand-alone-llm-llama-cpp-wrapper-292902>`__:
|
||||
A proprietary, asynchronous inference wrapper for Unity3D game development, including pre-built mobile and web platform wrappers and a model example.
|
||||
- Various other AI applications use llama.cpp as their inference engine;
|
||||
for a detailed list, see the `user interfaces (UIs) section <https://github.com/ggml-org/llama.cpp?tab=readme-ov-file#description>`__.
|
||||
|
||||
For more use cases and recommendations, refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__,
|
||||
where you can search for llama.cpp examples and best practices to optimize your workloads on AMD GPUs.
|
||||
|
||||
- The `Llama.cpp Meets Instinct: A New Era of Open-Source AI Acceleration <https://rocm.blogs.amd.com/ecosystems-and-partners/llama-cpp/README.html>`__
|
||||
blog post outlines how the open-source llama.cpp framework enables efficient LLM inference—including interactive inference with ``llama-cli``,
|
||||
server deployment with ``llama-server``, GGUF model preparation and quantization, performance benchmarking, and optimizations tailored for
|
||||
AMD Instinct GPUs within the ROCm ecosystem.
|
||||
|
||||
.. _llama-cpp-docker-compat:
|
||||
|
||||
Docker image compatibility
|
||||
================================================================================
|
||||
|
||||
.. |docker-icon| raw:: html
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes `ROCm llama.cpp Docker images <https://hub.docker.com/r/rocm/llama.cpp>`__
|
||||
with ROCm backends on Docker Hub. The following Docker image tags and associated
|
||||
inventories were tested on `ROCm 6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__.
|
||||
Click |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
.. important::
|
||||
|
||||
Tag endings of ``_full``, ``_server``, and ``_light`` serve different purposes for entrypoints as follows:
|
||||
|
||||
- Full: This image includes both the main executable file and the tools to convert ``LLaMA`` models into ``ggml`` format and quantize them to 4-bit.
|
||||
- Server: This image only includes the server executable file.
|
||||
- Light: This image only includes the main executable file.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:class: docker-image-compatibility
|
||||
|
||||
* - Full Docker
|
||||
- Server Docker
|
||||
- Light Docker
|
||||
- llama.cpp
|
||||
- Ubuntu
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b5997_rocm6.4.0_ubuntu24.04_full/images/sha256-f78f6c81ab2f8e957469415fe2370a1334fe969c381d1fe46050c85effaee9d5"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b5997_rocm6.4.0_ubuntu24.04_server/images/sha256-275ad9e18f292c26a00a2de840c37917e98737a88a3520bdc35fd3fc5c9a6a9b"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b5997_rocm6.4.0_ubuntu24.04_light/images/sha256-cc324e6faeedf0e400011f07b49d2dc41a16bae257b2b7befa0f4e2e97231320"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
- `b5997 <https://github.com/ROCm/llama.cpp/tree/release/b5997>`__
|
||||
- 24.04
|
||||
|
||||
Key ROCm libraries for llama.cpp
|
||||
================================================================================
|
||||
|
||||
llama.cpp functionality on ROCm is determined by its underlying library
|
||||
dependencies. These ROCm components affect the capabilities, performance, and
|
||||
feature set available to developers.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - ROCm library
|
||||
- Version
|
||||
- Purpose
|
||||
- Usage
|
||||
* - `hipBLAS <https://github.com/ROCm/hipBLAS>`__
|
||||
- :version-ref:`hipBLAS rocm_version`
|
||||
- Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
|
||||
matrix and vector operations.
|
||||
- Supports operations such as matrix multiplication, matrix-vector
|
||||
products, and tensor contractions. Utilized in both dense and batched
|
||||
linear algebra operations.
|
||||
* - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
|
||||
- :version-ref:`hipBLASLt rocm_version`
|
||||
- hipBLASLt is an extension of the hipBLAS library, providing additional
|
||||
features like epilogues fused into the matrix multiplication kernel or
|
||||
use of integer tensor cores.
|
||||
- By setting the flag ``ROCBLAS_USE_HIPBLASLT``, you can dispatch hipBLASLt
|
||||
kernels where possible (see the sketch after this table).
|
||||
* - `rocWMMA <https://github.com/ROCm/rocWMMA>`__
|
||||
- :version-ref:`rocWMMA rocm_version`
|
||||
- Accelerates warp-level matrix-multiply and matrix-accumulate to speed up matrix
|
||||
multiplication (GEMM) and accumulation operations with mixed precision
|
||||
support.
|
||||
- Can be used to enhance flash attention performance on AMD GPUs by enabling
|
||||
the corresponding build flag at compile time.
|
||||
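The hipBLASLt dispatch described in the table above is controlled through an environment variable. The following is a minimal, hypothetical launch sketch; the ``llama-server`` binary name and the GGUF model path are placeholders for your own build and model file.

.. code-block:: python

   import os
   import subprocess

   # Hypothetical launch sketch: ROCBLAS_USE_HIPBLASLT=1 asks rocBLAS to dispatch
   # hipBLASLt kernels where possible. "llama-server" and "model.gguf" are
   # placeholders for your own llama.cpp build and GGUF model file.
   env = dict(os.environ, ROCBLAS_USE_HIPBLASLT="1")
   subprocess.run(["llama-server", "-m", "model.gguf", "--port", "8080"], env=env, check=True)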
@@ -1,93 +0,0 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: Megablocks compatibility
|
||||
:keywords: GPU, megablocks, compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
********************************************************************************
|
||||
Megablocks compatibility
|
||||
********************************************************************************
|
||||
|
||||
Megablocks is a lightweight library for mixture-of-experts (MoE) training.
|
||||
The core of the system is efficient "dropless-MoE" and standard MoE layers.
|
||||
Megablocks is integrated with `https://github.com/stanford-futuredata/Megatron-LM <https://github.com/stanford-futuredata/Megatron-LM>`_,
|
||||
where data and pipeline parallel training of MoEs is supported.
|
||||
|
||||
* ROCm support for Megablocks is hosted in the official `https://github.com/ROCm/megablocks <https://github.com/ROCm/megablocks>`_ repository.
|
||||
* Due to independent compatibility considerations, this location differs from the `https://github.com/stanford-futuredata/Megatron-LM <https://github.com/stanford-futuredata/Megatron-LM>`_ upstream repository.
|
||||
* Use the prebuilt :ref:`Docker image <megablocks-docker-compat>` with ROCm, PyTorch, and Megablocks preinstalled.
|
||||
* See the :doc:`ROCm Megablocks installation guide <rocm-install-on-linux:install/3rd-party/megablocks-install>` to install and get started.
|
||||
|
||||
.. note::
|
||||
|
||||
Megablocks is supported on ROCm 6.3.0.
|
||||
|
||||
Supported devices
|
||||
================================================================================
|
||||
|
||||
- **Officially Supported**: AMD Instinct MI300X
|
||||
- **Partially Supported** (functionality or performance limitations): AMD Instinct MI250X, MI210X
|
||||
|
||||
Supported models and features
|
||||
================================================================================
|
||||
|
||||
This section summarizes the Megablocks features supported by ROCm.
|
||||
|
||||
* Distributed Pre-training
|
||||
* Activation Checkpointing and Recomputation
|
||||
* Distributed Optimizer
|
||||
* Mixture-of-Experts
|
||||
* dropless-Mixture-of-Experts
|
||||
|
||||
|
||||
.. _megablocks-recommendations:
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
The `ROCm Megablocks blog posts <https://rocm.blogs.amd.com/artificial-intelligence/megablocks/README.html>`_
|
||||
describe how to leverage the ROCm platform for pre-training using the Megablocks framework.
|
||||
They cover how to pre-process datasets and how to begin pre-training on AMD GPUs through:
|
||||
|
||||
* Single-GPU pre-training
|
||||
* Multi-GPU pre-training
|
||||
|
||||
|
||||
.. _megablocks-docker-compat:
|
||||
|
||||
Docker image compatibility
|
||||
================================================================================
|
||||
|
||||
.. |docker-icon| raw:: html
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes `ROCm Megablocks images <https://hub.docker.com/r/rocm/megablocks/tags>`_
|
||||
with ROCm and PyTorch backends on Docker Hub. The following Docker image tags and associated
|
||||
inventories represent the latest Megablocks version from the official Docker Hub.
|
||||
The Docker images have been validated for `ROCm 6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`_.
|
||||
Click |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:class: docker-image-compatibility
|
||||
|
||||
* - Docker image
|
||||
- ROCm
|
||||
- Megablocks
|
||||
- PyTorch
|
||||
- Ubuntu
|
||||
- Python
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/megablocks/megablocks-0.7.0_rocm6.3.0_ubuntu24.04_py3.12_pytorch2.4.0/images/sha256-372ff89b96599019b8f5f9db469c84add2529b713456781fa62eb9a148659ab4"><i class="fab fa-docker fa-lg"></i> rocm/megablocks</a>
|
||||
- `6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`_
|
||||
- `0.7.0 <https://github.com/databricks/megablocks/releases/tag/v0.7.0>`_
|
||||
- `2.4.0 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
|
||||
- 24.04
|
||||
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
|
||||
|
||||
|
||||
@@ -1,605 +0,0 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: PyTorch compatibility
|
||||
:keywords: GPU, PyTorch compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
********************************************************************************
|
||||
PyTorch compatibility
|
||||
********************************************************************************
|
||||
|
||||
`PyTorch <https://pytorch.org/>`__ is an open-source tensor library designed for
|
||||
deep learning. PyTorch on ROCm provides mixed-precision and large-scale training
|
||||
using `MIOpen <https://github.com/ROCm/MIOpen>`__ and
|
||||
`RCCL <https://github.com/ROCm/rccl>`__ libraries.
|
||||
|
||||
ROCm support for PyTorch is upstreamed into the official PyTorch repository. Due
|
||||
to independent compatibility considerations, this results in two distinct
|
||||
release cycles for PyTorch on ROCm:
|
||||
|
||||
- ROCm PyTorch release:
|
||||
|
||||
- Provides the latest version of ROCm but might not necessarily support the
|
||||
latest stable PyTorch version.
|
||||
|
||||
- Offers :ref:`Docker images <pytorch-docker-compat>` with ROCm and PyTorch
|
||||
preinstalled.
|
||||
|
||||
- ROCm PyTorch repository: `<https://github.com/ROCm/pytorch>`__
|
||||
|
||||
- See the :doc:`ROCm PyTorch installation guide <rocm-install-on-linux:install/3rd-party/pytorch-install>`
|
||||
to get started.
|
||||
|
||||
- Official PyTorch release:
|
||||
|
||||
- Provides the latest stable version of PyTorch but might not necessarily
|
||||
support the latest ROCm version.
|
||||
|
||||
- Official PyTorch repository: `<https://github.com/pytorch/pytorch>`__
|
||||
|
||||
- See the `Nightly and latest stable version installation guide <https://pytorch.org/get-started/locally/>`__
|
||||
or `Previous versions <https://pytorch.org/get-started/previous-versions/>`__
|
||||
to get started.
|
||||
|
||||
PyTorch includes tooling that generates HIP source code from the CUDA backend.
|
||||
This approach allows PyTorch to support ROCm without requiring manual code
|
||||
modifications. For more information, see :doc:`HIPIFY <hipify:index>`.
|
||||
|
||||
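As a quick sanity check, unmodified CUDA-style PyTorch code runs as-is on a working ROCm installation; the ``cuda`` device string maps to the AMD GPU. A minimal sketch:

.. code-block:: python

   import torch

   # On the ROCm build of PyTorch, the familiar CUDA-style API targets the AMD GPU.
   print(torch.cuda.is_available())       # True on a working ROCm install
   print(torch.cuda.get_device_name(0))   # reports the AMD GPU
   x = torch.randn(8, 8, device="cuda")
   print((x @ x).sum().item())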
ROCm development is aligned with the stable release of PyTorch, while upstream
|
||||
PyTorch testing uses the stable release of ROCm to maintain consistency.
|
||||
|
||||
.. _pytorch-recommendations:
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
* :doc:`Using ROCm for AI: training a model </how-to/rocm-for-ai/training/benchmark-docker/pytorch-training>`
|
||||
guides how to leverage the ROCm platform for training AI models. It covers the
|
||||
steps, tools, and best practices for optimizing training workflows on AMD GPUs
|
||||
using PyTorch features.
|
||||
|
||||
* :doc:`Single-GPU fine-tuning and inference </how-to/rocm-for-ai/fine-tuning/single-gpu-fine-tuning-and-inference>`
|
||||
describes and demonstrates how to use the ROCm platform for the fine-tuning
|
||||
and inference of machine learning models, particularly large language models
|
||||
(LLMs), on systems with a single GPU. This topic provides a detailed guide for
|
||||
setting up, optimizing, and executing fine-tuning and inference workflows in
|
||||
such environments.
|
||||
|
||||
* :doc:`Multi-GPU fine-tuning and inference optimization </how-to/rocm-for-ai/fine-tuning/multi-gpu-fine-tuning-and-inference>`
|
||||
describes and demonstrates the fine-tuning and inference of machine learning
|
||||
models on systems with multiple GPUs.
|
||||
|
||||
* The :doc:`Instinct MI300X workload optimization guide </how-to/rocm-for-ai/inference-optimization/workload>`
|
||||
provides detailed guidance on optimizing workloads for the AMD Instinct MI300X
|
||||
accelerator using ROCm. This guide helps users achieve optimal performance for
|
||||
deep learning and other high-performance computing tasks on the MI300X
|
||||
accelerator.
|
||||
|
||||
* The :doc:`Inception with PyTorch documentation </conceptual/ai-pytorch-inception>`
|
||||
describes how PyTorch integrates with ROCm for AI workloads. It outlines the
|
||||
use of PyTorch on the ROCm platform and focuses on efficiently leveraging AMD
|
||||
GPU hardware for training and inference tasks in AI applications.
|
||||
|
||||
For more use cases and recommendations, see `ROCm PyTorch blog posts <https://rocm.blogs.amd.com/blog/tag/pytorch.html>`__.
|
||||
|
||||
.. _pytorch-docker-compat:
|
||||
|
||||
Docker image compatibility
|
||||
================================================================================
|
||||
|
||||
.. |docker-icon| raw:: html
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes `PyTorch images <https://hub.docker.com/r/rocm/pytorch>`__
|
||||
with ROCm backends on Docker Hub. The following Docker image tags and associated
|
||||
inventories were tested on `ROCm 6.4.2 <https://repo.radeon.com/rocm/apt/6.4.2/>`__.
|
||||
Click |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
.. list-table:: PyTorch Docker image components
|
||||
:header-rows: 1
|
||||
:class: docker-image-compatibility
|
||||
|
||||
* - Docker
|
||||
- PyTorch
|
||||
- Ubuntu
|
||||
- Python
|
||||
- Apex
|
||||
- torchvision
|
||||
- TensorBoard
|
||||
- MAGMA
|
||||
- UCX
|
||||
- OMPI
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.2_ubuntu24.04_py3.12_pytorch_release_2.6.0/images/sha256-6a287591500b4048a9556c1ecc92bc411fd3d552f6c8233bc399f18eb803e8d6"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `2.6.0 <https://github.com/ROCm/pytorch/tree/release/2.6>`__
|
||||
- 24.04
|
||||
- `3.12 <https://www.python.org/downloads/release/python-31210/>`__
|
||||
- `1.6.0 <https://github.com/ROCm/apex/tree/release/1.6.0>`__
|
||||
- `0.21.0 <https://github.com/pytorch/vision/tree/v0.21.0>`__
|
||||
- `2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__
|
||||
- `master <https://bitbucket.org/icl/magma/src/master/>`__
|
||||
- `1.16.0+ds-5ubuntu1 <https://github.com/openucx/ucx/tree/v1.16.0>`__
|
||||
- `4.1.6-7ubuntu2 <https://github.com/open-mpi/ompi/tree/v4.1.6>`__
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.2_ubuntu22.04_py3.10_pytorch_release_2.6.0/images/sha256-06b967629ba6657709f04169832cd769a11e6b491e8b1394c361d42d7a0c8b43"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `2.6.0 <https://github.com/ROCm/pytorch/tree/release/2.6>`__
|
||||
- 22.04
|
||||
- `3.10 <https://www.python.org/downloads/release/python-31017/>`__
|
||||
- `1.6.0 <https://github.com/ROCm/apex/tree/release/1.6.0>`__
|
||||
- `0.21.0 <https://github.com/pytorch/vision/tree/v0.21.0>`__
|
||||
- `2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__
|
||||
- `master <https://bitbucket.org/icl/magma/src/master/>`__
|
||||
- `1.12.1~rc2-1 <https://github.com/openucx/ucx/tree/v1.12.1>`__
|
||||
- `4.1.2-2ubuntu1 <https://github.com/open-mpi/ompi/tree/v4.1.2>`__
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.2_ubuntu24.04_py3.12_pytorch_release_2.5.1/images/sha256-62022414217ef6de33ac5b1341e57db8a48e8573fa2ace12d48aa5edd4b99ef0"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `2.5.1 <https://github.com/ROCm/pytorch/tree/release/2.5>`__
|
||||
- 24.04
|
||||
- `3.12 <https://www.python.org/downloads/release/python-31210/>`__
|
||||
- `1.5.0 <https://github.com/ROCm/apex/tree/release/1.5.0>`__
|
||||
- `0.20.1 <https://github.com/pytorch/vision/tree/v0.20.1>`__
|
||||
- `2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__
|
||||
- `master <https://bitbucket.org/icl/magma/src/master/>`__
|
||||
- `1.16.0+ds-5ubuntu1 <https://github.com/openucx/ucx/tree/v1.16.0>`__
|
||||
- `4.1.6-7ubuntu2 <https://github.com/open-mpi/ompi/tree/v4.1.6>`__
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.2_ubuntu22.04_py3.11_pytorch_release_2.5.1/images/sha256-469a7f74fc149aff31797e011ee41978f6a190adc69fa423b3c6a718a77bd985"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `2.5.1 <https://github.com/ROCm/pytorch/tree/release/2.5>`__
|
||||
- 22.04
|
||||
- `3.11 <https://www.python.org/downloads/release/python-31113/>`__
|
||||
- `1.5.0 <https://github.com/ROCm/apex/tree/release/1.5.0>`__
|
||||
- `0.20.1 <https://github.com/pytorch/vision/tree/v0.20.1>`__
|
||||
- `2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__
|
||||
- `master <https://bitbucket.org/icl/magma/src/master/>`__
|
||||
- `1.12.1~rc2-1 <https://github.com/openucx/ucx/tree/v1.12.1>`__
|
||||
- `4.1.2-2ubuntu1 <https://github.com/open-mpi/ompi/tree/v4.1.2>`__
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.2_ubuntu22.04_py3.10_pytorch_release_2.5.1/images/sha256-37f41a1cd94019688669a1b20d33ea74156e0c129ef6b8270076ef214a6a1a2c"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `2.5.1 <https://github.com/ROCm/pytorch/tree/release/2.5>`__
|
||||
- 22.04
|
||||
- `3.10 <https://www.python.org/downloads/release/python-31017/>`__
|
||||
- `1.5.0 <https://github.com/ROCm/apex/tree/release/1.5.0>`__
|
||||
- `0.20.1 <https://github.com/pytorch/vision/tree/v0.20.1>`__
|
||||
- `2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__
|
||||
- `master <https://bitbucket.org/icl/magma/src/master/>`__
|
||||
- `1.12.1~rc2-1 <https://github.com/openucx/ucx/tree/v1.12.1>`__
|
||||
- `4.1.2-2ubuntu1 <https://github.com/open-mpi/ompi/tree/v4.1.2>`__
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.2_ubuntu24.04_py3.12_pytorch_release_2.4.1/images/sha256-60824ba83dc1b9d94164925af1f81c0235c105dd555091ec04c57e05177ead1b"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`__
|
||||
- 24.04
|
||||
- `3.12 <https://www.python.org/downloads/release/python-31210/>`__
|
||||
- `1.4.0 <https://github.com/ROCm/apex/tree/release/1.4.0>`__
|
||||
- `0.19.0 <https://github.com/pytorch/vision/tree/v0.19.0>`__
|
||||
- `2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__
|
||||
- `master <https://bitbucket.org/icl/magma/src/master/>`__
|
||||
- `1.16.0+ds-5ubuntu1 <https://github.com/openucx/ucx/tree/v1.16.0>`__
|
||||
- `4.1.6-7ubuntu2 <https://github.com/open-mpi/ompi/tree/v4.1.6>`__
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.2_ubuntu22.04_py3.10_pytorch_release_2.4.1/images/sha256-fe944fe083312f901be6891ab4d3ffebf2eaf2cf4f5f0f435ef0b76ec714fabd"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`__
|
||||
- 22.04
|
||||
- `3.10 <https://www.python.org/downloads/release/python-31017/>`__
|
||||
- `1.4.0 <https://github.com/ROCm/apex/tree/release/1.4.0>`__
|
||||
- `0.19.0 <https://github.com/pytorch/vision/tree/v0.19.0>`__
|
||||
- `2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__
|
||||
- `master <https://bitbucket.org/icl/magma/src/master/>`__
|
||||
- `1.12.1~rc2-1 <https://github.com/openucx/ucx/tree/v1.12.1>`__
|
||||
- `4.1.2-2ubuntu1 <https://github.com/open-mpi/ompi/tree/v4.1.2>`__
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.2_ubuntu24.04_py3.12_pytorch_release_2.3.0/images/sha256-1d59251c47170c5b8960d1172a4dbe52f5793d8966edd778f168eaf32d56661a"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `2.3.0 <https://github.com/ROCm/pytorch/tree/release/2.3>`__
|
||||
- 24.04
|
||||
- `3.12 <https://www.python.org/downloads/release/python-31210/>`__
|
||||
- `1.3.0 <https://github.com/ROCm/apex/tree/release/1.3.0>`__
|
||||
- `0.18.0 <https://github.com/pytorch/vision/tree/v0.18.0>`__
|
||||
- `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13>`__
|
||||
- `master <https://bitbucket.org/icl/magma/src/master/>`__
|
||||
- `1.16.0+ds-5ubuntu1 <https://github.com/openucx/ucx/tree/v1.16.0>`__
|
||||
- `4.1.6-7ubuntu2 <https://github.com/open-mpi/ompi/tree/v4.1.6>`__
|
||||
|
||||
Key ROCm libraries for PyTorch
|
||||
================================================================================
|
||||
|
||||
PyTorch functionality on ROCm is determined by its underlying library
|
||||
dependencies. These ROCm components affect the capabilities, performance, and
|
||||
feature set available to developers.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - ROCm library
|
||||
- Version
|
||||
- Purpose
|
||||
- Used in
|
||||
* - `Composable Kernel <https://github.com/ROCm/composable_kernel>`__
|
||||
- :version-ref:`"Composable Kernel" rocm_version`
|
||||
- Enables faster execution of core operations like matrix multiplication
|
||||
(GEMM), convolutions and transformations.
|
||||
- Speeds up ``torch.permute``, ``torch.view``, ``torch.matmul``,
|
||||
``torch.mm``, ``torch.bmm``, ``torch.nn.Conv2d``, ``torch.nn.Conv3d``
|
||||
and ``torch.nn.MultiheadAttention``.
|
||||
* - `hipBLAS <https://github.com/ROCm/hipBLAS>`__
|
||||
- :version-ref:`hipBLAS rocm_version`
|
||||
- Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
|
||||
matrix and vector operations.
|
||||
- Supports operations such as matrix multiplication, matrix-vector
|
||||
products, and tensor contractions. Utilized in both dense and batched
|
||||
linear algebra operations.
|
||||
* - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
|
||||
- :version-ref:`hipBLASLt rocm_version`
|
||||
- hipBLASLt is an extension of the hipBLAS library, providing additional
|
||||
features like epilogues fused into the matrix multiplication kernel or
|
||||
use of integer tensor cores.
|
||||
- Accelerates operations such as ``torch.matmul``, ``torch.mm``, and the
|
||||
matrix multiplications used in convolutional and linear layers.
|
||||
* - `hipCUB <https://github.com/ROCm/hipCUB>`__
|
||||
- :version-ref:`hipCUB rocm_version`
|
||||
- Provides a C++ template library for parallel algorithms for reduction,
|
||||
scan, sort and select.
|
||||
- Supports operations such as ``torch.sum``, ``torch.cumsum``,
|
||||
and ``torch.sort``. Irregular shapes often involve scanning, sorting, and
|
||||
filtering, which hipCUB handles efficiently.
|
||||
* - `hipFFT <https://github.com/ROCm/hipFFT>`__
|
||||
- :version-ref:`hipFFT rocm_version`
|
||||
- Provides GPU-accelerated Fast Fourier Transform (FFT) operations.
|
||||
- Used in functions like the ``torch.fft`` module.
|
||||
* - `hipRAND <https://github.com/ROCm/hipRAND>`__
|
||||
- :version-ref:`hipRAND rocm_version`
|
||||
- Provides fast random number generation for GPUs.
|
||||
- The ``torch.rand``, ``torch.randn``, and stochastic layers like
|
||||
``torch.nn.Dropout`` rely on hipRAND.
|
||||
* - `hipSOLVER <https://github.com/ROCm/hipSOLVER>`__
|
||||
- :version-ref:`hipSOLVER rocm_version`
|
||||
- Provides GPU-accelerated solvers for linear systems, eigenvalues, and
|
||||
singular value decompositions (SVD).
|
||||
- Supports functions like ``torch.linalg.solve``,
|
||||
``torch.linalg.eig``, and ``torch.linalg.svd``.
|
||||
* - `hipSPARSE <https://github.com/ROCm/hipSPARSE>`__
|
||||
- :version-ref:`hipSPARSE rocm_version`
|
||||
- Accelerates operations on sparse matrices, such as sparse matrix-vector
|
||||
or matrix-matrix products.
|
||||
- Sparse tensor operations ``torch.sparse``.
|
||||
* - `hipSPARSELt <https://github.com/ROCm/hipSPARSELt>`__
|
||||
- :version-ref:`hipSPARSELt rocm_version`
|
||||
- Accelerates operations on sparse matrices, such as sparse matrix-vector
|
||||
or matrix-matrix products.
|
||||
- Sparse tensor operations ``torch.sparse``.
|
||||
* - `hipTensor <https://github.com/ROCm/hipTensor>`__
|
||||
- :version-ref:`hipTensor rocm_version`
|
||||
- Optimizes for high-performance tensor operations, such as contractions.
|
||||
- Accelerates tensor algebra, especially in deep learning and scientific
|
||||
computing.
|
||||
* - `MIOpen <https://github.com/ROCm/MIOpen>`__
|
||||
- :version-ref:`MIOpen rocm_version`
|
||||
- Optimizes deep learning primitives such as convolutions, pooling,
|
||||
normalization, and activation functions.
|
||||
- Speeds up convolutional neural networks (CNNs), recurrent neural
|
||||
networks (RNNs), and other layers. Used in operations like
|
||||
``torch.nn.Conv2d``, ``torch.nn.ReLU``, and ``torch.nn.LSTM``.
|
||||
* - `MIGraphX <https://github.com/ROCm/AMDMIGraphX>`__
|
||||
- :version-ref:`MIGraphX rocm_version`
|
||||
- Adds graph-level optimizations, ONNX model support, and mixed-precision support,
|
||||
and enables Ahead-of-Time (AOT) compilation.
|
||||
- Speeds up inference models and executes ONNX models for
|
||||
compatibility with other frameworks, for example in layers like
|
||||
``torch.nn.Conv2d``, ``torch.nn.ReLU``, and ``torch.nn.LSTM``.
|
||||
* - `MIVisionX <https://github.com/ROCm/MIVisionX>`__
|
||||
- :version-ref:`MIVisionX rocm_version`
|
||||
- Optimizes acceleration for computer vision and AI workloads like
|
||||
preprocessing, augmentation, and inferencing.
|
||||
- Faster data preprocessing and augmentation pipelines for datasets like
|
||||
ImageNet or COCO and easy to integrate into PyTorch's ``torch.utils.data``
|
||||
and ``torchvision`` workflows.
|
||||
* - `rocAL <https://github.com/ROCm/rocAL>`__
|
||||
- :version-ref:`rocAL rocm_version`
|
||||
- Accelerates the data pipeline by offloading intensive preprocessing and
|
||||
augmentation tasks. rocAL is part of MIVisionX.
|
||||
- Easy to integrate into PyTorch's ``torch.utils.data`` and
|
||||
``torchvision`` data load workloads.
|
||||
* - `RCCL <https://github.com/ROCm/rccl>`__
|
||||
- :version-ref:`RCCL rocm_version`
|
||||
- Optimizes for multi-GPU communication for operations like AllReduce and
|
||||
Broadcast.
|
||||
- Distributed data parallel training (``torch.nn.parallel.DistributedDataParallel``).
|
||||
Handles communication in multi-GPU setups.
|
||||
* - `rocDecode <https://github.com/ROCm/rocDecode>`__
|
||||
- :version-ref:`rocDecode rocm_version`
|
||||
- Provides hardware-accelerated data decoding capabilities, particularly
|
||||
for image, video, and other dataset formats.
|
||||
- Can be integrated in ``torch.utils.data``, ``torchvision.transforms``
|
||||
and ``torch.distributed``.
|
||||
* - `rocJPEG <https://github.com/ROCm/rocJPEG>`__
|
||||
- :version-ref:`rocJPEG rocm_version`
|
||||
- Provides hardware-accelerated JPEG image decoding and encoding.
|
||||
- GPU accelerated ``torchvision.io.decode_jpeg`` and
|
||||
``torchvision.io.encode_jpeg`` and can be integrated in
|
||||
``torch.utils.data`` and ``torchvision``.
|
||||
* - `RPP <https://github.com/ROCm/RPP>`__
|
||||
- :version-ref:`RPP rocm_version`
|
||||
- Speeds up data augmentation, transformation, and other preprocessing steps.
|
||||
- Easy to integrate into PyTorch's ``torch.utils.data`` and
|
||||
``torchvision`` data load workloads to speed up data processing.
|
||||
* - `rocThrust <https://github.com/ROCm/rocThrust>`__
|
||||
- :version-ref:`rocThrust rocm_version`
|
||||
- Provides a C++ template library for parallel algorithms like sorting,
|
||||
reduction, and scanning.
|
||||
- Utilized in backend operations for tensor computations requiring
|
||||
parallel processing.
|
||||
* - `rocWMMA <https://github.com/ROCm/rocWMMA>`__
|
||||
- :version-ref:`rocWMMA rocm_version`
|
||||
- Accelerates warp-level matrix-multiply and matrix-accumulate to speed up matrix
|
||||
multiplication (GEMM) and accumulation operations with mixed precision
|
||||
support.
|
||||
- Linear layers (``torch.nn.Linear``), convolutional layers
|
||||
(``torch.nn.Conv2d``), attention layers, general tensor operations that
|
||||
involve matrix products, such as ``torch.matmul``, ``torch.bmm``, and
|
||||
more.
|
||||
|
||||
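As an illustration of how these libraries surface in everyday PyTorch code, the following sketch exercises a few of the operations listed in the table above; the mapping to ROCm libraries happens transparently inside PyTorch.

.. code-block:: python

   import torch

   # Each call below is backed by one of the ROCm libraries listed above.
   x = torch.randn(256, 256, device="cuda")   # random generation: hipRAND
   y = x @ x.T                                 # GEMM: hipBLAS / hipBLASLt
   spec = torch.fft.rfft(x, dim=-1)            # FFT: hipFFT
   u, s, vh = torch.linalg.svd(y)              # SVD: hipSOLVER
   print(y.shape, spec.shape, s.shape)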
Supported modules and data types
|
||||
================================================================================
|
||||
|
||||
The following section outlines the supported data types, modules, and domain
|
||||
libraries available in PyTorch on ROCm.
|
||||
|
||||
Supported data types
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
The tensor data type is specified using the ``dtype`` attribute or argument.
|
||||
PyTorch supports many data types for different use cases.
|
||||
|
||||
The following table lists `torch.Tensor <https://pytorch.org/docs/stable/tensors.html>`__
|
||||
single data types:
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Data type
|
||||
- Description
|
||||
* - ``torch.float8_e4m3fn``
|
||||
- 8-bit floating point, e4m3
|
||||
* - ``torch.float8_e5m2``
|
||||
- 8-bit floating point, e5m2
|
||||
* - ``torch.float16`` or ``torch.half``
|
||||
- 16-bit floating point
|
||||
* - ``torch.bfloat16``
|
||||
- 16-bit floating point
|
||||
* - ``torch.float32`` or ``torch.float``
|
||||
- 32-bit floating point
|
||||
* - ``torch.float64`` or ``torch.double``
|
||||
- 64-bit floating point
|
||||
* - ``torch.complex32`` or ``torch.chalf``
|
||||
- 32-bit complex numbers
|
||||
* - ``torch.complex64`` or ``torch.cfloat``
|
||||
- 64-bit complex numbers
|
||||
* - ``torch.complex128`` or ``torch.cdouble``
|
||||
- 128-bit complex numbers
|
||||
* - ``torch.uint8``
|
||||
- 8-bit integer (unsigned)
|
||||
* - ``torch.uint16``
|
||||
- 16-bit integer (unsigned);
|
||||
Not natively supported in ROCm
|
||||
* - ``torch.uint32``
|
||||
- 32-bit integer (unsigned);
|
||||
Not natively supported in ROCm
|
||||
* - ``torch.uint64``
|
||||
- 64-bit integer (unsigned);
|
||||
Not natively supported in ROCm
|
||||
* - ``torch.int8``
|
||||
- 8-bit integer (signed)
|
||||
* - ``torch.int16`` or ``torch.short``
|
||||
- 16-bit integer (signed)
|
||||
* - ``torch.int32`` or ``torch.int``
|
||||
- 32-bit integer (signed)
|
||||
* - ``torch.int64`` or ``torch.long``
|
||||
- 64-bit integer (signed)
|
||||
* - ``torch.bool``
|
||||
- Boolean
|
||||
* - ``torch.quint8``
|
||||
- Quantized 8-bit integer (unsigned)
|
||||
* - ``torch.qint8``
|
||||
- Quantized 8-bit integer (signed)
|
||||
* - ``torch.qint32``
|
||||
- Quantized 32-bit integer (signed)
|
||||
* - ``torch.quint4x2``
|
||||
- Quantized 4-bit integer (unsigned)
|
||||
|
||||
.. note::
|
||||
|
||||
Unsigned types, except ``uint8``, have limited support in eager mode. They
|
||||
primarily exist to assist usage with ``torch.compile``.
|
||||
|
||||
See :doc:`ROCm precision support <rocm:reference/precision-support>` for the
|
||||
native hardware support of data types.
|
||||
|
||||
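The following minimal sketch shows how the ``dtype`` argument selects one of these types when allocating tensors on the GPU (``cuda`` maps to the ROCm device on AMD GPUs):

.. code-block:: python

   import torch

   # Allocate tensors in reduced-precision dtypes on the GPU.
   a = torch.randn(4, 4, dtype=torch.bfloat16, device="cuda")
   b = torch.randn(4, 4, dtype=torch.float16, device="cuda")
   print(a.dtype, b.dtype)                # torch.bfloat16 torch.float16
   print((a.float() @ b.float()).dtype)   # torch.float32 after upcasting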
Supported modules
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
For a complete and up-to-date list of PyTorch core modules (for example, ``torch``,
|
||||
``torch.nn``, ``torch.cuda``, ``torch.backends.cuda`` and
|
||||
``torch.backends.cudnn``), their descriptions, and usage, please refer directly
|
||||
to the `official PyTorch documentation <https://pytorch.org/docs/stable/index.html>`_.
|
||||
|
||||
Core PyTorch functionality on ROCm includes tensor operations, neural network
|
||||
layers, automatic differentiation, distributed training, mixed-precision
|
||||
training, compilation features, and domain-specific libraries for audio, vision,
|
||||
text processing, and more.
|
||||
|
||||
Supported domain libraries
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
PyTorch offers specialized `domain libraries <https://pytorch.org/domains/>`_ with
|
||||
GPU acceleration that build on its core features to support specific application
|
||||
areas. The table below lists the PyTorch domain libraries that are compatible
|
||||
with ROCm.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Library
|
||||
- Description
|
||||
|
||||
* - `torchaudio <https://docs.pytorch.org/audio/stable/index.html>`_
|
||||
- Audio and signal processing library for PyTorch. Provides utilities for
|
||||
audio I/O, signal and data processing functions, datasets, model
|
||||
implementations, and application components for audio and speech
|
||||
processing tasks.
|
||||
|
||||
**Note:** To ensure GPU acceleration with ``torchaudio.transforms``,
|
||||
you need to explicitly move audio data (waveform tensor) to GPU using
|
||||
``.to('cuda')`` (see the sketch after this table).
|
||||
|
||||
* - `torchtune <https://docs.pytorch.org/torchtune/stable/index.html>`_
|
||||
- PyTorch-native library designed for fine-tuning large language models
|
||||
(LLMs). Supports the full fine-tuning workflow and offers
|
||||
compatibility with popular production inference systems.
|
||||
|
||||
**Note:** Only the official upstream release exists.
|
||||
|
||||
* - `torchvision <https://docs.pytorch.org/vision/stable/index.html>`_
|
||||
- Computer vision library that is part of the PyTorch project. Provides
|
||||
popular datasets, model architectures, and common image transformations
|
||||
for computer vision applications.
|
||||
|
||||
* - `torchtext <https://docs.pytorch.org/text/stable/index.html>`_
|
||||
- Text processing library for PyTorch. Provides data processing utilities
|
||||
and popular datasets for natural language processing, including
|
||||
tokenization, vocabulary management, and text embeddings.
|
||||
|
||||
**Note:** ``torchtext`` does not implement ROCm-specific kernels.
|
||||
ROCm acceleration is provided through the underlying PyTorch framework
|
||||
and ROCm library integration. Only the official upstream release exists.
|
||||
|
||||
* - `torchdata <https://docs.pytorch.org/data/beta/index.html>`_
|
||||
- Beta library of common modular data loading primitives for easily
|
||||
constructing flexible and performant data pipelines, with features still
|
||||
in prototype stage.
|
||||
|
||||
* - `torchrec <https://docs.pytorch.org/torchrec/>`_
|
||||
- PyTorch domain library for common sparsity and parallelism primitives
|
||||
needed for large-scale recommender systems, enabling authors to train
|
||||
models with large embedding tables shared across many GPUs.
|
||||
|
||||
**Note:** ``torchrec`` does not implement ROCm-specific kernels. ROCm
|
||||
acceleration is provided through the underlying PyTorch framework and
|
||||
ROCm library integration.
|
||||
|
||||
* - `torchserve <https://docs.pytorch.org/serve/>`_
|
||||
- Performant, flexible and easy-to-use tool for serving PyTorch models in
|
||||
production, providing features for model management, batch processing,
|
||||
and scalable deployment.
|
||||
|
||||
**Note:** `torchserve <https://docs.pytorch.org/serve/>`_ is no longer
|
||||
actively maintained. The last official release shipped with PyTorch 2.4.
|
||||
|
||||
* - `torchrl <https://docs.pytorch.org/rl/stable/index.html>`_
|
||||
- Open-source, Python-first Reinforcement Learning library for PyTorch
|
||||
with a focus on high modularity and good runtime performance, providing
|
||||
low and high-level RL abstractions and reusable functionals for cost
|
||||
functions, returns, and data processing.
|
||||
|
||||
**Note:** Only the official upstream release exists.
|
||||
|
||||
* - `tensordict <https://docs.pytorch.org/tensordict/stable/index.html>`_
|
||||
- Dictionary-like class that simplifies operations on batches of tensors,
|
||||
enhancing code readability, compactness, and modularity by abstracting
|
||||
tailored operations and reducing errors through automatic operation
|
||||
dispatching.
|
||||
|
||||
**Note:** Only the official upstream release exists.
|
||||
|
||||
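As noted for ``torchaudio`` in the table above, both the transform and the waveform must be moved to the GPU explicitly. A minimal sketch (the input file name is a placeholder):

.. code-block:: python

   import torchaudio

   # Load a waveform and compute a Mel spectrogram on the GPU.
   # "example.wav" is a placeholder for your own audio file.
   waveform, sample_rate = torchaudio.load("example.wav")
   transform = torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate).to("cuda")
   mel = transform(waveform.to("cuda"))
   print(mel.shape, mel.device)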
Key features and enhancements for PyTorch 2.7 with ROCm 7.0
|
||||
================================================================================
|
||||
|
||||
- Enhanced TunableOp framework: Introduces ``tensorfloat32`` support for
|
||||
TunableOp operations, improved offline tuning for ScaledGEMM operations,
|
||||
submatrix offline tuning capabilities, and better logging for BLAS operations
|
||||
without bias vectors (see the sketch after this list).
|
||||
|
||||
- Expanded GPU architecture support: Provides optimized support for newer GPU
|
||||
architectures, including gfx1200 and gfx1201 with preferred hipBLASLt backend
|
||||
selection, along with improvements for gfx950 and gfx1100 series GPUs.
|
||||
|
||||
- Advanced Triton Integration: AOTriton 0.10b introduces official support for
|
||||
gfx950 and gfx1201, along with experimental support for gfx1101, gfx1151,
|
||||
gfx1150, and gfx1200.
|
||||
|
||||
- Improved element-wise kernel performance: Delivers enhanced vectorized
|
||||
element-wise kernels with better support for heterogeneous tensor types and
|
||||
optimized input vectorization for tensors with mixed data types.
|
||||
|
||||
- MIOpen deep learning optimizations: Enables NHWC BatchNorm by default on
|
||||
ROCm 7.0+, provides ``maxpool`` forward and backward performance improvements
|
||||
targeting ResNet scenarios, and includes updated launch configurations for
|
||||
better performance.
|
||||
|
||||
- Enhanced memory and tensor operations: Features fixes for in-place ``aten``
|
||||
sum operations with specialized templated kernels, improved 3D tensor
|
||||
performance with NHWC format, and better handling of memory-bound matrix
|
||||
multiplication operations.
|
||||
|
||||
- Robust testing and quality improvements: Includes comprehensive test suite
|
||||
updates with improved tolerance handling for Navi3x architectures, generalized
|
||||
ROCm-specific test conditions, and enhanced unit test coverage for Flash
|
||||
Attention and Memory Efficient operations.
|
||||
|
||||
- Build system and infrastructure improvements: Provides updated CentOS Stream 9
|
||||
support, improved Docker configuration, migration to public MAGMA repository,
|
||||
and enhanced QA automation scripts for PyTorch unit testing.
|
||||
|
||||
- Composable Kernel (CK) updates: Features updated CK submodule integration with
|
||||
the latest optimizations and performance improvements for core mathematical
|
||||
operations.
|
||||
|
||||
- Development and debugging enhancements: Includes improved source handling for
|
||||
dynamic compilation, better error handling for atomic operations, and enhanced
|
||||
state checking for trace operations.
|
||||
|
||||
- Integrates APEX fused layer normalization, which can have a positive impact on
|
||||
text-to-video models.
|
||||
|
||||
- Integrates APEX distributed fused LAMB and distributed fused ADAM, which can
|
||||
have a positive impact on BERT-L and Llama2-SFT.
|
||||
|
||||
- FlashAttention v3 has been integrated for AMD GPUs.
|
||||
|
||||
- `PyTorch C++ extensions <https://pytorch.org/tutorials/advanced/cpp_extension.html>`_
|
||||
provide a mechanism for compiling custom operations that can be used during
|
||||
network training or inference. For AMD platforms, ``amdclang++`` has been
|
||||
validated as the supported compiler for building these extensions.
|
||||
|
||||
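The TunableOp framework mentioned in the list above is typically enabled through environment variables before PyTorch is imported. A minimal sketch, assuming the standard TunableOp variable names:

.. code-block:: python

   import os

   # Assumption: these are the standard TunableOp environment variable names;
   # they must be set before torch is imported.
   os.environ.setdefault("PYTORCH_TUNABLEOP_ENABLED", "1")
   os.environ.setdefault("PYTORCH_TUNABLEOP_FILENAME", "tunableop_results.csv")

   import torch

   a = torch.randn(1024, 1024, device="cuda", dtype=torch.float16)
   b = torch.randn(1024, 1024, device="cuda", dtype=torch.float16)
   c = a @ b   # GEMMs routed through TunableOp are tuned and the results recorded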
Known issues and notes for PyTorch 2.7 with ROCm 7.0
|
||||
================================================================================
|
||||
|
||||
- The ``matmul.allow_fp16_reduced_precision_reduction`` and
|
||||
``matmul.allow_bf16_reduced_precision_reduction`` options under
|
||||
``torch.backends.cuda`` are not supported. As a result,
|
||||
reduced-precision reductions using FP16 or BF16 accumulation types are not
|
||||
available.
|
||||
@@ -1,111 +0,0 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: Ray deep learning framework compatibility
|
||||
:keywords: GPU, Ray compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
*******************************************************************************
|
||||
Ray compatibility
|
||||
*******************************************************************************
|
||||
|
||||
Ray is a unified framework for scaling AI and Python applications from your laptop
|
||||
to a full cluster, without changing your code. Ray consists of `a core distributed
|
||||
runtime <https://docs.ray.io/en/latest/ray-core/walkthrough.html>`_ and a set of
|
||||
`AI libraries <https://docs.ray.io/en/latest/ray-air/getting-started.html>`_ for
|
||||
simplifying machine learning computations.
|
||||
|
||||
Ray is a general-purpose framework that runs many types of workloads efficiently.
|
||||
Any Python application can be scaled with Ray, without extra infrastructure.
|
||||
|
||||
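For example, a minimal sketch of scaling a GPU task with Ray; this assumes the ROCm PyTorch wheel is available in the Ray environment, and relies on Ray scheduling AMD GPUs as ordinary ``GPU`` resources on ROCm.

.. code-block:: python

   import ray

   ray.init()

   @ray.remote(num_gpus=1)
   def gpu_task():
       import torch
       return torch.cuda.get_device_name(0)   # reports the AMD GPU on ROCm

   print(ray.get(gpu_task.remote()))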
ROCm support for Ray is upstreamed, and you can build the official source code
|
||||
with ROCm support:
|
||||
|
||||
- ROCm support for Ray is hosted in the official `https://github.com/ROCm/ray
|
||||
<https://github.com/ROCm/ray>`_ repository.
|
||||
|
||||
- Due to independent compatibility considerations, this location differs from the
|
||||
`https://github.com/ray-project/ray <https://github.com/ray-project/ray>`_ upstream repository.
|
||||
|
||||
- To install Ray, use the prebuilt :ref:`Docker image <ray-docker-compat>`
|
||||
which includes ROCm, Ray, and all required dependencies.
|
||||
|
||||
- See the :doc:`ROCm Ray installation guide <rocm-install-on-linux:install/3rd-party/ray-install>`
|
||||
for instructions to get started.
|
||||
|
||||
- See the `Installation section <https://docs.ray.io/en/latest/ray-overview/installation.html>`_
|
||||
in the upstream Ray documentation.
|
||||
|
||||
- The Docker image provided is based on the upstream Ray `Daily Release (Nightly) wheels <https://docs.ray.io/en/latest/ray-overview/installation.html#daily-releases-nightlies>`__
|
||||
corresponding to commit `005c372 <https://github.com/ray-project/ray/commit/005c372262e050d5745f475e22e64305fa07f8b8>`__.
|
||||
|
||||
.. note::
|
||||
|
||||
Ray is supported on ROCm 6.4.1.
|
||||
|
||||
Supported devices
|
||||
================================================================================
|
||||
|
||||
**Officially Supported**: AMD Instinct™ MI300X, MI210
|
||||
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
* The `Reinforcement Learning from Human Feedback on AMD GPUs with verl and ROCm
|
||||
Integration <https://rocm.blogs.amd.com/artificial-intelligence/verl-large-scale/README.html>`__
|
||||
blog provides an overview of Volcano Engine Reinforcement Learning (verl)
|
||||
for large language models (LLMs) and discusses its benefits in large-scale
|
||||
reinforcement learning from human feedback (RLHF). It uses Ray as part of a
|
||||
hybrid orchestration engine to schedule and coordinate training and inference
|
||||
tasks in parallel, enabling optimized resource utilization and potential overlap
|
||||
between these phases. This dynamic resource allocation strategy significantly
|
||||
improves overall system efficiency. The blog presents verl’s performance results,
|
||||
focusing on throughput and convergence accuracy achieved on AMD Instinct™ MI300X
|
||||
GPUs. Follow this guide to get started with verl on AMD Instinct GPUs and
|
||||
accelerate your RLHF training with ROCm-optimized performance.
|
||||
|
||||
* The `Exploring Use Cases for Scalable AI: Implementing Ray with ROCm Support for Efficient ML Workflows
|
||||
<https://rocm.blogs.amd.com/artificial-intelligence/rocm-ray/README.html>`__
|
||||
blog post describes key use cases such as training and inference for large language models (LLMs),
|
||||
model serving, hyperparameter tuning, reinforcement learning, and the orchestration of large-scale
|
||||
workloads using Ray in the ROCm environment.
|
||||
|
||||
For more use cases and recommendations, see the AMD GPU tabs in the `Accelerator Support
|
||||
topic <https://docs.ray.io/en/latest/ray-core/scheduling/accelerators.html#accelerator-support>`__
|
||||
of the Ray core documentation and refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__,
|
||||
where you can search for Ray examples and best practices to optimize your workloads on AMD GPUs.
|
||||
|
||||
.. _ray-docker-compat:
|
||||
|
||||
Docker image compatibility
|
||||
================================================================================
|
||||
|
||||
.. |docker-icon| raw:: html
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes ready-made `ROCm Ray Docker images <https://hub.docker.com/r/rocm/ray/tags>`__
|
||||
with ROCm backends on Docker Hub. The following Docker image tags and
|
||||
associated inventories represent the latest Ray version from the official Docker Hub and are validated for
|
||||
`ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`_. Click the |docker-icon|
|
||||
icon to view the image on Docker Hub.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:class: docker-image-compatibility
|
||||
|
||||
* - Docker image
|
||||
- Ray
|
||||
- PyTorch
|
||||
- Ubuntu
|
||||
- Python
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/ray/ray-2.48.0.post0_rocm6.4.1_ubuntu24.04_py3.12_pytorch2.6.0/images/sha256-0d166fe6bdced38338c78eedfb96eff92655fb797da3478a62dd636365133cc0"><i class="fab fa-docker fa-lg"></i> rocm/ray</a>
|
||||
- `2.48.0.post0 <https://github.com/ROCm/ray/tree/release/2.48.0.post0>`_
|
||||
- 2.6.0+git684f6f2
|
||||
- 24.04
|
||||
- `3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
|
||||
@@ -1,100 +0,0 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: Stanford Megatron-LM compatibility
|
||||
:keywords: Stanford, Megatron-LM, compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
********************************************************************************
|
||||
Stanford Megatron-LM compatibility
|
||||
********************************************************************************
|
||||
|
||||
Stanford Megatron-LM is a large-scale language model training framework developed by NVIDIA (`https://github.com/NVIDIA/Megatron-LM <https://github.com/NVIDIA/Megatron-LM>`_). It is
|
||||
designed to train massive transformer-based language models efficiently using model and data parallelism.
|
||||
|
||||
* ROCm support for Stanford Megatron-LM is hosted in the official `https://github.com/ROCm/Stanford-Megatron-LM <https://github.com/ROCm/Stanford-Megatron-LM>`_ repository.
|
||||
* Due to independent compatibility considerations, this location differs from the `https://github.com/stanford-futuredata/Megatron-LM <https://github.com/stanford-futuredata/Megatron-LM>`_ upstream repository.
|
||||
* Use the prebuilt :ref:`Docker image <megatron-lm-docker-compat>` with ROCm, PyTorch, and Megatron-LM preinstalled.
|
||||
* See the :doc:`ROCm Stanford Megatron-LM installation guide <rocm-install-on-linux:install/3rd-party/stanford-megatron-lm-install>` to install and get started.
|
||||
|
||||
.. note::
|
||||
|
||||
Stanford Megatron-LM is supported on ROCm 6.3.0.
|
||||
|
||||
|
||||
Supported devices
|
||||
================================================================================
|
||||
|
||||
- **Officially Supported**: AMD Instinct MI300X
|
||||
- **Partially Supported** (functionality or performance limitations): AMD Instinct MI250X, MI210X
|
||||
|
||||
|
||||
Supported models and features
|
||||
================================================================================
|
||||
|
||||
This section details the models and features supported by the ROCm version of Stanford Megatron-LM.
|
||||
|
||||
Models:
|
||||
|
||||
* BERT
|
||||
* GPT
|
||||
* T5
|
||||
* ICT
|
||||
|
||||
Features:
|
||||
|
||||
* Distributed Pre-training
|
||||
* Activation Checkpointing and Recomputation
|
||||
* Distributed Optimizer
|
||||
* Mixture-of-Experts
|
||||
|
||||
.. _megatron-lm-recommendations:
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
See the `Efficient MoE training on AMD ROCm: How-to use Megablocks on AMD GPUs blog <https://rocm.blogs.amd.com/artificial-intelligence/megablocks/README.html>`_ post
|
||||
to learn how to leverage the ROCm platform for pre-training with the Stanford Megatron-LM framework, including pre-processing datasets on AMD GPUs.
|
||||
Coverage includes:
|
||||
|
||||
* Single-GPU pre-training
|
||||
* Multi-GPU pre-training
|
||||
|
||||
|
||||
.. _megatron-lm-docker-compat:
|
||||
|
||||
Docker image compatibility
|
||||
================================================================================
|
||||
|
||||
.. |docker-icon| raw:: html
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes `Stanford Megatron-LM images <https://hub.docker.com/r/rocm/megatron-lm>`_
|
||||
with ROCm and PyTorch backends on Docker Hub. The following Docker image tags and associated
|
||||
inventories represent the latest Megatron-LM version from the official Docker Hub.
|
||||
The Docker images have been validated for `ROCm 6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`_.
|
||||
Click |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:class: docker-image-compatibility
|
||||
|
||||
* - Docker image
|
||||
- Stanford Megatron-LM
|
||||
- PyTorch
|
||||
- Ubuntu
|
||||
- Python
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/stanford-megatron-lm/stanford-megatron-lm85f95ae_rocm6.3.0_ubuntu24.04_py3.12_pytorch2.4.0/images/sha256-070556f078be10888a1421a2cb4f48c29f28b02bfeddae02588d1f7fc02a96a6"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `85f95ae <https://github.com/stanford-futuredata/Megatron-LM/commit/85f95aef3b648075fe6f291c86714fdcbd9cd1f5>`_
|
||||
- `2.4.0 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
|
||||
- 24.04
|
||||
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
|
||||
|
||||
|
||||
|
||||
@@ -1,76 +0,0 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: Taichi compatibility
|
||||
:keywords: GPU, Taichi compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
*******************************************************************************
|
||||
Taichi compatibility
|
||||
*******************************************************************************
|
||||
|
||||
`Taichi <https://www.taichi-lang.org/>`_ is an open-source, imperative, and parallel
|
||||
programming language designed for high-performance numerical computation.
|
||||
Embedded in Python, it leverages just-in-time (JIT) compilation frameworks such as LLVM to accelerate
|
||||
compute-intensive Python code by compiling it to native GPU or CPU instructions.
|
||||
|
||||
Taichi is widely used across various domains, including real-time physical simulation,
|
||||
numerical computing, augmented reality, artificial intelligence, computer vision, robotics,
|
||||
visual effects in film and gaming, and general-purpose computing.
|
||||
|
||||
* ROCm support for Taichi is hosted in the official `https://github.com/ROCm/taichi <https://github.com/ROCm/taichi>`_ repository.
|
||||
* Due to independent compatibility considerations, this location differs from the `https://github.com/taichi-dev <https://github.com/taichi-dev>`_ upstream repository.
|
||||
* Use the prebuilt :ref:`Docker image <taichi-docker-compat>` with ROCm, PyTorch, and Taichi preinstalled.
|
||||
* See the :doc:`ROCm Taichi installation guide <rocm-install-on-linux:install/3rd-party/taichi-install>` to install and get started.
|
||||
|
||||
.. note::
|
||||
|
||||
Taichi is supported on ROCm 6.3.2.
|
||||
|
||||
Supported devices and features
|
||||
===============================================================================
|
||||
The ROCm software stack supports all Taichi GPU features on AMD Instinct MI250X and MI210X series GPUs, with the exception of Taichi's GPU rendering system, CGUI.
|
||||
AMD Instinct MI300X series GPUs will be supported by November.
|
||||
|
||||
.. _taichi-recommendations:
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
To fully leverage Taichi's performance capabilities in compute-intensive tasks, it is best to adhere to specific coding patterns and use Taichi decorators (see the sketch below).
|
||||
A collection of example use cases is available in the `https://github.com/ROCm/taichi_examples <https://github.com/ROCm/taichi_examples>`_ repository,
|
||||
providing practical insights and foundational knowledge for working with the Taichi programming language.
|
||||
You can also refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`_ to search for Taichi examples and best practices to optimize your workflows on AMD GPUs.
|
||||
|
||||
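A minimal sketch of the decorator-based pattern referred to above:

.. code-block:: python

   import taichi as ti

   ti.init(arch=ti.gpu)   # selects the available GPU backend (ROCm on AMD GPUs)

   n = 1024
   x = ti.field(dtype=ti.f32, shape=n)

   @ti.kernel
   def fill_squares():
       for i in x:        # struct-for loop, parallelized by Taichi
           x[i] = i * i

   fill_squares()
   print(x[10])           # 100.0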
.. _taichi-docker-compat:
|
||||
|
||||
Docker image compatibility
|
||||
================================================================================
|
||||
|
||||
.. |docker-icon| raw:: html
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes ready-made `ROCm Taichi Docker images <https://hub.docker.com/r/rocm/taichi/tags>`_
|
||||
with ROCm backends on Docker Hub. The following Docker image tags and associated inventories
|
||||
represent the latest Taichi version from the official Docker Hub.
|
||||
The Docker images have been validated for `ROCm 6.3.2 <https://rocm.docs.amd.com/en/docs-6.3.2/about/release-notes.html>`_.
|
||||
Click |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:class: docker-image-compatibility
|
||||
|
||||
* - Docker image
|
||||
- ROCm
|
||||
- Taichi
|
||||
- Ubuntu
|
||||
- Python
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/taichi/taichi-1.8.0b1_rocm6.3.2_ubuntu22.04_py3.10.12/images/sha256-e016964a751e6a92199032d23e70fa3a564fff8555afe85cd718f8aa63f11fc6"><i class="fab fa-docker fa-lg"></i> rocm/taichi</a>
|
||||
- `6.3.2 <https://repo.radeon.com/rocm/apt/6.3.2/>`_
|
||||
- `1.8.0b1 <https://github.com/taichi-dev/taichi>`_
|
||||
- 22.04
|
||||
- `3.10.12 <https://www.python.org/downloads/release/python-31012/>`_
|
||||
@@ -1,504 +0,0 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: TensorFlow compatibility
|
||||
:keywords: GPU, TensorFlow compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
*******************************************************************************
|
||||
TensorFlow compatibility
|
||||
*******************************************************************************
|
||||
|
||||
`TensorFlow <https://www.tensorflow.org/>`__ is an open-source library for
|
||||
solving machine learning, deep learning, and AI problems. It can solve many
|
||||
problems across different sectors and industries but primarily focuses on
|
||||
neural network training and inference. It is one of the most popular and
|
||||
in-demand frameworks and is very active in open-source contribution and
|
||||
development.
|
||||
|
||||
The `official TensorFlow repository <http://github.com/tensorflow/tensorflow>`__
|
||||
includes full ROCm support. AMD maintains a TensorFlow `ROCm repository
|
||||
<http://github.com/rocm/tensorflow-upstream>`__ in order to quickly add bug
|
||||
fixes, updates, and support for the latest ROCm versions.
|
||||
|
||||
- ROCm TensorFlow release:
|
||||
|
||||
- Offers :ref:`Docker images <tensorflow-docker-compat>` with
|
||||
ROCm and TensorFlow pre-installed.
|
||||
|
||||
- ROCm TensorFlow repository: `<https://github.com/ROCm/tensorflow-upstream>`__
|
||||
|
||||
- See the :doc:`ROCm TensorFlow installation guide <rocm-install-on-linux:install/3rd-party/tensorflow-install>`
|
||||
to get started.
|
||||
|
||||
- Official TensorFlow release:
|
||||
|
||||
- Official TensorFlow repository: `<https://github.com/tensorflow/tensorflow>`__
|
||||
|
||||
- See the `TensorFlow API versions <https://www.tensorflow.org/versions>`__ list.
|
||||
|
||||
.. note::
|
||||
|
||||
The official TensorFlow documentation does not cover ROCm support. Use the
|
||||
ROCm documentation for installation instructions for TensorFlow on ROCm.
|
||||
See :doc:`rocm-install-on-linux:install/3rd-party/tensorflow-install`.
|
||||
|
||||
.. _tensorflow-docker-compat:
|
||||
|
||||
Docker image compatibility
|
||||
===============================================================================
|
||||
|
||||
.. |docker-icon| raw:: html
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes ready-made `TensorFlow images
|
||||
<https://hub.docker.com/r/rocm/tensorflow>`__ with ROCm backends on
|
||||
Docker Hub. The following Docker image tags and associated inventories are
|
||||
validated for `ROCm 6.4.2 <https://repo.radeon.com/rocm/apt/6.4.2/>`__. Click
|
||||
the |docker-icon| icon to view the image on Docker Hub.
|
||||
|
||||
.. list-table:: TensorFlow Docker image components
|
||||
:header-rows: 1
|
||||
|
||||
* - Docker image
|
||||
- TensorFlow
|
||||
- Ubuntu
|
||||
- Python
|
||||
- TensorBoard
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.2-py3.12-tf2.18-dev/images/sha256-96754ce2d30f729e19b497279915b5212ba33d5e408e7e5dd3f2304d87e3441e"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
|
||||
|
||||
- `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.2/tensorflow_rocm-2.18.1-cp312-cp312-manylinux_2_28_x86_64.whl>`__
|
||||
- 24.04
|
||||
- `Python 3.12 <https://www.python.org/downloads/release/python-31210/>`__
|
||||
- `TensorBoard 2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.2-py3.10-tf2.18-dev/images/sha256-fa741508d383858e86985a9efac85174529127408102558ae2e3a4ac894eea1e"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
|
||||
|
||||
- `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.2/tensorflow_rocm-2.18.1-cp310-cp310-manylinux_2_28_x86_64.whl>`__
|
||||
- 22.04
|
||||
- `Python 3.10 <https://www.python.org/downloads/release/python-31017/>`__
|
||||
- `TensorBoard 2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.2-py3.12-tf2.17-dev/images/sha256-3a0aef09f2a8833c2b64b85874dd9449ffc2ad257351857338ff5b706c03a418"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
|
||||
|
||||
- `tensorflow-rocm 2.17.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.2/tensorflow_rocm-2.17.1-cp312-cp312-manylinux_2_28_x86_64.whl>`__
|
||||
- 24.04
|
||||
- `Python 3.12 <https://www.python.org/downloads/release/python-31210/>`__
|
||||
- `TensorBoard 2.17.1 <https://github.com/tensorflow/tensorboard/tree/2.17.1>`__
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.2-py3.10-tf2.17-dev/images/sha256-bc7341a41ebe7ab261aa100732874507c452421ef733e408ac4f05ed453b0bc5"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
|
||||
|
||||
- `tensorflow-rocm 2.17.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.2/tensorflow_rocm-2.17.1-cp310-cp310-manylinux_2_28_x86_64.whl>`__
|
||||
- 22.04
|
||||
- `Python 3.10 <https://www.python.org/downloads/release/python-31017/>`__
|
||||
- `TensorBoard 2.17.1 <https://github.com/tensorflow/tensorboard/tree/2.17.1>`__
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.2-py3.12-tf2.16-dev/images/sha256-4841a8df7c340dab79bf9362dad687797649a00d594e0832eb83ea6880a40d3b"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
|
||||
|
||||
- `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.2/tensorflow_rocm-2.16.2-cp312-cp312-manylinux_2_28_x86_64.whl>`__
|
||||
- 24.04
|
||||
- `Python 3.12 <https://www.python.org/downloads/release/python-31210/>`__
|
||||
- `TensorBoard 2.16.2 <https://github.com/tensorflow/tensorboard/tree/2.16.2>`__
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.2-py3.10-tf2.16-dev/images/sha256-883fa95aba960c58a3e46fceaa18f03ede2c7df89b8e9fd603ab2d47e0852897"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
|
||||
|
||||
- `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.2/tensorflow_rocm-2.16.2-cp310-cp310-manylinux_2_28_x86_64.whl>`__
|
||||
- 22.04
|
||||
- `Python 3.10 <https://www.python.org/downloads/release/python-31017/>`__
|
||||
- `TensorBoard 2.16.2 <https://github.com/tensorflow/tensorboard/tree/2.16.2>`__
|
||||
|
||||
|
||||
Critical ROCm libraries for TensorFlow
|
||||
===============================================================================
|
||||
|
||||
TensorFlow depends on multiple components, and the supported features of those
components can affect the set of TensorFlow features supported on ROCm. The versions
|
||||
in the following table refer to the first TensorFlow version where the ROCm
|
||||
library was introduced as a dependency. The versions described
|
||||
are available in ROCm :version:`rocm_version`.
|
||||
|
||||
.. list-table::
|
||||
:widths: 25, 10, 35, 30
|
||||
:header-rows: 1
|
||||
|
||||
* - ROCm library
|
||||
- Version
|
||||
- Purpose
|
||||
- Used in
|
||||
* - `hipBLAS <https://github.com/ROCm/hipBLAS>`__
|
||||
- :version-ref:`hipBLAS rocm_version`
|
||||
- Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
|
||||
matrix and vector operations.
|
||||
- Accelerates operations like ``tf.matmul``, ``tf.linalg.matmul``, and
|
||||
other matrix multiplications commonly used in neural network layers.
|
||||
* - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
|
||||
- :version-ref:`hipBLASLt rocm_version`
|
||||
- Extends hipBLAS with additional optimizations like fused kernels and
|
||||
integer tensor cores.
|
||||
- Optimizes matrix multiplications and linear algebra operations used in
|
||||
layers like dense, convolutional, and RNNs in TensorFlow.
|
||||
* - `hipCUB <https://github.com/ROCm/hipCUB>`__
|
||||
- :version-ref:`hipCUB rocm_version`
|
||||
- Provides a C++ template library for parallel algorithms for reduction,
|
||||
scan, sort and select.
|
||||
- Supports operations like ``tf.reduce_sum``, ``tf.cumsum``, ``tf.sort``
|
||||
and other tensor operations in TensorFlow, especially those involving
|
||||
scanning, sorting, and filtering.
|
||||
* - `hipFFT <https://github.com/ROCm/hipFFT>`__
|
||||
- :version-ref:`hipFFT rocm_version`
|
||||
- Accelerates Fast Fourier Transforms (FFT) for signal processing tasks.
|
||||
- Used for operations like signal processing, image filtering, and
|
||||
certain types of neural networks requiring FFT-based transformations.
|
||||
* - `hipSOLVER <https://github.com/ROCm/hipSOLVER>`__
|
||||
- :version-ref:`hipSOLVER rocm_version`
|
||||
- Provides GPU-accelerated direct linear solvers for dense and sparse
|
||||
systems.
|
||||
- Optimizes linear algebra functions such as solving systems of linear
|
||||
equations, often used in optimization and training tasks.
|
||||
* - `hipSPARSE <https://github.com/ROCm/hipSPARSE>`__
|
||||
- :version-ref:`hipSPARSE rocm_version`
|
||||
- Optimizes sparse matrix operations for efficient computations on sparse
|
||||
data.
|
||||
- Accelerates sparse matrix operations in models with sparse weight
|
||||
matrices or activations, commonly used in neural networks.
|
||||
* - `MIOpen <https://github.com/ROCm/MIOpen>`__
|
||||
- :version-ref:`MIOpen rocm_version`
|
||||
- Provides optimized deep learning primitives such as convolutions,
|
||||
pooling,
|
||||
normalization, and activation functions.
|
||||
- Speeds up convolutional neural networks (CNNs) and other layers. Used
|
||||
in TensorFlow for layers like ``tf.nn.conv2d``, ``tf.nn.relu``, and
|
||||
``tf.nn.lstm_cell``.
|
||||
* - `RCCL <https://github.com/ROCm/rccl>`__
|
||||
- :version-ref:`RCCL rocm_version`
|
||||
- Optimizes multi-GPU communication for operations like AllReduce and
|
||||
Broadcast.
|
||||
- Distributed data parallel training (``tf.distribute.MirroredStrategy``).
|
||||
Handles communication in multi-GPU setups.
|
||||
* - `rocThrust <https://github.com/ROCm/rocThrust>`__
|
||||
- :version-ref:`rocThrust rocm_version`
|
||||
- Provides a C++ template library for parallel algorithms like sorting,
|
||||
reduction, and scanning.
|
||||
- Reduction operations like ``tf.reduce_sum``, ``tf.cumsum`` for computing
the cumulative sum of elements along a given axis, or ``tf.unique`` for
finding unique elements in a tensor can use rocThrust.
|
||||
|
||||
Supported and unsupported features
|
||||
===============================================================================
|
||||
|
||||
The following section maps supported data types and GPU-accelerated TensorFlow
|
||||
features to their minimum supported ROCm and TensorFlow versions.
|
||||
|
||||
Data types
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The data type of a tensor is specified using the ``dtype`` attribute or
|
||||
argument, and TensorFlow supports a wide range of data types for different use
|
||||
cases.
|
||||
|
||||
The basic, single data types of `tf.dtypes <https://www.tensorflow.org/api_docs/python/tf/dtypes>`__
|
||||
are as follows:
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Data type
|
||||
- Description
|
||||
- Since TensorFlow
|
||||
- Since ROCm
|
||||
* - ``bfloat16``
|
||||
- 16-bit bfloat (brain floating point).
|
||||
- 1.0.0
|
||||
- 1.7
|
||||
* - ``bool``
|
||||
- Boolean.
|
||||
- 1.0.0
|
||||
- 1.7
|
||||
* - ``complex128``
|
||||
- 128-bit complex.
|
||||
- 1.0.0
|
||||
- 1.7
|
||||
* - ``complex64``
|
||||
- 64-bit complex.
|
||||
- 1.0.0
|
||||
- 1.7
|
||||
* - ``double``
|
||||
- 64-bit (double precision) floating-point.
|
||||
- 1.0.0
|
||||
- 1.7
|
||||
* - ``float16``
|
||||
- 16-bit (half precision) floating-point.
|
||||
- 1.0.0
|
||||
- 1.7
|
||||
* - ``float32``
|
||||
- 32-bit (single precision) floating-point.
|
||||
- 1.0.0
|
||||
- 1.7
|
||||
* - ``float64``
|
||||
- 64-bit (double precision) floating-point.
|
||||
- 1.0.0
|
||||
- 1.7
|
||||
* - ``half``
|
||||
- 16-bit (half precision) floating-point.
|
||||
- 2.0.0
|
||||
- 2.0
|
||||
* - ``int16``
|
||||
- Signed 16-bit integer.
|
||||
- 1.0.0
|
||||
- 1.7
|
||||
* - ``int32``
|
||||
- Signed 32-bit integer.
|
||||
- 1.0.0
|
||||
- 1.7
|
||||
* - ``int64``
|
||||
- Signed 64-bit integer.
|
||||
- 1.0.0
|
||||
- 1.7
|
||||
* - ``int8``
|
||||
- Signed 8-bit integer.
|
||||
- 1.0.0
|
||||
- 1.7
|
||||
* - ``qint16``
|
||||
- Signed quantized 16-bit integer.
|
||||
- 1.0.0
|
||||
- 1.7
|
||||
* - ``qint32``
|
||||
- Signed quantized 32-bit integer.
|
||||
- 1.0.0
|
||||
- 1.7
|
||||
* - ``qint8``
|
||||
- Signed quantized 8-bit integer.
|
||||
- 1.0.0
|
||||
- 1.7
|
||||
* - ``quint16``
|
||||
- Unsigned quantized 16-bit integer.
|
||||
- 1.0.0
|
||||
- 1.7
|
||||
* - ``quint8``
|
||||
- Unsigned quantized 8-bit integer.
|
||||
- 1.0.0
|
||||
- 1.7
|
||||
* - ``resource``
|
||||
- Handle to a mutable, dynamically allocated resource.
|
||||
- 1.0.0
|
||||
- 1.7
|
||||
* - ``string``
|
||||
- Variable-length string, represented as byte array.
|
||||
- 1.0.0
|
||||
- 1.7
|
||||
* - ``uint16``
|
||||
- Unsigned 16-bit (word) integer.
|
||||
- 1.0.0
|
||||
- 1.7
|
||||
* - ``uint32``
|
||||
- Unsigned 32-bit (dword) integer.
|
||||
- 1.5.0
|
||||
- 1.7
|
||||
* - ``uint64``
|
||||
- Unsigned 64-bit (qword) integer.
|
||||
- 1.5.0
|
||||
- 1.7
|
||||
* - ``uint8``
|
||||
- Unsigned 8-bit (byte) integer.
|
||||
- 1.0.0
|
||||
- 1.7
|
||||
* - ``variant``
|
||||
- Data of arbitrary type (known at runtime).
|
||||
- 1.4.0
|
||||
- 1.7
|
||||
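
As a quick, hedged illustration of how these types are used in practice (standard TensorFlow API, assuming a working TensorFlow-ROCm installation; tensor values are arbitrary):

.. code-block:: python

   import tensorflow as tf

   # Create tensors with explicit data types.
   a = tf.constant([1.0, 2.0, 3.0], dtype=tf.float16)
   b = tf.constant([4, 5, 6], dtype=tf.int32)

   # Cast and combine; the computation runs on a ROCm GPU when one is visible.
   print(tf.reduce_sum(a + tf.cast(b, tf.float16)))
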
|
||||
Features
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
This table provides an overview of key features in TensorFlow and their
|
||||
availability in ROCm.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Module
|
||||
- Description
|
||||
- Since TensorFlow
|
||||
- Since ROCm
|
||||
* - ``tf.linalg`` (Linear Algebra)
|
||||
- Operations for matrix and tensor computations, such as
|
||||
``tf.linalg.matmul`` (matrix multiplication), ``tf.linalg.inv``
|
||||
(matrix inversion) and ``tf.linalg.cholesky`` (Cholesky decomposition).
|
||||
These leverage GPUs for high-performance linear algebra operations.
|
||||
- 1.4
|
||||
- 1.8.2
|
||||
* - ``tf.nn`` (Neural Network Operations)
|
||||
- GPU-accelerated building blocks for deep learning models, such as 2D
|
||||
convolutions with ``tf.nn.conv2d``, max pooling operations with
|
||||
``tf.nn.max_pool``, activation functions like ``tf.nn.relu`` or softmax
|
||||
for output layers with ``tf.nn.softmax``.
|
||||
- 1.0
|
||||
- 1.8.2
|
||||
* - ``tf.image`` (Image Processing)
|
||||
- GPU-accelerated functions for image preprocessing and augmentations,
|
||||
such as resize images with ``tf.image.resize``, flip images horizontally
|
||||
with ``tf.image.flip_left_right`` and adjust image brightness randomly
|
||||
with ``tf.image.random_brightness``.
|
||||
- 1.1
|
||||
- 1.8.2
|
||||
* - ``tf.keras`` (High-Level API)
|
||||
- GPU acceleration for Keras layers and models, including dense layers
|
||||
(``tf.keras.layers.Dense``), convolutional layers
|
||||
(``tf.keras.layers.Conv2D``) and recurrent layers
|
||||
(``tf.keras.layers.LSTM``).
|
||||
- 1.4
|
||||
- 1.8.2
|
||||
* - ``tf.math`` (Mathematical Operations)
|
||||
- GPU-accelerated mathematical operations, such as sum across dimensions
|
||||
with ``tf.math.reduce_sum``, elementwise exponentiation with
|
||||
``tf.math.exp`` and sigmoid activation (``tf.math.sigmoid``).
|
||||
- 1.5
|
||||
- 1.8.2
|
||||
* - ``tf.signal`` (Signal Processing)
|
||||
- Functions for spectral analysis and signal transformations.
|
||||
- 1.13
|
||||
- 2.1
|
||||
* - ``tf.data`` (Data Input Pipeline)
|
||||
- GPU-accelerated data preprocessing for efficient input pipelines, including
prefetching with ``tf.data.experimental.AUTOTUNE`` and GPU-enabled
transformations like map and batch.
|
||||
- 1.4
|
||||
- 1.8.2
|
||||
* - ``tf.distribute`` (Distributed Training)
|
||||
- Enables scaling computations across multiple devices on a single
machine or across multiple machines.
|
||||
- 1.13
|
||||
- 2.1
|
||||
* - ``tf.random`` (Random Number Generation)
|
||||
- GPU-accelerated random number generation.
|
||||
- 1.12
|
||||
- 1.9.2
|
||||
* - ``tf.TensorArray`` (Dynamic Array Operations)
|
||||
- Enables dynamic tensor manipulation on GPUs.
|
||||
- 1.0
|
||||
- 1.8.2
|
||||
* - ``tf.sparse`` (Sparse Tensor Operations)
|
||||
- GPU-accelerated sparse matrix manipulations.
|
||||
- 1.9
|
||||
- 1.9.0
|
||||
* - ``tf.experimental.numpy``
|
||||
- GPU-accelerated NumPy-like API for numerical computations.
|
||||
- 2.4
|
||||
- 4.1.1
|
||||
* - ``tf.RaggedTensor``
|
||||
- Handling of variable-length sequences and ragged tensors with GPU
|
||||
support.
|
||||
- 1.13
|
||||
- 2.1
|
||||
* - ``tf.function`` with XLA (Accelerated Linear Algebra)
|
||||
- Enables XLA compilation of ``tf.function`` graphs for optimized GPU execution.
|
||||
- 1.14
|
||||
- 2.4
|
||||
* - ``tf.quantization``
|
||||
- Quantized operations for inference, accelerated on GPUs.
|
||||
- 1.12
|
||||
- 1.9.2
|
||||
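
For example, XLA compilation of a ``tf.function`` can be requested with the standard ``jit_compile`` flag. This is a minimal sketch using only the public TensorFlow API; actual speedups depend on the model and the ROCm version in use:

.. code-block:: python

   import tensorflow as tf

   @tf.function(jit_compile=True)  # Ask XLA to compile this function.
   def scaled_matmul(a, b):
       return tf.linalg.matmul(a, b) * 2.0

   a = tf.random.normal((256, 256))
   b = tf.random.normal((256, 256))
   print(scaled_matmul(a, b).shape)
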
|
||||
Distributed library features
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Enables developers to scale computations across multiple devices on a single machine or
|
||||
across multiple machines.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Feature
|
||||
- Description
|
||||
- Since TensorFlow
|
||||
- Since ROCm
|
||||
* - ``MultiWorkerMirroredStrategy``
|
||||
- Synchronous training across multiple workers using mirrored variables.
|
||||
- 2.0
|
||||
- 3.0
|
||||
* - ``MirroredStrategy``
|
||||
- Synchronous training across multiple GPUs on one machine.
|
||||
- 1.5
|
||||
- 2.5
|
||||
* - ``TPUStrategy``
|
||||
- Efficiently trains models on Google TPUs.
|
||||
- 1.9
|
||||
- ❌
|
||||
* - ``ParameterServerStrategy``
|
||||
- Asynchronous training using parameter servers for variable management.
|
||||
- 2.1
|
||||
- 4.0
|
||||
* - ``CentralStorageStrategy``
|
||||
- Keeps variables on a single device and performs computation on multiple
|
||||
devices.
|
||||
- 2.3
|
||||
- 4.1
|
||||
* - ``CollectiveAllReduceStrategy``
|
||||
- Synchronous training across multiple devices and hosts.
|
||||
- 1.14
|
||||
- 3.5
|
||||
* - Distribution Strategies API
|
||||
- High-level API to simplify distributed training configuration and
|
||||
execution.
|
||||
- 1.10
|
||||
- 3.0
|
||||
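
As a sketch of how single-machine data parallelism is typically set up (standard TensorFlow/Keras API; the model, shapes, and optimizer are placeholders, and multi-GPU behavior depends on the ROCm devices visible to the runtime):

.. code-block:: python

   import tensorflow as tf

   # Mirror variables across all GPUs visible on this machine.
   strategy = tf.distribute.MirroredStrategy()
   print("Replicas in sync:", strategy.num_replicas_in_sync)

   with strategy.scope():
       # Variables created here are replicated on every device.
       model = tf.keras.Sequential([
           tf.keras.layers.Dense(64, activation="relu"),
           tf.keras.layers.Dense(1),
       ])
       model.compile(optimizer="adam", loss="mse")

   # model.fit(dataset, epochs=5)  # Gradients are all-reduced across replicas.
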
|
||||
Unsupported TensorFlow features
|
||||
===============================================================================
|
||||
|
||||
The following are GPU-accelerated TensorFlow features not currently supported by
|
||||
ROCm.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Feature
|
||||
- Description
|
||||
- Since TensorFlow
|
||||
* - Mixed Precision with TF32
|
||||
- Mixed precision with TF32 is used for matrix multiplications,
|
||||
convolutions, and other linear algebra operations, particularly in
|
||||
deep learning workloads like CNNs and transformers.
|
||||
- 2.4
|
||||
* - ``tf.distribute.TPUStrategy``
|
||||
- Efficiently trains models on Google TPUs.
|
||||
- 1.9
|
||||
|
||||
Use cases and recommendations
|
||||
===============================================================================
|
||||
|
||||
* The `Training a Neural Collaborative Filtering (NCF) Recommender on an AMD
|
||||
GPU <https://rocm.blogs.amd.com/artificial-intelligence/ncf/README.html>`__
|
||||
blog post discusses training an NCF recommender system using TensorFlow. It
|
||||
explains how NCF improves traditional collaborative filtering methods by
|
||||
leveraging neural networks to model non-linear user-item interactions. The
|
||||
post outlines the implementation using the recommenders library, focusing on
|
||||
the use of implicit data (for example, user interactions like viewing or
|
||||
purchasing) and how it addresses challenges like the lack of negative values.
|
||||
|
||||
* The `Creating a PyTorch/TensorFlow code environment on AMD GPUs
|
||||
<https://rocm.blogs.amd.com/software-tools-optimization/pytorch-tensorflow-env/README.html>`__
|
||||
blog post provides instructions for creating a machine learning environment
|
||||
for PyTorch and TensorFlow on AMD GPUs using ROCm. It covers steps like
|
||||
installing the libraries, cloning code repositories, installing dependencies,
|
||||
and troubleshooting potential issues with CUDA-based code. Additionally, it
|
||||
explains how to HIPify code (port CUDA code to HIP) and manage Docker images
|
||||
for a better experience on AMD GPUs. This guide aims to help data scientists
|
||||
and ML practitioners adapt their code for AMD GPUs.
|
||||
|
||||
For more use cases and recommendations, see the `ROCm Tensorflow blog posts <https://rocm.blogs.amd.com/blog/tag/tensorflow.html>`__.
|
||||
@@ -1,86 +0,0 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: verl compatibility
|
||||
:keywords: GPU, verl compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
*******************************************************************************
|
||||
verl compatibility
|
||||
*******************************************************************************
|
||||
|
||||
Volcano Engine Reinforcement Learning for LLMs (verl) is a reinforcement learning framework designed for large language models (LLMs).
|
||||
verl offers a scalable, open-source fine-tuning solution optimized for AMD Instinct GPUs with full ROCm support.
|
||||
|
||||
* See the `verl documentation <https://verl.readthedocs.io/en/latest/>`_ for more information about verl.
|
||||
* The official verl GitHub repository is `volcengine/verl <https://github.com/volcengine/verl>`_.
|
||||
* Use the AMD-validated :ref:`Docker images <verl-docker-compat>` with ROCm and verl preinstalled.
|
||||
* See the :doc:`ROCm verl installation guide <rocm-install-on-linux:install/3rd-party/verl-install>` to install and get started.
|
||||
|
||||
.. note::
|
||||
|
||||
verl is supported on ROCm 6.2.0.
|
||||
|
||||
.. _verl-recommendations:
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
The benefits of verl in large-scale reinforcement learning from human feedback (RLHF) are discussed in the `Reinforcement Learning from Human Feedback on AMD GPUs with verl and ROCm Integration <https://rocm.blogs.amd.com/artificial-intelligence/verl-large-scale/README.html>`_ blog.
|
||||
|
||||
.. _verl-supported_features:
|
||||
|
||||
Supported features
|
||||
===============================================================================
|
||||
|
||||
The following table shows verl support on ROCm for GPU-accelerated modules.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Module
|
||||
- Description
|
||||
- verl version
|
||||
- ROCm version
|
||||
* - ``FSDP``
|
||||
- Training engine
|
||||
- 0.3.0.post0
|
||||
- 6.2.0
|
||||
* - ``vllm``
|
||||
- Inference engine
|
||||
- 0.3.0.post0
|
||||
- 6.2.0
|
||||
|
||||
.. _verl-docker-compat:
|
||||
|
||||
Docker image compatibility
|
||||
================================================================================
|
||||
|
||||
.. |docker-icon| raw:: html
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes ready-made `ROCm verl Docker images <https://hub.docker.com/r/rocm/verl/tags>`_
|
||||
with ROCm backends on Docker Hub. The following Docker image tags and associated inventories represent the available verl versions from the official Docker Hub.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Docker image
|
||||
- ROCm
|
||||
- verl
|
||||
- Ubuntu
|
||||
- PyTorch
|
||||
- Python
|
||||
- vllm
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/verl/verl-0.3.0.post0_rocm6.2_vllm0.6.3/images/sha256-cbe423803fd7850448b22444176bee06f4dcf22cd3c94c27732752d3a39b04b2"><i class="fab fa-docker fa-lg"></i> rocm/verl</a>
|
||||
- `6.2.0 <https://repo.radeon.com/rocm/apt/6.2/>`_
|
||||
- `0.3.0.post0 <https://github.com/volcengine/verl/releases/tag/v0.3.0.post0>`_
|
||||
- 20.04
|
||||
- `2.5.0 <https://github.com/ROCm/pytorch/tree/release/2.5>`_
|
||||
- `3.9.19 <https://www.python.org/downloads/release/python-3919/>`_
|
||||
- `0.6.3 <https://github.com/vllm-project/vllm/releases/tag/v0.6.3>`_
|
||||
@@ -1,407 +0,0 @@
|
||||
.. meta::
|
||||
:description: Using CMake
|
||||
:keywords: CMake, dependencies, HIP, C++, AMD, ROCm
|
||||
|
||||
*********************************
|
||||
Using CMake
|
||||
*********************************
|
||||
|
||||
Most components in ROCm support CMake. Projects depending on header-only or
library components typically require CMake 3.5 or higher, whereas those that use
the CMake HIP language support require CMake 3.21 or higher.
|
||||
|
||||
Finding dependencies
|
||||
====================
|
||||
|
||||
.. note::
|
||||
|
||||
For a complete
|
||||
reference on how to deal with dependencies in CMake, refer to the CMake docs
|
||||
on `find_package
|
||||
<https://cmake.org/cmake/help/latest/command/find_package.html>`_ and the
|
||||
`Using Dependencies Guide
|
||||
<https://cmake.org/cmake/help/latest/guide/using-dependencies/index.html>`_
|
||||
to get an overview of CMake related facilities.
|
||||
|
||||
In short, CMake supports finding dependencies in two ways:
|
||||
|
||||
* In Module mode, it consults a file ``Find<PackageName>.cmake`` which tries to find the component
|
||||
in typical install locations and layouts. CMake ships a few dozen such scripts, but users and projects
|
||||
may ship them as well.
|
||||
|
||||
* In Config mode, it locates a file named ``<packagename>-config.cmake`` or
|
||||
``<PackageName>Config.cmake`` which describes the installed component in all regards needed to
|
||||
consume it.
|
||||
|
||||
ROCm predominantly relies on Config mode, one notable exception being the Module
driving the compilation of HIP programs on NVIDIA runtimes. As such, when
dependencies are not found in standard system locations, either instruct CMake
to search for package config files in additional folders using the
``CMAKE_PREFIX_PATH`` variable (a semicolon-separated list of file system
paths), or set the ``<PackageName>_ROOT`` variable on a project-specific basis.
|
||||
|
||||
There are nearly a dozen ways to set these variables. One may be more convenient
than another depending on your workflow. Conceptually the simplest is adding
it to your CMake configuration command on the command line via
``-D CMAKE_PREFIX_PATH=...``. AMD-packaged ROCm installs can typically be
added to the config-file search paths as follows:
|
||||
|
||||
* Windows: ``-D CMAKE_PREFIX_PATH=${env:HIP_PATH}``
|
||||
|
||||
* Linux: ``-D CMAKE_PREFIX_PATH=/opt/rocm``
|
||||
|
||||
ROCm provides the respective *config-file* packages, and this enables
|
||||
``find_package`` to be used directly. ROCm does not require any Find module as
|
||||
the *config-file* packages are shipped with the upstream projects, such as
|
||||
rocPRIM and other ROCm libraries.
|
||||
|
||||
For a complete guide on where and how ROCm may be installed on a system, refer
|
||||
to the installation guides for
|
||||
`Linux <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html>`_
|
||||
and
|
||||
`Windows <https://rocm.docs.amd.com/projects/install-on-windows/en/latest/index.html>`_.
|
||||
|
||||
Using HIP in CMake
|
||||
==================
|
||||
|
||||
ROCm components providing a C/C++ interface support consumption via any
|
||||
C/C++ toolchain that CMake knows how to drive. ROCm also supports the CMake HIP
|
||||
language features, allowing users to program using the HIP single-source
|
||||
programming model. When a program (or translation-unit) uses the HIP API without
|
||||
compiling any GPU device code, HIP can be treated in CMake as a simple C/C++
|
||||
library.
|
||||
|
||||
Using the HIP single-source programming model
|
||||
---------------------------------------------
|
||||
|
||||
Source code written in the HIP dialect of C++ typically uses the `.hip`
|
||||
extension. When the HIP CMake language is enabled, it will automatically
|
||||
associate such source files with the HIP toolchain being used.
|
||||
|
||||
.. code-block:: cmake
|
||||
|
||||
cmake_minimum_required(VERSION 3.21) # HIP language support requires 3.21
|
||||
cmake_policy(VERSION 3.21.3...3.27)
|
||||
project(MyProj LANGUAGES HIP)
|
||||
add_executable(MyApp Main.hip)
|
||||
|
||||
Should you have existing CUDA code that falls within the source-compatible subset of
HIP, you can tell CMake that, despite their `.cu` extension, these files are HIP sources.
Do note that this mostly facilitates compiling kernel-only source files,
as host-side CUDA API calls won't compile in this fashion.
|
||||
|
||||
.. code-block:: cmake
|
||||
|
||||
add_library(MyLib MyLib.cu)
|
||||
set_source_files_properties(MyLib.cu PROPERTIES LANGUAGE HIP)
|
||||
|
||||
CMake itself only hosts part of the HIP language support, such as defining
HIP-specific properties, while the other half ships with the HIP
implementation, such as ROCm. CMake will search for a file
`hip-lang-config.cmake` describing how the properties defined by CMake
translate to toolchain invocations. If ROCm is installed using non-standard
methods or layouts and CMake can't locate this file or detect parts of the SDK,
there's a catch-all, last-resort variable consulted to locate this file,
``-D CMAKE_HIP_COMPILER_ROCM_ROOT:PATH=``, which should be set to the root of the
ROCm installation.
|
||||
|
||||
.. note::
|
||||
Imported targets defined by `hip-lang-config.cmake` are for internal use
|
||||
only.
|
||||
|
||||
If the user doesn't provide a semicolon-delimited list of device architectures
via ``CMAKE_HIP_ARCHITECTURES``, CMake will select a sensible default. If you
know which devices you wish to target, however, it is advised to set this
variable explicitly.
|
||||
|
||||
Consuming ROCm C/C++ libraries
|
||||
------------------------------
|
||||
|
||||
Libraries such as rocBLAS, rocFFT, MIOpen, etc. behave as C/C++ libraries.
|
||||
Illustrated in the example below is a C++ application using MIOpen from CMake.
|
||||
It calls ``find_package(miopen)``, which provides the ``MIOpen`` imported
target that can be linked with ``target_link_libraries``:
|
||||
|
||||
.. code-block:: cmake
|
||||
|
||||
cmake_minimum_required(VERSION 3.5) # find_package(miopen) requires 3.5
|
||||
cmake_policy(VERSION 3.5...3.27)
|
||||
project(MyProj LANGUAGES CXX)
|
||||
find_package(miopen)
|
||||
add_library(MyLib ...)
|
||||
target_link_libraries(MyLib PUBLIC MIOpen)
|
||||
|
||||
.. note::
|
||||
|
||||
Most libraries are designed as host-only API, so using a GPU device
|
||||
compiler is not necessary for downstream projects unless they use GPU device
|
||||
code.
|
||||
|
||||
Consuming the HIP API in C++ code
|
||||
---------------------------------
|
||||
|
||||
Consuming the HIP API without compiling single-source GPU device code can be
|
||||
done using any C++ compiler. The ``find_package(hip)`` provides the
|
||||
``hip::host`` imported target to use HIP in this scenario.
|
||||
|
||||
.. code-block:: cmake
|
||||
|
||||
cmake_minimum_required(VERSION 3.5) # find_package(hip) requires 3.5
|
||||
cmake_policy(VERSION 3.5...3.27)
|
||||
project(MyProj LANGUAGES CXX)
|
||||
find_package(hip REQUIRED)
|
||||
add_executable(MyApp ...)
|
||||
target_link_libraries(MyApp PRIVATE hip::host)
|
||||
|
||||
When mixing such ``CXX`` sources with ``HIP`` sources holding device code, link
|
||||
only to `hip::host`. If HIP sources don't have `.hip` as their extension, use
|
||||
`set_source_files_properties(<hip_sources>... PROPERTIES LANGUAGE HIP)` on them.
|
||||
Linking to `hip::host` will set all the necessary flags for the ``CXX`` sources
|
||||
while ``HIP`` sources inherit all flags from the built-in language support.
|
||||
Having HIP sources in a target will turn the |LINK_LANG|_ into ``HIP``.
|
||||
|
||||
.. |LINK_LANG| replace:: ``LINKER_LANGUAGE``
|
||||
.. _LINK_LANG: https://cmake.org/cmake/help/latest/prop_tgt/LINKER_LANGUAGE.html
|
||||
|
||||
Compiling device code in C++ language mode
|
||||
------------------------------------------
|
||||
|
||||
.. attention::
|
||||
|
||||
The workflow detailed here is considered legacy and is shown for
|
||||
understanding's sake. It pre-dates the existence of HIP language support in
|
||||
CMake. If source code has HIP device code in it, it is a HIP source file
|
||||
and should be compiled as such. Only resort to the method below if your
|
||||
HIP-enabled CMake code path can't mandate CMake version 3.21.
|
||||
|
||||
If code uses the HIP API and compiles GPU device code, it requires using a
|
||||
device compiler. The compiler for CMake can be set using either the
|
||||
``CMAKE_C_COMPILER`` and ``CMAKE_CXX_COMPILER`` variables or the ``CC``
|
||||
and ``CXX`` environment variables. This can be set when configuring CMake or
|
||||
put into a CMake toolchain file. The device compiler must be set to a
|
||||
compiler that supports AMD GPU targets, which is usually Clang.
|
||||
|
||||
The ``find_package(hip)`` provides the ``hip::device`` imported target to add
|
||||
all the flags necessary for device compilation.
|
||||
|
||||
.. code-block:: cmake
|
||||
|
||||
cmake_minimum_required(VERSION 3.8) # cxx_std_11 requires 3.8
|
||||
cmake_policy(VERSION 3.8...3.27)
|
||||
project(MyProj LANGUAGES CXX)
|
||||
find_package(hip REQUIRED)
|
||||
add_library(MyLib ...)
|
||||
target_link_libraries(MyLib PRIVATE hip::device)
|
||||
target_compile_features(MyLib PRIVATE cxx_std_11)
|
||||
|
||||
.. note::
|
||||
|
||||
Compiling for the GPU device requires at least C++11.
|
||||
|
||||
This project can then be configured with the following CMake commands:
|
||||
|
||||
* Windows: ``cmake -D CMAKE_CXX_COMPILER:PATH=${env:HIP_PATH}\bin\clang++.exe``
|
||||
* Linux: ``cmake -D CMAKE_CXX_COMPILER:PATH=/opt/rocm/bin/amdclang++``
|
||||
|
||||
These commands use the device compiler provided by the binary packages of the
`ROCm HIP SDK <https://www.amd.com/en/developer/resources/rocm-hub/hip-sdk.html>`_ and
`repo.radeon.com <https://repo.radeon.com>`_, respectively.
|
||||
|
||||
When using the ``CXX`` language support to compile HIP device code, selecting the
|
||||
target GPU architectures is done via setting the ``GPU_TARGETS`` variable.
|
||||
``CMAKE_HIP_ARCHITECTURES`` only exists when the HIP language is enabled. By
|
||||
default, this is set to some subset of the currently supported architectures of
|
||||
AMD ROCm. It can be set with the CMake option ``-D GPU_TARGETS="gfx1032;gfx1035"``.
|
||||
|
||||
ROCm CMake packages
|
||||
-------------------
|
||||
|
||||
+-----------+----------+--------------------------------------------------------+
|
||||
| Component | Package | Targets |
|
||||
+===========+==========+========================================================+
|
||||
| HIP | hip | ``hip::host``, ``hip::device`` |
|
||||
+-----------+----------+--------------------------------------------------------+
|
||||
| rocPRIM | rocprim | ``roc::rocprim`` |
|
||||
+-----------+----------+--------------------------------------------------------+
|
||||
| rocThrust | rocthrust| ``roc::rocthrust`` |
|
||||
+-----------+----------+--------------------------------------------------------+
|
||||
| hipCUB | hipcub | ``hip::hipcub`` |
|
||||
+-----------+----------+--------------------------------------------------------+
|
||||
| rocRAND | rocrand | ``roc::rocrand`` |
|
||||
+-----------+----------+--------------------------------------------------------+
|
||||
| rocBLAS | rocblas | ``roc::rocblas`` |
|
||||
+-----------+----------+--------------------------------------------------------+
|
||||
| rocSOLVER | rocsolver| ``roc::rocsolver`` |
|
||||
+-----------+----------+--------------------------------------------------------+
|
||||
| hipBLAS | hipblas | ``roc::hipblas`` |
|
||||
+-----------+----------+--------------------------------------------------------+
|
||||
| rocFFT | rocfft | ``roc::rocfft`` |
|
||||
+-----------+----------+--------------------------------------------------------+
|
||||
| hipFFT | hipfft | ``hip::hipfft`` |
|
||||
+-----------+----------+--------------------------------------------------------+
|
||||
| rocSPARSE | rocsparse| ``roc::rocsparse`` |
|
||||
+-----------+----------+--------------------------------------------------------+
|
||||
| hipSPARSE | hipsparse| ``roc::hipsparse`` |
|
||||
+-----------+----------+--------------------------------------------------------+
|
||||
| rocALUTION|rocalution| ``roc::rocalution`` |
|
||||
+-----------+----------+--------------------------------------------------------+
|
||||
| RCCL | rccl | ``rccl`` |
|
||||
+-----------+----------+--------------------------------------------------------+
|
||||
| MIOpen | miopen | ``MIOpen`` |
|
||||
+-----------+----------+--------------------------------------------------------+
|
||||
| MIGraphX | migraphx | ``migraphx::migraphx``, ``migraphx::migraphx_c``, |
|
||||
| | | ``migraphx::migraphx_cpu``, ``migraphx::migraphx_gpu``,|
|
||||
| | | ``migraphx::migraphx_onnx``, ``migraphx::migraphx_tf`` |
|
||||
+-----------+----------+--------------------------------------------------------+
|
||||
|
||||
Using CMake presets
|
||||
===================
|
||||
|
||||
CMake command lines, depending on how specific users like to be when compiling
code, can grow to unwieldy lengths. This is the primary reason why projects tend
to bake script snippets into their build definitions controlling compiler
warning levels, changing CMake defaults (``CMAKE_BUILD_TYPE`` or
``BUILD_SHARED_LIBS``, just to name a few), and all sorts of anti-patterns, all in
the name of convenience.
|
||||
|
||||
Load on the command-line interface (CLI) starts immediately by selecting a
|
||||
toolchain, the set of utilities used to compile programs. To ease some of the
|
||||
toolchain-related pains, CMake does consult the ``CC`` and ``CXX`` environment
|
||||
variables when setting a default ``CMAKE_C[XX]_COMPILER`` respectively, but that
|
||||
is just the tip of the iceberg. There's a fair number of variables related to
|
||||
just the toolchain itself (typically supplied using
|
||||
`toolchain files <https://cmake.org/cmake/help/latest/manual/cmake-toolchains.7.html>`_
|
||||
), and then we still haven't talked about user preference or project-specific
|
||||
options.
|
||||
|
||||
IDEs supporting CMake (Visual Studio, Visual Studio Code, CLion, etc.) all came
|
||||
up with their own way to register command-line fragments for different purposes in
|
||||
a setup-and-forget fashion for quick assembly using graphical front-ends. This is
|
||||
all nice, but configurations aren't portable, nor can they be reused in
|
||||
Continuous Integration (CI) pipelines. CMake has condensed existing practice
|
||||
into a portable JSON format that works in all IDEs and can be invoked from any
|
||||
command line. This is
|
||||
`CMake Presets <https://cmake.org/cmake/help/latest/manual/cmake-presets.7.html>`_.
|
||||
|
||||
There are two types of preset files: one supplied by the project, called
|
||||
``CMakePresets.json`` which is meant to be committed to version control,
|
||||
typically used to drive CI; and one meant for the user to provide, called
|
||||
``CMakeUserPresets.json``, typically used to house user preference and adapting
|
||||
the build to the user's environment. These JSON files are allowed to include
|
||||
other JSON files and the user presets always implicitly includes the non-user
|
||||
variant.
|
||||
|
||||
Using HIP with presets
|
||||
----------------------
|
||||
|
||||
Following is an example ``CMakeUserPresets.json`` file which actually compiles
|
||||
the `amd/rocm-examples <https://github.com/amd/rocm-examples>`_ suite of sample
|
||||
applications on a typical ROCm installation:
|
||||
|
||||
.. code-block:: json
|
||||
|
||||
{
|
||||
"version": 3,
|
||||
"cmakeMinimumRequired": {
|
||||
"major": 3,
|
||||
"minor": 21,
|
||||
"patch": 0
|
||||
},
|
||||
"configurePresets": [
|
||||
{
|
||||
"name": "layout",
|
||||
"hidden": true,
|
||||
"binaryDir": "${sourceDir}/build/${presetName}",
|
||||
"installDir": "${sourceDir}/install/${presetName}"
|
||||
},
|
||||
{
|
||||
"name": "generator-ninja-multi-config",
|
||||
"hidden": true,
|
||||
"generator": "Ninja Multi-Config"
|
||||
},
|
||||
{
|
||||
"name": "toolchain-makefiles-c/c++-amdclang",
|
||||
"hidden": true,
|
||||
"cacheVariables": {
|
||||
"CMAKE_C_COMPILER": "/opt/rocm/bin/amdclang",
|
||||
"CMAKE_CXX_COMPILER": "/opt/rocm/bin/amdclang++",
|
||||
"CMAKE_HIP_COMPILER": "/opt/rocm/bin/amdclang++"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "clang-strict-iso-high-warn",
|
||||
"hidden": true,
|
||||
"cacheVariables": {
|
||||
"CMAKE_C_FLAGS": "-Wall -Wextra -pedantic",
|
||||
"CMAKE_CXX_FLAGS": "-Wall -Wextra -pedantic",
|
||||
"CMAKE_HIP_FLAGS": "-Wall -Wextra -pedantic"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "ninja-mc-rocm",
|
||||
"displayName": "Ninja Multi-Config ROCm",
|
||||
"inherits": [
|
||||
"layout",
|
||||
"generator-ninja-multi-config",
|
||||
"toolchain-makefiles-c/c++-amdclang",
|
||||
"clang-strict-iso-high-warn"
|
||||
]
|
||||
}
|
||||
],
|
||||
"buildPresets": [
|
||||
{
|
||||
"name": "ninja-mc-rocm-debug",
|
||||
"displayName": "Debug",
|
||||
"configuration": "Debug",
|
||||
"configurePreset": "ninja-mc-rocm"
|
||||
},
|
||||
{
|
||||
"name": "ninja-mc-rocm-release",
|
||||
"displayName": "Release",
|
||||
"configuration": "Release",
|
||||
"configurePreset": "ninja-mc-rocm"
|
||||
},
|
||||
{
|
||||
"name": "ninja-mc-rocm-debug-verbose",
|
||||
"displayName": "Debug (verbose)",
|
||||
"configuration": "Debug",
|
||||
"configurePreset": "ninja-mc-rocm",
|
||||
"verbose": true
|
||||
},
|
||||
{
|
||||
"name": "ninja-mc-rocm-release-verbose",
|
||||
"displayName": "Release (verbose)",
|
||||
"configuration": "Release",
|
||||
"configurePreset": "ninja-mc-rocm",
|
||||
"verbose": true
|
||||
}
|
||||
],
|
||||
"testPresets": [
|
||||
{
|
||||
"name": "ninja-mc-rocm-debug",
|
||||
"displayName": "Debug",
|
||||
"configuration": "Debug",
|
||||
"configurePreset": "ninja-mc-rocm",
|
||||
"execution": {
|
||||
"jobs": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "ninja-mc-rocm-release",
|
||||
"displayName": "Release",
|
||||
"configuration": "Release",
|
||||
"configurePreset": "ninja-mc-rocm",
|
||||
"execution": {
|
||||
"jobs": 0
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
.. note::
|
||||
|
||||
Getting presets to work reliably on Windows requires some CMake improvements
|
||||
and/or support from compiler vendors. (Refer to
|
||||
`Add support to the Visual Studio generators <https://gitlab.kitware.com/cmake/cmake/-/issues/24245>`_
|
||||
and `Sourcing environment scripts <https://gitlab.kitware.com/cmake/cmake/-/issues/21619>`_.)
|
||||
@@ -1,14 +0,0 @@
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="description" content="AMD ROCm documentation">
|
||||
<meta name="keywords" content="documentation, guides, installation, compatibility, support,
|
||||
reference, ROCm, AMD">
|
||||
</head>
|
||||
|
||||
# Using compiler features
|
||||
|
||||
The following topics describe using specific features of the compilation tools:
|
||||
|
||||
* [ROCm compiler infrastructure](https://rocm.docs.amd.com/projects/llvm-project/en/latest/index.html)
|
||||
* [Using AddressSanitizer](https://rocm.docs.amd.com/projects/llvm-project/en/latest/conceptual/using-gpu-sanitizer.html)
|
||||
* [OpenMP support](https://rocm.docs.amd.com/projects/llvm-project/en/latest/conceptual/openmp.html)
|
||||
@@ -1,172 +0,0 @@
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="description" content="ROCm Linux Filesystem Hierarchy Standard reorganization">
|
||||
<meta name="keywords" content="FHS, Linux Filesystem Hierarchy Standard, directory structure,
|
||||
AMD, ROCm">
|
||||
</head>
|
||||
|
||||
# ROCm Linux Filesystem Hierarchy Standard reorganization
|
||||
|
||||
## Introduction
|
||||
|
||||
ROCm software has adopted the Linux Filesystem Hierarchy Standard (FHS) [https://refspecs.linuxfoundation.org/FHS_3.0/fhs/index.html](https://refspecs.linuxfoundation.org/FHS_3.0/fhs/index.html) in order to ensure ROCm is consistent with standard open source conventions. The following sections specify how current and future releases of ROCm adhere to the FHS, how the previous ROCm file system is supported, and how improved versioning specifications are applied to ROCm.
|
||||
|
||||
## Adopting the FHS
|
||||
|
||||
In order to standardize the ROCm directory structure and directory content layout, ROCm has adopted the [FHS](https://refspecs.linuxfoundation.org/FHS_3.0/fhs/index.html), adhering to open source conventions for Linux-based distributions. The FHS ensures internal consistency within the ROCm stack, as well as external consistency with other systems and distributions. The proposed ROCm file structure is outlined below:
|
||||
|
||||
```none
|
||||
/opt/rocm-<ver>
|
||||
| -- bin
|
||||
| -- all public binaries
|
||||
| -- lib
|
||||
| -- lib<soname>.so->lib<soname>.so.major->lib<soname>.so.major.minor.patch
|
||||
(public libraries to link with applications)
|
||||
| -- <component>
|
||||
| -- architecture dependent libraries and binaries used internally by components
|
||||
| -- cmake
|
||||
| -- <component>
|
||||
| --<component>-config.cmake
|
||||
| -- libexec
|
||||
| -- <component>
|
||||
| -- non ISA/architecture independent executables used internally by components
|
||||
| -- include
|
||||
| -- <component>
|
||||
| -- public header files
|
||||
| -- share
|
||||
| -- html
|
||||
| -- <component>
|
||||
| -- html documentation
|
||||
| -- info
|
||||
| -- <component>
|
||||
| -- info files
|
||||
| -- man
|
||||
| -- <component>
|
||||
| -- man pages
|
||||
| -- doc
|
||||
| -- <component>
|
||||
| -- license files
|
||||
| -- <component>
|
||||
| -- samples
|
||||
| -- architecture independent misc files
|
||||
```
|
||||
|
||||
## Changes from earlier ROCm versions
|
||||
|
||||
The following table provides a brief overview of the new ROCm FHS layout, compared to the layout of earlier ROCm versions. Note that `/opt/` denotes the default ROCm installation path and should be replaced if the ROCm distribution is installed in a non-standard location.
|
||||
|
||||
```none
|
||||
______________________________________________________
|
||||
| New ROCm Layout | Previous ROCm Layout |
|
||||
|_____________________________|________________________|
|
||||
| /opt/rocm-<ver> | /opt/rocm-<ver> |
|
||||
| | -- bin | | -- bin |
|
||||
| | -- lib | | -- lib |
|
||||
| | -- cmake | | -- include |
|
||||
| | -- libexec | | -- <component_1> |
|
||||
| | -- include | | -- bin |
|
||||
| | -- <component_1> | | -- cmake |
|
||||
| | -- share | | -- doc |
|
||||
| | -- html | | -- lib |
|
||||
| | -- info | | -- include |
|
||||
| | -- man | | -- samples |
|
||||
| | -- doc | | -- <component_n> |
|
||||
| | -- <component_1> | | -- bin |
|
||||
| | -- samples | | -- cmake |
|
||||
| | -- .. | | -- doc |
|
||||
| | -- <component_n> | | -- lib |
|
||||
| | -- samples | | -- include |
|
||||
| | -- .. | | -- samples |
|
||||
|______________________________________________________|
|
||||
```
|
||||
|
||||
## ROCm FHS reorganization: backward compatibility
|
||||
|
||||
The FHS file organization for ROCm was first introduced in the ROCm 5.2 release. Backward compatibility was implemented to make sure users could still run their ROCm applications while transitioning to the new FHS. ROCm has moved header files and libraries to their new locations as indicated in the above structure, and included symbolic links and wrapper header files in their old locations for backward compatibility. The following sections detail the ROCm backward compatibility implementation for wrapper header files, executable files, library files, and CMake config files.
|
||||
|
||||
### Wrapper header files
|
||||
|
||||
Wrapper header files are placed in the old location (
|
||||
`/opt/rocm-<ver>/<component>/include`) with a warning message to include files
|
||||
from the new location (`/opt/rocm-<ver>/include`) as shown in the example below.
|
||||
|
||||
```cpp
|
||||
#pragma message "This file is deprecated. Use file from include path /opt/rocm-ver/include/ and prefix with hip."
|
||||
#include <hip/hip_runtime.h>
|
||||
```
|
||||
|
||||
* Starting with the ROCm 5.2 release, backward compatibility wrapper header files are deprecated with a `#pragma` message announcing a `#warning`.
* Starting from ROCm 6.0 (tentatively), backward compatibility for wrapper header files will be removed, and the `#pragma` message will announce an `#error`.
|
||||
|
||||
### Executable files
|
||||
|
||||
Executable files are available in the `/opt/rocm-<ver>/bin` folder. For backward
compatibility, the old location (`/opt/rocm-<ver>/<component>/bin`) contains a
soft link to the executable at the new location. Soft links will be removed in a
future release, tentatively ROCm v6.0.
|
||||
|
||||
```bash
|
||||
$ ls -l /opt/rocm/hip/bin/
|
||||
lrwxrwxrwx 1 root root 24 Jan 1 23:32 hipcc -> ../../bin/hipcc
|
||||
```
|
||||
|
||||
### Library files
|
||||
|
||||
Library files are available in the `/opt/rocm-<ver>/lib` folder. For backward
|
||||
compatibility, the old library location (`/opt/rocm-<ver>/<component>/lib`) has a
|
||||
soft link to the library at the new location. Soft links will be removed in a
|
||||
future release, tentatively ROCm v6.0.
|
||||
|
||||
```shell
|
||||
$ ls -l /opt/rocm/hip/lib/
|
||||
drwxr-xr-x 4 root root 4096 Jan 1 10:45 cmake
|
||||
lrwxrwxrwx 1 root root 24 Jan 1 23:32 libamdhip64.so -> ../../lib/libamdhip64.so
|
||||
```
|
||||
|
||||
### CMake config files
|
||||
|
||||
All CMake configuration files are available in the
|
||||
`/opt/rocm-<ver>/lib/cmake/<component>` folder. For backward compatibility, the
|
||||
old CMake locations (`/opt/rocm-<ver>/<component>/lib/cmake`) consist of a soft
|
||||
link to the new CMake config. Soft links will be removed in a future release,
|
||||
tentatively ROCm v6.0.
|
||||
|
||||
```shell
|
||||
$ ls -l /opt/rocm/hip/lib/cmake/hip/
|
||||
lrwxrwxrwx 1 root root 42 Jan 1 23:32 hip-config.cmake -> ../../../../lib/cmake/hip/hip-config.cmake
|
||||
```
|
||||
|
||||
## Changes required in applications using ROCm
|
||||
|
||||
Applications using ROCm are advised to use the new file paths, as the old files
will be deprecated in a future release. Applications must make sure to include the
correct header files and use the correct search paths.
|
||||
|
||||
1. `#include<header_file.h>` needs to be changed to
|
||||
`#include <component/header_file.h>`
|
||||
|
||||
For example: `#include <hip.h>` needs to change
|
||||
to `#include <hip/hip.h>`
|
||||
|
||||
2. Any variable in CMake or Makefiles pointing to a component folder needs to be
changed.
|
||||
|
||||
For example: `VAR1=/opt/rocm/hip` needs to be changed to `VAR1=/opt/rocm`
|
||||
`VAR2=/opt/rocm/hsa` needs to be changed to `VAR2=/opt/rocm`
|
||||
|
||||
3. Any reference to `/opt/rocm/<component>/bin` or `/opt/rocm/<component>/lib`
|
||||
needs to be changed to `/opt/rocm/bin` and `/opt/rocm/lib/`, respectively.
|
||||
|
||||
## Changes in versioning specifications
|
||||
|
||||
In order to better manage ROCm dependency specifications and allow smoother ROCm releases while avoiding dependency conflicts, ROCm software shall adhere to the following scheme when numbering and incrementing ROCm file versions:
|
||||
|
||||
rocm-\<ver\>, where \<ver\> = \<x.y.z\>
|
||||
|
||||
x.y.z denote: MAJOR.MINOR.PATCH
|
||||
|
||||
z: PATCH - increment z when implementing backward compatible bug fixes.
|
||||
|
||||
y: MINOR - increment y when implementing minor changes that add functionality but are still backward compatible.
|
||||
|
||||
x: MAJOR - increment x when implementing major changes that are not backward compatible.
|
||||
@@ -1,73 +0,0 @@
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="description" content="GPU architecture">
|
||||
<meta name="keywords" content="GPU architecture, architecture support, MI200, MI250, RDNA,
|
||||
MI100, AMD Instinct">
|
||||
</head>
|
||||
|
||||
(gpu-arch-documentation)=
|
||||
|
||||
# GPU architecture documentation
|
||||
|
||||
:::::{grid} 1 1 2 2
|
||||
:gutter: 1
|
||||
|
||||
:::{grid-item-card}
|
||||
**AMD Instinct MI300 series**
|
||||
|
||||
Review hardware aspects of the AMD Instinct™ MI300 series of GPU accelerators and the CDNA™ 3
|
||||
architecture.
|
||||
|
||||
* [AMD Instinct™ MI300 microarchitecture](./gpu-arch/mi300.md)
|
||||
* [AMD Instinct MI300/CDNA3 ISA](https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/instruction-set-architectures/amd-instinct-mi300-cdna3-instruction-set-architecture.pdf)
|
||||
* [White paper](https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/white-papers/amd-cdna-3-white-paper.pdf)
|
||||
* [MI300 performance counters](./gpu-arch/mi300-mi200-performance-counters.rst)
|
||||
* [MI350 series performance counters](./gpu-arch/mi350-performance-counters.rst)
|
||||
:::
|
||||
|
||||
:::{grid-item-card}
|
||||
**AMD Instinct MI200 series**
|
||||
|
||||
Review hardware aspects of the AMD Instinct™ MI200 series of GPU accelerators and the CDNA™ 2
|
||||
architecture.
|
||||
|
||||
* [AMD Instinct™ MI250 microarchitecture](./gpu-arch/mi250.md)
|
||||
* [AMD Instinct MI200/CDNA2 ISA](https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf)
|
||||
* [White paper](https://www.amd.com/content/dam/amd/en/documents/instinct-business-docs/white-papers/amd-cdna2-white-paper.pdf)
|
||||
* [Performance counters](./gpu-arch/mi300-mi200-performance-counters.rst)
|
||||
|
||||
:::
|
||||
|
||||
:::{grid-item-card}
|
||||
**AMD Instinct MI100**
|
||||
|
||||
Review hardware aspects of the AMD Instinct™ MI100 series of GPU accelerators and the CDNA™ 1
|
||||
architecture.
|
||||
|
||||
* [AMD Instinct™ MI100 microarchitecture](./gpu-arch/mi100.md)
|
||||
* [AMD Instinct MI100/CDNA1 ISA](https://www.amd.com/system/files/TechDocs/instinct-mi100-cdna1-shader-instruction-set-architecture%C2%A0.pdf)
|
||||
* [White paper](https://www.amd.com/content/dam/amd/en/documents/instinct-business-docs/white-papers/amd-cdna-white-paper.pdf)
|
||||
|
||||
:::
|
||||
|
||||
:::{grid-item-card}
|
||||
**RDNA**
|
||||
|
||||
* [AMD RDNA4 ISA](https://www.amd.com/content/dam/amd/en/documents/radeon-tech-docs/instruction-set-architectures/rdna4-instruction-set-architecture.pdf)
|
||||
* [AMD RDNA3 ISA](https://www.amd.com/system/files/TechDocs/rdna3-shader-instruction-set-architecture-feb-2023_0.pdf)
|
||||
* [AMD RDNA2 ISA](https://www.amd.com/system/files/TechDocs/rdna2-shader-instruction-set-architecture.pdf)
|
||||
* [AMD RDNA ISA](https://www.amd.com/system/files/TechDocs/rdna-shader-instruction-set-architecture.pdf)
|
||||
|
||||
:::
|
||||
|
||||
:::{grid-item-card}
|
||||
**Older architectures**
|
||||
|
||||
* [AMD Instinct MI50/Vega 7nm ISA](https://www.amd.com/system/files/TechDocs/vega-7nm-shader-instruction-set-architecture.pdf)
|
||||
* [AMD Instinct MI25/Vega ISA](https://www.amd.com/system/files/TechDocs/vega-shader-instruction-set-architecture.pdf)
|
||||
* [AMD GCN3 ISA](https://www.amd.com/system/files/TechDocs/gcn3-instruction-set-architecture.pdf)
|
||||
* [AMD Vega Architecture White Paper](https://en.wikichip.org/w/images/a/a1/vega-whitepaper.pdf)
|
||||
|
||||
:::
|
||||
|
||||
:::::
|
||||
@@ -1,95 +0,0 @@
|
||||
---
|
||||
myst:
|
||||
html_meta:
|
||||
"description lang=en": "Learn about the AMD Instinct MI100 series architecture."
|
||||
"keywords": "Instinct, MI100, microarchitecture, AMD, ROCm"
|
||||
---
|
||||
|
||||
# AMD Instinct™ MI100 microarchitecture
|
||||
|
||||
The following image shows the node-level architecture of a system that
|
||||
comprises two AMD EPYC™ processors and (up to) eight AMD Instinct™ accelerators.
|
||||
The two EPYC processors are connected to each other with the AMD Infinity™
fabric, which provides high-bandwidth (up to 18 GT/sec), coherent links so
that each processor can access the available node memory as a single
shared-memory domain in a non-uniform memory architecture (NUMA) fashion. In a
2P, or dual-socket, configuration, three AMD Infinity™ fabric links are
available to connect the processors, and one PCIe Gen 4 x16 link per processor
can attach additional I/O devices such as the host adapters for the network
fabric.
|
||||
|
||||

|
||||
|
||||
In a typical node configuration, each processor can host up to four AMD
Instinct™ accelerators that are attached using PCIe Gen 4 links at 16 GT/sec,
which corresponds to a peak bidirectional link bandwidth of 32 GB/sec. Each hive
of four accelerators can participate in a fully connected, coherent AMD
Instinct™ fabric that connects the four accelerators using 23 GT/sec AMD
Infinity Fabric links that run at a higher frequency than the inter-processor
links. This inter-GPU link can be established in certified server systems if the
GPUs are mounted in neighboring PCIe slots by installing the AMD Infinity
Fabric™ bridge for the AMD Instinct™ accelerators.

## Microarchitecture

The microarchitecture of the AMD Instinct accelerators is based on the AMD CDNA
architecture, which targets compute applications such as high-performance
computing (HPC) and AI and machine learning (ML) that run on everything from
individual servers to the world's largest exascale supercomputers. The overall
system architecture is designed for extreme scalability and compute performance.

*(Figure: floor plan of the AMD Instinct MI100 accelerator)*

The above image shows the AMD Instinct accelerator with its PCIe Gen 4 x16
link (16 GT/sec, at the bottom) that connects the GPU to (one of) the host
processor(s). It also shows the three AMD Infinity Fabric ports that provide
high-speed links (23 GT/sec, also at the bottom) to the other GPUs of the local
hive.

On the left and right of the floor plan, the High Bandwidth Memory (HBM)
attaches via the GPU memory controller. The MI100 generation of the AMD
Instinct accelerator offers four stacks of HBM generation 2 (HBM2) for a total
of 32 GB with a 4,096-bit-wide memory interface. The peak memory bandwidth of
the attached HBM2 is 1.228 TB/sec at a memory clock frequency of 1.2 GHz.

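The quoted bandwidth follows from the interface width and memory clock, assuming the usual HBM2
double-data-rate factor of two transfers per clock; a back-of-the-envelope sketch (not part of the
original text):

```python
# Back-of-the-envelope HBM2 bandwidth check for MI100, using the figures above.
interface_width_bits = 4096   # total width across the four HBM2 stacks
memory_clock_ghz = 1.2        # memory clock frequency
transfers_per_clock = 2       # assumption: HBM2 transfers data on both clock edges

bandwidth_gb_per_s = interface_width_bits / 8 * transfers_per_clock * memory_clock_ghz
print(f"Peak HBM2 bandwidth: {bandwidth_gb_per_s:.1f} GB/s")  # ~1228.8 GB/s, i.e. ~1.228 TB/s
```
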
The execution units of the GPU are depicted in the above image as Compute
Units (CU). There are a total of 120 compute units that are physically organized
into eight Shader Engines (SE) with fifteen compute units per shader engine.
Each compute unit is further subdivided into four SIMD units that process SIMD
instructions of 16 data elements per instruction. This enables the CU to process
64 data elements (a so-called 'wavefront') at a peak clock frequency of 1.5 GHz.
Therefore, the theoretical maximum FP64 peak performance is 11.5 TFLOPS
(`4 [SIMD units] x 16 [elements per instruction] x 120 [CU] x 1.5 [GHz]`).

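The same arithmetic extends to the other data types in the table further below: multiply the per-CU,
per-clock rate by the CU count and the clock frequency. A small illustrative sketch (not from the
original text; the published figures use the exact boost clock, so the last digit can differ slightly
from this 1.5 GHz approximation):

```python
# Peak-throughput estimate for MI100: FLOPS/clock/CU x CUs x clock (GHz) / 1000 = TFLOPS.
NUM_CUS = 120
CLOCK_GHZ = 1.5   # peak engine clock quoted in the text

flops_per_clock_per_cu = {
    "Vector FP64": 64,    # 4 SIMD units x 16 elements per instruction
    "Vector FP32": 128,
    "Matrix FP32": 256,
    "Matrix FP16": 1024,
    "Matrix BF16": 512,
}

for name, rate in flops_per_clock_per_cu.items():
    print(f"{name}: ~{rate * NUM_CUS * CLOCK_GHZ / 1000:.1f} TFLOPS")
```
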


The preceding image shows the block diagram of a single CU of an AMD Instinct™
MI100 accelerator and summarizes how instructions flow through the execution
engines. The CU fetches instructions via a 32 KB instruction cache and moves
them forward to execution via a dispatcher. The CU can handle up to ten
wavefronts at a time and feed their instructions into the execution unit. The
execution unit contains 256 vector general-purpose registers (VGPR) and 800
scalar general-purpose registers (SGPR). The VGPRs and SGPRs are dynamically
allocated to the executing wavefronts. A wavefront can access a maximum of 102
scalar registers. Excess scalar-register usage will cause register spilling and
thus may affect execution performance.

A wavefront can occupy any number of VGPRs from 0 to 256, directly affecting
occupancy; that is, the number of concurrently active wavefronts in the CU. For
instance, with 119 VGPRs used, only two wavefronts can be active in the CU at
the same time. Because each SIMD instruction has a latency of four cycles,
occupancy should be as high as possible so that the compute unit can
improve execution efficiency by scheduling instructions from multiple
wavefronts.

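One rough way to reason about this register-file limit is to divide the 256-entry VGPR budget by the
per-wavefront VGPR usage, as in the illustrative sketch below (it assumes occupancy is limited only
by VGPRs; the ten-wavefront scheduler limit mentioned above, VGPR allocation granularity, and LDS
usage also constrain occupancy in practice):

```python
# Rough VGPR-limited occupancy estimate for an MI100 compute unit (illustrative only).
VGPR_BUDGET = 256      # maximum VGPRs a single wavefront can occupy
MAX_WAVEFRONTS = 10    # scheduler limit per CU quoted in the text above

def vgpr_limited_wavefronts(vgprs_per_wavefront: int) -> int:
    """Number of wavefronts that fit in the VGPR file, ignoring other limits."""
    if vgprs_per_wavefront == 0:
        return MAX_WAVEFRONTS
    return min(MAX_WAVEFRONTS, VGPR_BUDGET // vgprs_per_wavefront)

print(vgpr_limited_wavefronts(119))  # 2, matching the example in the text
print(vgpr_limited_wavefronts(64))   # 4
```
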
:::{table} Peak-performance capabilities of MI100 for different data types.
:name: mi100-perf
| Computation and Data Type | FLOPS/CLOCK/CU | Peak TFLOPS |
| :------------------------ | :------------: | ----------: |
| Vector FP64               | 64             | 11.5        |
| Matrix FP32               | 256            | 46.1        |
| Vector FP32               | 128            | 23.1        |
| Matrix FP16               | 1024           | 184.6       |
| Matrix BF16               | 512            | 92.3        |

:::
@@ -1,134 +0,0 @@
---
myst:
  html_meta:
    "description lang=en": "Learn about the AMD Instinct MI250 series architecture."
    "keywords": "Instinct, MI250, microarchitecture, AMD, ROCm"
---

# AMD Instinct™ MI250 microarchitecture

The microarchitecture of the AMD Instinct MI250 accelerators is based on the
AMD CDNA 2 architecture, which targets compute applications such as HPC,
artificial intelligence (AI), and machine learning (ML) that run on
everything from individual servers to the world's largest exascale
supercomputers. The overall system architecture is designed for extreme
scalability and compute performance.

The following image shows the components of a single Graphics Compute Die (GCD)
of the CDNA 2 architecture. On the top and the bottom are AMD Infinity Fabric™
interfaces and their physical links that are used to connect the GPU die to the
other system-level components of the node (see the Node-level architecture
section below). Both interfaces can drive four AMD Infinity Fabric links. One of
the AMD Infinity Fabric links of the controller at the bottom can be configured
as a PCIe link. Each of the AMD Infinity Fabric links between GPUs can run at up
to 25 GT/sec, which correlates to a peak transfer bandwidth of 50 GB/sec for a
16-wide link (two bytes per transaction). The Node-level architecture section
below has more details on the number of AMD Infinity Fabric links and the
resulting transfer rates between the system-level components.

To the left and the right are memory controllers that attach the High Bandwidth
Memory (HBM) modules to the GCD. AMD Instinct MI250 GPUs use HBM2e, which offers
a peak memory bandwidth of 1.6 TB/sec per GCD.

The execution units of the GPU are depicted in the following image as Compute
Units (CU). The MI250 GCD has 104 active CUs. Each compute unit is further
subdivided into four SIMD units that process SIMD instructions of 16 data
elements per instruction (for the FP64 data type). This enables the CU to
process 64 work items (a so-called "wavefront") at a peak clock frequency of
1.7 GHz. Therefore, the theoretical maximum FP64 peak performance per GCD is
22.6 TFLOPS for vector instructions, or 45.3 TFLOPS for both GCDs together. The
MI250 compute units also provide specialized execution units (also called
matrix cores), which are geared toward executing matrix operations like
matrix-matrix multiplications. For FP64, the peak performance of these units
amounts to 90.5 TFLOPS.

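For reference, the vector FP64 figure follows directly from these parameters when a fused
multiply-add is counted as two floating-point operations; a small illustrative sketch (not part of
the original text):

```python
# Illustrative peak vector FP64 estimate for the MI250, using the figures above.
CUS_PER_GCD = 104
SIMDS_PER_CU = 4
LANES_PER_SIMD = 16    # FP64 data elements per SIMD instruction
FLOPS_PER_FMA = 2      # assumption: a fused multiply-add counts as two floating-point operations
CLOCK_GHZ = 1.7

per_gcd_tflops = CUS_PER_GCD * SIMDS_PER_CU * LANES_PER_SIMD * FLOPS_PER_FMA * CLOCK_GHZ / 1000
print(f"Per GCD: ~{per_gcd_tflops:.1f} TFLOPS")      # ~22.6
print(f"Per OAM: ~{2 * per_gcd_tflops:.1f} TFLOPS")  # ~45.3 (two GCDs)
```
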


```{list-table} Peak-performance capabilities of the MI250 OAM for different data types.
:header-rows: 1
:name: mi250-perf-table

* - Computation and Data Type
  - FLOPS/CLOCK/CU
  - Peak TFLOPS
* - Matrix FP64
  - 256
  - 90.5
* - Vector FP64
  - 128
  - 45.3
* - Matrix FP32
  - 256
  - 90.5
* - Packed FP32
  - 256
  - 90.5
* - Vector FP32
  - 128
  - 45.3
* - Matrix FP16
  - 1024
  - 362.1
* - Matrix BF16
  - 1024
  - 362.1
* - Matrix INT8
  - 1024
  - 362.1
```

The above table summarizes the aggregated peak performance of the AMD
Instinct MI250 OCP Open Accelerator Module (OAM; OCP is short for Open Compute
Platform) and its two GCDs for different data types and execution units. The
middle column lists the peak throughput (number of data elements processed in a
single instruction) of a single compute unit if a SIMD (or matrix) instruction
is being retired in each clock cycle. The third column lists the theoretical
peak performance of the OAM module. The theoretical aggregated peak memory
bandwidth of the GPU is 3.2 TB/sec (1.6 TB/sec per GCD).



The preceding image shows the block diagram of an OAM package that consists
of two GCDs, each of which constitutes one GPU device in the system. The two
GCDs in the package are connected via four AMD Infinity Fabric links running at
a theoretical peak rate of 25 GT/sec, giving 200 GB/sec peak transfer bandwidth
between the two GCDs of an OAM, or a bidirectional peak transfer bandwidth of
400 GB/sec.

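These GCD-to-GCD numbers follow from the per-link figures given earlier (25 GT/sec, two bytes per
transaction, four links); a quick illustrative check (not part of the original text):

```python
# Illustrative GCD-to-GCD bandwidth check for the MI250 OAM, using the figures above.
transfer_rate_gt_per_s = 25    # per-link transfer rate
bytes_per_transaction = 2      # 16-wide link carries two bytes per transaction
num_links = 4                  # AMD Infinity Fabric links between the two GCDs

per_link_gb_per_s = transfer_rate_gt_per_s * bytes_per_transaction   # 50 GB/s per link
aggregate_gb_per_s = per_link_gb_per_s * num_links                    # 200 GB/s between the GCDs
print(per_link_gb_per_s, aggregate_gb_per_s, 2 * aggregate_gb_per_s)  # 50 200 400 (bidirectional)
```
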
## Node-level architecture

The following image shows the node-level architecture of a system that is
based on the AMD Instinct MI250 accelerator. The MI250 OAMs attach to the host
system via PCIe Gen 4 x16 links (yellow lines). Each GCD maintains its own PCIe
x16 link to the host part of the system. Depending on the server platform, the
GCD can attach to the AMD EPYC processor directly or via an optional PCIe
switch. Note that some platforms may offer an x8 interface to the GCDs, which
reduces the available host-to-GPU bandwidth.



The preceding image shows the node-level architecture of a system with AMD
EPYC processors in a dual-socket configuration and four AMD Instinct MI250
accelerators. The MI250 OAMs attach to the host processors via PCIe Gen 4
x16 links (yellow lines). Depending on the system design, a PCIe switch may
exist to make more PCIe lanes available for additional components like network
interfaces and/or storage devices. Each GCD maintains its own PCIe x16 link to
the host part of the system or to the PCIe switch. Note that some platforms
may offer an x8 interface to the GCDs, which reduces the available
host-to-GPU bandwidth.

Between the OAMs and their respective GCDs, a peer-to-peer (P2P) network allows
for direct data exchange between the GPU dies via AMD Infinity Fabric links
(black, green, and red lines). Each of these 16-wide links connects to one of the
two GPU dies in the MI250 OAM and operates at 25 GT/sec, which corresponds to a
theoretical peak transfer rate of 50 GB/sec per link (or 100 GB/sec
bidirectional peak transfer bandwidth). The GCD pairs 2 and 6 as well as GCDs 0
and 4 connect via two XGMI links, which is indicated by the thicker red line in
the preceding image.
@@ -1,757 +0,0 @@
.. meta::
   :description: MI300 and MI200 series performance counters and metrics
   :keywords: MI300, MI200, performance counters, command processor counters

***************************************************************************************************
MI300 and MI200 series performance counters and metrics
***************************************************************************************************

This document lists and describes the hardware performance counters and derived metrics available
for the AMD Instinct™ MI300 and MI200 GPUs. You can also access this information using the
:doc:`ROCprofiler-SDK <rocprofiler-sdk:how-to/using-rocprofv3>`.

MI300 and MI200 series performance counters
===============================================================

The MI300 and MI200 series performance counters include the following categories:

* :ref:`command-processor-counters`
* :ref:`graphics-register-bus-manager-counters`
* :ref:`spi-counters`
* :ref:`compute-unit-counters`
* :ref:`l1i-and-sl1d-cache-counters`
* :ref:`vector-l1-cache-subsystem-counters`
* :ref:`l2-cache-access-counters`

The following sections provide additional details for each category.

.. note::

   Preliminary validation of all MI300 and MI200 series performance counters is in progress. Those with
   an asterisk (*) require further evaluation.

.. _command-processor-counters:
|
||||
|
||||
Command processor counters
|
||||
---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
Command processor counters are further classified into command processor-fetcher and command
|
||||
processor-compute.
|
||||
|
||||
Command processor-fetcher counters
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. csv-table::
|
||||
:header: "Hardware counter", "Unit", "Definition"
|
||||
|
||||
"``CPF_CMP_UTCL1_STALL_ON_TRANSLATION``", "Cycles", "Number of cycles one of the compute unified translation caches (L1) is stalled waiting on translation"
|
||||
"``CPF_CPF_STAT_BUSY``", "Cycles", "Number of cycles command processor-fetcher is busy"
|
||||
"``CPF_CPF_STAT_IDLE``", "Cycles", "Number of cycles command processor-fetcher is idle"
|
||||
"``CPF_CPF_STAT_STALL``", "Cycles", "Number of cycles command processor-fetcher is stalled"
|
||||
"``CPF_CPF_TCIU_BUSY``", "Cycles", "Number of cycles command processor-fetcher texture cache interface unit interface is busy"
|
||||
"``CPF_CPF_TCIU_IDLE``", "Cycles", "Number of cycles command processor-fetcher texture cache interface unit interface is idle"
|
||||
"``CPF_CPF_TCIU_STALL``", "Cycles", "Number of cycles command processor-fetcher texture cache interface unit interface is stalled waiting on free tags"
|
||||
|
||||
The texture cache interface unit is the interface between the command processor and the memory
|
||||
system.
|
||||
|
||||
Command processor-compute counters
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. csv-table::
|
||||
:header: "Hardware counter", "Unit", "Definition"
|
||||
|
||||
"``CPC_ME1_BUSY_FOR_PACKET_DECODE``", "Cycles", "Number of cycles command processor-compute micro engine is busy decoding packets"
|
||||
"``CPC_UTCL1_STALL_ON_TRANSLATION``", "Cycles", "Number of cycles one of the unified translation caches (L1) is stalled waiting on translation"
|
||||
"``CPC_CPC_STAT_BUSY``", "Cycles", "Number of cycles command processor-compute is busy"
|
||||
"``CPC_CPC_STAT_IDLE``", "Cycles", "Number of cycles command processor-compute is idle"
|
||||
"``CPC_CPC_STAT_STALL``", "Cycles", "Number of cycles command processor-compute is stalled"
|
||||
"``CPC_CPC_TCIU_BUSY``", "Cycles", "Number of cycles command processor-compute texture cache interface unit interface is busy"
|
||||
"``CPC_CPC_TCIU_IDLE``", "Cycles", "Number of cycles command processor-compute texture cache interface unit interface is idle"
|
||||
"``CPC_CPC_UTCL2IU_BUSY``", "Cycles", "Number of cycles command processor-compute unified translation cache (L2) interface is busy"
|
||||
"``CPC_CPC_UTCL2IU_IDLE``", "Cycles", "Number of cycles command processor-compute unified translation cache (L2) interface is idle"
|
||||
"``CPC_CPC_UTCL2IU_STALL``", "Cycles", "Number of cycles command processor-compute unified translation cache (L2) interface is stalled"
|
||||
"``CPC_ME1_DC0_SPI_BUSY``", "Cycles", "Number of cycles command processor-compute micro engine processor is busy"
|
||||
|
||||
The micro engine runs packet-processing firmware on the command processor-compute block.
|
||||
|
||||
.. _graphics-register-bus-manager-counters:
|
||||
|
||||
Graphics register bus manager counters
|
||||
---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
.. csv-table::
|
||||
:header: "Hardware counter", "Unit", "Definition"
|
||||
|
||||
"``GRBM_COUNT``", "Cycles","Number of free-running GPU cycles"
|
||||
"``GRBM_GUI_ACTIVE``", "Cycles", "Number of GPU active cycles"
|
||||
"``GRBM_CP_BUSY``", "Cycles", "Number of cycles any of the command processor blocks are busy"
|
||||
"``GRBM_SPI_BUSY``", "Cycles", "Number of cycles any of the shader processor input is busy in the shader engines"
|
||||
"``GRBM_TA_BUSY``", "Cycles", "Number of cycles any of the texture addressing unit is busy in the shader engines"
|
||||
"``GRBM_TC_BUSY``", "Cycles", "Number of cycles any of the texture cache blocks are busy"
|
||||
"``GRBM_CPC_BUSY``", "Cycles", "Number of cycles the command processor-compute is busy"
|
||||
"``GRBM_CPF_BUSY``", "Cycles", "Number of cycles the command processor-fetcher is busy"
|
||||
"``GRBM_UTCL2_BUSY``", "Cycles", "Number of cycles the unified translation cache (Level 2 [L2]) block is busy"
|
||||
"``GRBM_EA_BUSY``", "Cycles", "Number of cycles the efficiency arbiter block is busy"
|
||||
|
||||
Texture cache blocks include:
|
||||
|
||||
* Texture cache arbiter
|
||||
* Texture cache per pipe, also known as vector Level 1 (L1) cache
|
||||
* Texture cache per channel, also known as L2 cache
|
||||
* Texture cache interface
|
||||
|
||||
.. _spi-counters:
|
||||
|
||||
Shader processor input counters
|
||||
---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
.. csv-table::
|
||||
:header: "Hardware counter", "Unit", "Definition"
|
||||
|
||||
"``SPI_CSN_BUSY``", "Cycles", "Number of cycles with outstanding waves"
|
||||
"``SPI_CSN_WINDOW_VALID``", "Cycles", "Number of cycles enabled by ``perfcounter_start`` event"
|
||||
"``SPI_CSN_NUM_THREADGROUPS``", "Workgroups", "Number of dispatched workgroups"
|
||||
"``SPI_CSN_WAVE``", "Wavefronts", "Number of dispatched wavefronts"
|
||||
"``SPI_RA_REQ_NO_ALLOC``", "Cycles", "Number of arbiter cycles with requests but no allocation"
|
||||
"``SPI_RA_REQ_NO_ALLOC_CSN``", "Cycles", "Number of arbiter cycles with compute shader (n\ :sup:`th` pipe) requests but no compute shader (n\ :sup:`th` pipe) allocation"
|
||||
"``SPI_RA_RES_STALL_CSN``", "Cycles", "Number of arbiter stall cycles due to shortage of compute shader (n\ :sup:`th` pipe) pipeline slots"
|
||||
"``SPI_RA_TMP_STALL_CSN``", "Cycles", "Number of stall cycles due to shortage of temp space"
|
||||
"``SPI_RA_WAVE_SIMD_FULL_CSN``", "SIMD-cycles", "Accumulated number of single instruction, multiple data (SIMD) per cycle affected by shortage of wave slots for compute shader (n\ :sup:`th` pipe) wave dispatch"
|
||||
"``SPI_RA_VGPR_SIMD_FULL_CSN``", "SIMD-cycles", "Accumulated number of SIMDs per cycle affected by shortage of vector general-purpose register (VGPR) slots for compute shader (n\ :sup:`th` pipe) wave dispatch"
|
||||
"``SPI_RA_SGPR_SIMD_FULL_CSN``", "SIMD-cycles", "Accumulated number of SIMDs per cycle affected by shortage of scalar general-purpose register (SGPR) slots for compute shader (n\ :sup:`th` pipe) wave dispatch"
|
||||
"``SPI_RA_LDS_CU_FULL_CSN``", "CU", "Number of compute units affected by shortage of local data share (LDS) space for compute shader (n\ :sup:`th` pipe) wave dispatch"
|
||||
"``SPI_RA_BAR_CU_FULL_CSN``", "CU", "Number of compute units with compute shader (n\ :sup:`th` pipe) waves waiting at a BARRIER"
|
||||
"``SPI_RA_BULKY_CU_FULL_CSN``", "CU", "Number of compute units with compute shader (n\ :sup:`th` pipe) waves waiting for BULKY resource"
|
||||
"``SPI_RA_TGLIM_CU_FULL_CSN``", "Cycles", "Number of compute shader (n\ :sup:`th` pipe) wave stall cycles due to restriction of ``tg_limit`` for thread group size"
|
||||
"``SPI_RA_WVLIM_STALL_CSN``", "Cycles", "Number of cycles compute shader (n\ :sup:`th` pipe) is stalled due to ``WAVE_LIMIT``"
|
||||
"``SPI_VWC_CSC_WR``", "Qcycles", "Number of quad-cycles taken to initialize VGPRs when launching waves"
|
||||
"``SPI_SWC_CSC_WR``", "Qcycles", "Number of quad-cycles taken to initialize SGPRs when launching waves"
|
||||
|
||||
.. _compute-unit-counters:
|
||||
|
||||
Compute unit counters
|
||||
---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
The compute unit counters are further classified into instruction mix, matrix fused multiply-add (FMA)
|
||||
operation counters, level counters, wavefront counters, wavefront cycle counters, and LDS counters.
|
||||
|
||||
Instruction mix
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. csv-table::
|
||||
:header: "Hardware counter", "Unit", "Definition"
|
||||
|
||||
"``SQ_INSTS``", "Instr", "Number of instructions issued"
|
||||
"``SQ_INSTS_VALU``", "Instr", "Number of vector arithmetic logic unit (VALU) instructions including matrix FMA issued"
|
||||
"``SQ_INSTS_VALU_ADD_F16``", "Instr", "Number of VALU half-precision floating-point (F16) ``ADD`` or ``SUB`` instructions issued"
|
||||
"``SQ_INSTS_VALU_MUL_F16``", "Instr", "Number of VALU F16 Multiply instructions issued"
|
||||
"``SQ_INSTS_VALU_FMA_F16``", "Instr", "Number of VALU F16 FMA or multiply-add instructions issued"
|
||||
"``SQ_INSTS_VALU_TRANS_F16``", "Instr", "Number of VALU F16 Transcendental instructions issued"
|
||||
"``SQ_INSTS_VALU_ADD_F32``", "Instr", "Number of VALU full-precision floating-point (F32) ``ADD`` or ``SUB`` instructions issued"
|
||||
"``SQ_INSTS_VALU_MUL_F32``", "Instr", "Number of VALU F32 Multiply instructions issued"
"``SQ_INSTS_VALU_FMA_F32``", "Instr", "Number of VALU F32 FMA or multiply-add instructions issued"
|
||||
"``SQ_INSTS_VALU_TRANS_F32``", "Instr", "Number of VALU F32 Transcendental instructions issued"
|
||||
"``SQ_INSTS_VALU_ADD_F64``", "Instr", "Number of VALU F64 ``ADD`` or ``SUB`` instructions issued"
|
||||
"``SQ_INSTS_VALU_MUL_F64``", "Instr", "Number of VALU F64 Multiply instructions issued"
|
||||
"``SQ_INSTS_VALU_FMA_F64``", "Instr", "Number of VALU F64 FMA or multiply-add instructions issued"
|
||||
"``SQ_INSTS_VALU_TRANS_F64``", "Instr", "Number of VALU F64 Transcendental instructions issued"
|
||||
"``SQ_INSTS_VALU_INT32``", "Instr", "Number of VALU 32-bit integer instructions (signed or unsigned) issued"
|
||||
"``SQ_INSTS_VALU_INT64``", "Instr", "Number of VALU 64-bit integer instructions (signed or unsigned) issued"
|
||||
"``SQ_INSTS_VALU_CVT``", "Instr", "Number of VALU Conversion instructions issued"
|
||||
"``SQ_INSTS_VALU_MFMA_I8``", "Instr", "Number of 8-bit Integer matrix FMA instructions issued"
|
||||
"``SQ_INSTS_VALU_MFMA_F16``", "Instr", "Number of F16 matrix FMA instructions issued"
|
||||
"``SQ_INSTS_VALU_MFMA_F32``", "Instr", "Number of F32 matrix FMA instructions issued"
|
||||
"``SQ_INSTS_VALU_MFMA_F64``", "Instr", "Number of F64 matrix FMA instructions issued"
|
||||
"``SQ_INSTS_MFMA``", "Instr", "Number of matrix FMA instructions issued"
|
||||
"``SQ_INSTS_VMEM_WR``", "Instr", "Number of vector memory write instructions (including flat) issued"
|
||||
"``SQ_INSTS_VMEM_RD``", "Instr", "Number of vector memory read instructions (including flat) issued"
|
||||
"``SQ_INSTS_VMEM``", "Instr", "Number of vector memory instructions issued, including both flat and buffer instructions"
|
||||
"``SQ_INSTS_SALU``", "Instr", "Number of scalar arithmetic logic unit (SALU) instructions issued"
|
||||
"``SQ_INSTS_SMEM``", "Instr", "Number of scalar memory instructions issued"
|
||||
"``SQ_INSTS_SMEM_NORM``", "Instr", "Number of scalar memory instructions normalized to match ``smem_level`` issued"
|
||||
"``SQ_INSTS_FLAT``", "Instr", "Number of flat instructions issued"
|
||||
"``SQ_INSTS_FLAT_LDS_ONLY``", "Instr", "**MI200 series only** Number of FLAT instructions that read/write only from/to LDS issued. Works only if ``EARLY_TA_DONE`` is enabled."
|
||||
"``SQ_INSTS_LDS``", "Instr", "Number of LDS instructions issued **(MI200: includes flat; MI300: does not include flat)**"
|
||||
"``SQ_INSTS_GDS``", "Instr", "Number of global data share instructions issued"
|
||||
"``SQ_INSTS_EXP_GDS``", "Instr", "Number of EXP and global data share instructions excluding skipped export instructions issued"
|
||||
"``SQ_INSTS_BRANCH``", "Instr", "Number of branch instructions issued"
|
||||
"``SQ_INSTS_SENDMSG``", "Instr", "Number of ``SENDMSG`` instructions including ``s_endpgm`` issued"
|
||||
"``SQ_INSTS_VSKIPPED``", "Instr", "Number of vector instructions skipped"
|
||||
|
||||
Flat instructions allow read, write, and atomic access to a generic memory address pointer that can
|
||||
resolve to any of the following physical memories:
|
||||
|
||||
* Global Memory
|
||||
* Scratch ("private")
|
||||
* LDS ("shared")
|
||||
* Invalid - ``MEM_VIOL`` TrapStatus
|
||||
|
||||
Matrix fused multiply-add operation counters
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. csv-table::
|
||||
:header: "Hardware counter", "Unit", "Definition"
|
||||
|
||||
"``SQ_INSTS_VALU_MFMA_MOPS_I8``", "IOP", "Number of 8-bit integer matrix FMA ops in the unit of 512"
|
||||
"``SQ_INSTS_VALU_MFMA_MOPS_F16``", "FLOP", "Number of F16 floating matrix FMA ops in the unit of 512"
|
||||
"``SQ_INSTS_VALU_MFMA_MOPS_BF16``", "FLOP", "Number of BF16 floating matrix FMA ops in the unit of 512"
|
||||
"``SQ_INSTS_VALU_MFMA_MOPS_F32``", "FLOP", "Number of F32 floating matrix FMA ops in the unit of 512"
|
||||
"``SQ_INSTS_VALU_MFMA_MOPS_F64``", "FLOP", "Number of F64 floating matrix FMA ops in the unit of 512"
|
||||
|
||||
Level counters
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. note::
|
||||
|
||||
All level counters must be followed by the ``SQ_ACCUM_PREV_HIRES`` counter to measure average latency.
|
||||
|
||||
.. csv-table::
|
||||
:header: "Hardware counter", "Unit", "Definition"
|
||||
|
||||
"``SQ_ACCUM_PREV``", "Count", "Accumulated counter sample value where accumulation takes place once every four cycles"
|
||||
"``SQ_ACCUM_PREV_HIRES``", "Count", "Accumulated counter sample value where accumulation takes place once every cycle"
|
||||
"``SQ_LEVEL_WAVES``", "Waves", "Number of inflight waves"
|
||||
"``SQ_INST_LEVEL_VMEM``", "Instr", "Number of inflight vector memory (including flat) instructions"
|
||||
"``SQ_INST_LEVEL_SMEM``", "Instr", "Number of inflight scalar memory instructions"
|
||||
"``SQ_INST_LEVEL_LDS``", "Instr", "Number of inflight LDS (including flat) instructions"
|
||||
"``SQ_IFETCH_LEVEL``", "Instr", "Number of inflight instruction fetch requests from the cache"
|
||||
|
||||
Use the following formulas to calculate latencies:

* Vector memory latency = ``SQ_ACCUM_PREV_HIRES`` divided by ``SQ_INSTS_VMEM``
* Wave latency = ``SQ_ACCUM_PREV_HIRES`` divided by ``SQ_WAVES``
* LDS latency = ``SQ_ACCUM_PREV_HIRES`` divided by ``SQ_INSTS_LDS``
* Scalar memory latency = ``SQ_ACCUM_PREV_HIRES`` divided by ``SQ_INSTS_SMEM_NORM``
* Instruction fetch latency = ``SQ_ACCUM_PREV_HIRES`` divided by ``SQ_IFETCH``

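As an illustrative sketch (not part of the original list), each derived latency is the ratio of the
accumulated level counter to the matching event counter; the values below are hypothetical and stand
for counters already summed over the collection window:

.. code-block:: python

   # Illustrative derived-latency calculation. Each latency requires collecting
   # SQ_ACCUM_PREV_HIRES together with the matching level counter, as noted above.
   def average_latency(accum_prev_hires: int, event_count: int) -> float:
       """Average latency in cycles = accumulated level counter / number of events."""
       return accum_prev_hires / event_count if event_count else 0.0

   # Example: vector memory latency from one profiling pass (hypothetical values).
   sq_accum_prev_hires = 1_250_000   # SQ_ACCUM_PREV_HIRES accumulated behind SQ_INST_LEVEL_VMEM
   sq_insts_vmem = 4_000             # SQ_INSTS_VMEM for the same window
   print(f"Average vector memory latency: {average_latency(sq_accum_prev_hires, sq_insts_vmem):.1f} cycles")
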
Wavefront counters
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. csv-table::
|
||||
:header: "Hardware counter", "Unit", "Definition"
|
||||
|
||||
"``SQ_WAVES``", "Waves", "Number of wavefronts dispatched to sequencers, including both new and restored wavefronts"
|
||||
"``SQ_WAVES_SAVED``", "Waves", "Number of context-saved waves"
|
||||
"``SQ_WAVES_RESTORED``", "Waves", "Number of context-restored waves sent to sequencers"
|
||||
"``SQ_WAVES_EQ_64``", "Waves", "Number of wavefronts with exactly 64 active threads sent to sequencers"
|
||||
"``SQ_WAVES_LT_64``", "Waves", "Number of wavefronts with less than 64 active threads sent to sequencers"
|
||||
"``SQ_WAVES_LT_48``", "Waves", "Number of wavefronts with less than 48 active threads sent to sequencers"
|
||||
"``SQ_WAVES_LT_32``", "Waves", "Number of wavefronts with less than 32 active threads sent to sequencers"
|
||||
"``SQ_WAVES_LT_16``", "Waves", "Number of wavefronts with less than 16 active threads sent to sequencers"
|
||||
|
||||
Wavefront cycle counters
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. csv-table::
|
||||
:header: "Hardware counter", "Unit", "Definition"
|
||||
|
||||
"``SQ_CYCLES``", "Cycles", "Clock cycles"
"``SQ_BUSY_CYCLES``", "Cycles", "Number of cycles while the sequencer reports it to be busy"
|
||||
"``SQ_BUSY_CU_CYCLES``", "Qcycles", "Number of quad-cycles each compute unit is busy"
|
||||
"``SQ_VALU_MFMA_BUSY_CYCLES``", "Cycles", "Number of cycles the matrix FMA arithmetic logic unit (ALU) is busy"
|
||||
"``SQ_WAVE_CYCLES``", "Qcycles", "Number of quad-cycles spent by waves in the compute units"
|
||||
"``SQ_WAIT_ANY``", "Qcycles", "Number of quad-cycles spent waiting for anything"
|
||||
"``SQ_WAIT_INST_ANY``", "Qcycles", "Number of quad-cycles spent waiting for any instruction to be issued"
|
||||
"``SQ_ACTIVE_INST_ANY``", "Qcycles", "Number of quad-cycles spent by each wave to work on an instruction"
|
||||
"``SQ_ACTIVE_INST_VMEM``", "Qcycles", "Number of quad-cycles spent by the sequencer instruction arbiter to work on a vector memory instruction"
|
||||
"``SQ_ACTIVE_INST_LDS``", "Qcycles", "Number of quad-cycles spent by the sequencer instruction arbiter to work on an LDS instruction"
|
||||
"``SQ_ACTIVE_INST_VALU``", "Qcycles", "Number of quad-cycles spent by the sequencer instruction arbiter to work on a VALU instruction"
|
||||
"``SQ_ACTIVE_INST_SCA``", "Qcycles", "Number of quad-cycles spent by the sequencer instruction arbiter to work on a SALU or scalar memory instruction"
|
||||
"``SQ_ACTIVE_INST_EXP_GDS``", "Qcycles", "Number of quad-cycles spent by the sequencer instruction arbiter to work on an ``EXPORT`` or ``GDS`` instruction"
|
||||
"``SQ_ACTIVE_INST_MISC``", "Qcycles", "Number of quad-cycles spent by the sequencer instruction arbiter to work on a ``BRANCH`` or ``SENDMSG`` instruction"
|
||||
"``SQ_ACTIVE_INST_FLAT``", "Qcycles", "Number of quad-cycles spent by the sequencer instruction arbiter to work on a flat instruction"
|
||||
"``SQ_INST_CYCLES_VMEM_WR``", "Qcycles", "Number of quad-cycles spent to send addr and cmd data for vector memory write instructions"
|
||||
"``SQ_INST_CYCLES_VMEM_RD``", "Qcycles", "Number of quad-cycles spent to send addr and cmd data for vector memory read instructions"
|
||||
"``SQ_INST_CYCLES_SMEM``", "Qcycles", "Number of quad-cycles spent to execute scalar memory reads"
|
||||
"``SQ_INST_CYCLES_SALU``", "Qcycles", "Number of quad-cycles spent to execute non-memory read scalar operations"
|
||||
"``SQ_THREAD_CYCLES_VALU``", "Qcycles", "Number of quad-cycles spent to execute VALU operations on active threads"
|
||||
"``SQ_WAIT_INST_LDS``", "Qcycles", "Number of quad-cycles spent waiting for LDS instruction to be issued"
|
||||
|
||||
``SQ_THREAD_CYCLES_VALU`` is similar to ``INST_CYCLES_VALU``, but it's multiplied by the number of
|
||||
active threads.
|
||||
|
||||
LDS counters
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. csv-table::
|
||||
:header: "Hardware counter", "Unit", "Definition"
|
||||
|
||||
"``SQ_LDS_ATOMIC_RETURN``", "Cycles", "Number of atomic return cycles in LDS"
|
||||
"``SQ_LDS_BANK_CONFLICT``", "Cycles", "Number of cycles LDS is stalled by bank conflicts"
|
||||
"``SQ_LDS_ADDR_CONFLICT``", "Cycles", "Number of cycles LDS is stalled by address conflicts"
|
||||
"``SQ_LDS_UNALIGNED_STALL``", "Cycles", "Number of cycles LDS is stalled processing flat unaligned load or store operations"
|
||||
"``SQ_LDS_MEM_VIOLATIONS``", "Count", "Number of threads that have a memory violation in the LDS"
|
||||
"``SQ_LDS_IDX_ACTIVE``", "Cycles", "Number of cycles LDS is used for indexed operations"
|
||||
|
||||
Miscellaneous counters
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. csv-table::
|
||||
:header: "Hardware counter", "Unit", "Definition"
|
||||
|
||||
"``SQ_IFETCH``", "Count", "Number of instruction fetch requests from L1i, in 32-byte width"
|
||||
"``SQ_ITEMS``", "Threads", "Number of valid items per wave"
|
||||
|
||||
.. _l1i-and-sl1d-cache-counters:
|
||||
|
||||
L1 instruction cache (L1i) and scalar L1 data cache (L1d) counters
|
||||
---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
.. csv-table::
|
||||
:header: "Hardware counter", "Unit", "Definition"
|
||||
|
||||
"``SQC_ICACHE_REQ``", "Req", "Number of L1 instruction (L1i) cache requests"
|
||||
"``SQC_ICACHE_HITS``", "Count", "Number of L1i cache hits"
|
||||
"``SQC_ICACHE_MISSES``", "Count", "Number of non-duplicate L1i cache misses including uncached requests"
|
||||
"``SQC_ICACHE_MISSES_DUPLICATE``", "Count", "Number of duplicate L1i cache misses whose previous lookup miss on the same cache line is not fulfilled yet"
|
||||
"``SQC_DCACHE_REQ``", "Req", "Number of scalar L1d requests"
|
||||
"``SQC_DCACHE_INPUT_VALID_READYB``", "Cycles", "Number of cycles while sequencer input is valid but scalar L1d is not ready"
|
||||
"``SQC_DCACHE_HITS``", "Count", "Number of scalar L1d hits"
|
||||
"``SQC_DCACHE_MISSES``", "Count", "Number of non-duplicate scalar L1d misses including uncached requests"
|
||||
"``SQC_DCACHE_MISSES_DUPLICATE``", "Count", "Number of duplicate scalar L1d misses"
|
||||
"``SQC_DCACHE_REQ_READ_1``", "Req", "Number of constant cache read requests in a single 32-bit data word"
|
||||
"``SQC_DCACHE_REQ_READ_2``", "Req", "Number of constant cache read requests in two 32-bit data words"
|
||||
"``SQC_DCACHE_REQ_READ_4``", "Req", "Number of constant cache read requests in four 32-bit data words"
|
||||
"``SQC_DCACHE_REQ_READ_8``", "Req", "Number of constant cache read requests in eight 32-bit data words"
|
||||
"``SQC_DCACHE_REQ_READ_16``", "Req", "Number of constant cache read requests in 16 32-bit data words"
|
||||
"``SQC_DCACHE_ATOMIC``", "Req", "Number of atomic requests"
|
||||
"``SQC_TC_REQ``", "Req", "Number of texture cache requests that were issued by instruction and constant caches"
|
||||
"``SQC_TC_INST_REQ``", "Req", "Number of instruction requests to the L2 cache"
|
||||
"``SQC_TC_DATA_READ_REQ``", "Req", "Number of data Read requests to the L2 cache"
|
||||
"``SQC_TC_DATA_WRITE_REQ``", "Req", "Number of data write requests to the L2 cache"
|
||||
"``SQC_TC_DATA_ATOMIC_REQ``", "Req", "Number of data atomic requests to the L2 cache"
|
||||
"``SQC_TC_STALL``", "Cycles", "Number of cycles while the valid requests to the L2 cache are stalled"
|
||||
|
||||
.. _vector-l1-cache-subsystem-counters:
|
||||
|
||||
Vector L1 cache subsystem counters
|
||||
---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
The vector L1 cache subsystem counters are further classified into texture addressing unit, texture data
|
||||
unit, vector L1d or texture cache per pipe, and texture cache arbiter counters.
|
||||
|
||||
Texture addressing unit counters
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. csv-table::
|
||||
:header: "Hardware counter", "Unit", "Definition", "Value range for ``n``"
|
||||
|
||||
"``TA_TA_BUSY[n]``", "Cycles", "Texture addressing unit busy cycles", "0-15"
|
||||
"``TA_TOTAL_WAVEFRONTS[n]``", "Instr", "Number of wavefronts processed by texture addressing unit", "0-15"
|
||||
"``TA_BUFFER_WAVEFRONTS[n]``", "Instr", "Number of buffer wavefronts processed by texture addressing unit", "0-15"
|
||||
"``TA_BUFFER_READ_WAVEFRONTS[n]``", "Instr", "Number of buffer read wavefronts processed by texture addressing unit", "0-15"
|
||||
"``TA_BUFFER_WRITE_WAVEFRONTS[n]``", "Instr", "Number of buffer write wavefronts processed by texture addressing unit", "0-15"
|
||||
"``TA_BUFFER_ATOMIC_WAVEFRONTS[n]``", "Instr", "Number of buffer atomic wavefronts processed by texture addressing unit", "0-15"
|
||||
"``TA_BUFFER_TOTAL_CYCLES[n]``", "Cycles", "Number of buffer cycles (including read and write) issued to texture cache", "0-15"
|
||||
"``TA_BUFFER_COALESCED_READ_CYCLES[n]``", "Cycles", "Number of coalesced buffer read cycles issued to texture cache", "0-15"
|
||||
"``TA_BUFFER_COALESCED_WRITE_CYCLES[n]``", "Cycles", "Number of coalesced buffer write cycles issued to texture cache", "0-15"
|
||||
"``TA_ADDR_STALLED_BY_TC_CYCLES[n]``", "Cycles", "Number of cycles texture addressing unit address path is stalled by texture cache", "0-15"
|
||||
"``TA_DATA_STALLED_BY_TC_CYCLES[n]``", "Cycles", "Number of cycles texture addressing unit data path is stalled by texture cache", "0-15"
|
||||
"``TA_ADDR_STALLED_BY_TD_CYCLES[n]``", "Cycles", "Number of cycles texture addressing unit address path is stalled by texture data unit", "0-15"
|
||||
"``TA_FLAT_WAVEFRONTS[n]``", "Instr", "Number of flat opcode wavefronts processed by texture addressing unit", "0-15"
|
||||
"``TA_FLAT_READ_WAVEFRONTS[n]``", "Instr", "Number of flat opcode read wavefronts processed by texture addressing unit", "0-15"
|
||||
"``TA_FLAT_WRITE_WAVEFRONTS[n]``", "Instr", "Number of flat opcode write wavefronts processed by texture addressing unit", "0-15"
|
||||
"``TA_FLAT_ATOMIC_WAVEFRONTS[n]``", "Instr", "Number of flat opcode atomic wavefronts processed by texture addressing unit", "0-15"
|
||||
|
||||
Texture data unit counters
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. csv-table::
|
||||
:header: "Hardware counter", "Unit", "Definition", "Value range for ``n``"
|
||||
|
||||
"``TD_TD_BUSY[n]``", "Cycle", "Texture data unit busy cycles while it is processing or waiting for data", "0-15"
|
||||
"``TD_TC_STALL[n]``", "Cycle", "Number of cycles texture data unit is stalled waiting for texture cache data", "0-15"
|
||||
"``TD_SPI_STALL[n]``", "Cycle", "Number of cycles texture data unit is stalled by shader processor input", "0-15"
|
||||
"``TD_LOAD_WAVEFRONT[n]``", "Instr", "Number of wavefront instructions (read, write, atomic)", "0-15"
|
||||
"``TD_STORE_WAVEFRONT[n]``", "Instr", "Number of write wavefront instructions", "0-15"
|
||||
"``TD_ATOMIC_WAVEFRONT[n]``", "Instr", "Number of atomic wavefront instructions", "0-15"
|
||||
"``TD_COALESCABLE_WAVEFRONT[n]``", "Instr", "Number of coalescable wavefronts according to texture addressing unit", "0-15"
|
||||
|
||||
Texture cache per pipe counters
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. csv-table::
|
||||
:header: "Hardware counter", "Unit", "Definition", "Value range for ``n``"
|
||||
|
||||
"``TCP_GATE_EN1[n]``", "Cycles", "Number of cycles vector L1d interface clocks are turned on", "0-15"
|
||||
"``TCP_GATE_EN2[n]``", "Cycles", "Number of cycles vector L1d core clocks are turned on", "0-15"
|
||||
"``TCP_TD_TCP_STALL_CYCLES[n]``", "Cycles", "Number of cycles texture data unit stalls vector L1d", "0-15"
|
||||
"``TCP_TCR_TCP_STALL_CYCLES[n]``", "Cycles", "Number of cycles texture cache router stalls vector L1d", "0-15"
|
||||
"``TCP_READ_TAGCONFLICT_STALL_CYCLES[n]``", "Cycles", "Number of cycles tag RAM conflict stalls on a read", "0-15"
|
||||
"``TCP_WRITE_TAGCONFLICT_STALL_CYCLES[n]``", "Cycles", "Number of cycles tag RAM conflict stalls on a write", "0-15"
|
||||
"``TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES[n]``", "Cycles", "Number of cycles tag RAM conflict stalls on an atomic", "0-15"
|
||||
"``TCP_PENDING_STALL_CYCLES[n]``", "Cycles", "Number of cycles vector L1d is stalled due to data pending from L2 Cache", "0-15"
|
||||
"``TCP_TCP_TA_DATA_STALL_CYCLES``", "Cycles", "Number of cycles texture cache per pipe stalls texture addressing unit data interface", "NA"
|
||||
"``TCP_TA_TCP_STATE_READ[n]``", "Req", "Number of state reads", "0-15"
|
||||
"``TCP_VOLATILE[n]``", "Req", "Number of L1 volatile pixels or buffers from texture addressing unit", "0-15"
"``TCP_TOTAL_ACCESSES[n]``", "Req", "Number of vector L1d accesses. Equals ``TCP_PERF_SEL_TOTAL_READ`` + ``TCP_PERF_SEL_TOTAL_NONREAD``", "0-15"
|
||||
"``TCP_TOTAL_READ[n]``", "Req", "Number of vector L1d read accesses", "0-15"
|
||||
"``TCP_TOTAL_WRITE[n]``", "Req", "Number of vector L1d write accesses", "0-15"
|
||||
"``TCP_TOTAL_ATOMIC_WITH_RET[n]``", "Req", "Number of vector L1d atomic requests with return", "0-15"
"``TCP_TOTAL_ATOMIC_WITHOUT_RET[n]``", "Req", "Number of vector L1d atomic requests without return", "0-15"
|
||||
"``TCP_TOTAL_WRITEBACK_INVALIDATES[n]``", "Count", "Total number of vector L1d writebacks and invalidates", "0-15"
|
||||
"``TCP_UTCL1_REQUEST[n]``", "Req", "Number of address translation requests to unified translation cache (L1)", "0-15"
|
||||
"``TCP_UTCL1_TRANSLATION_HIT[n]``", "Req", "Number of unified translation cache (L1) translation hits", "0-15"
|
||||
"``TCP_UTCL1_TRANSLATION_MISS[n]``", "Req", "Number of unified translation cache (L1) translation misses", "0-15"
|
||||
"``TCP_UTCL1_PERMISSION_MISS[n]``", "Req", "Number of unified translation cache (L1) permission misses", "0-15"
|
||||
"``TCP_TOTAL_CACHE_ACCESSES[n]``", "Req", "Number of vector L1d cache accesses including hits and misses", "0-15"
|
||||
"``TCP_TCP_LATENCY[n]``", "Cycles", "**MI200 series only** Accumulated wave access latency to vL1D over all wavefronts", "0-15"
|
||||
"``TCP_TCC_READ_REQ_LATENCY[n]``", "Cycles", "**MI200 series only** Total vL1D to L2 request latency over all wavefronts for reads and atomics with return", "0-15"
|
||||
"``TCP_TCC_WRITE_REQ_LATENCY[n]``", "Cycles", "**MI200 series only** Total vL1D to L2 request latency over all wavefronts for writes and atomics without return", "0-15"
|
||||
"``TCP_TCC_READ_REQ[n]``", "Req", "Number of read requests to L2 cache", "0-15"
|
||||
"``TCP_TCC_WRITE_REQ[n]``", "Req", "Number of write requests to L2 cache", "0-15"
|
||||
"``TCP_TCC_ATOMIC_WITH_RET_REQ[n]``", "Req", "Number of atomic requests to L2 cache with return", "0-15"
|
||||
"``TCP_TCC_ATOMIC_WITHOUT_RET_REQ[n]``", "Req", "Number of atomic requests to L2 cache without return", "0-15"
|
||||
"``TCP_TCC_NC_READ_REQ[n]``", "Req", "Number of non-coherently cached read requests to L2 cache", "0-15"
|
||||
"``TCP_TCC_UC_READ_REQ[n]``", "Req", "Number of uncached read requests to L2 cache", "0-15"
|
||||
"``TCP_TCC_CC_READ_REQ[n]``", "Req", "Number of coherently cached read requests to L2 cache", "0-15"
|
||||
"``TCP_TCC_RW_READ_REQ[n]``", "Req", "Number of coherently cached with write read requests to L2 cache", "0-15"
|
||||
"``TCP_TCC_NC_WRITE_REQ[n]``", "Req", "Number of non-coherently cached write requests to L2 cache", "0-15"
|
||||
"``TCP_TCC_UC_WRITE_REQ[n]``", "Req", "Number of uncached write requests to L2 cache", "0-15"
|
||||
"``TCP_TCC_CC_WRITE_REQ[n]``", "Req", "Number of coherently cached write requests to L2 cache", "0-15"
|
||||
"``TCP_TCC_RW_WRITE_REQ[n]``", "Req", "Number of coherently cached with write write requests to L2 cache", "0-15"
|
||||
"``TCP_TCC_NC_ATOMIC_REQ[n]``", "Req", "Number of non-coherently cached atomic requests to L2 cache", "0-15"
|
||||
"``TCP_TCC_UC_ATOMIC_REQ[n]``", "Req", "Number of uncached atomic requests to L2 cache", "0-15"
|
||||
"``TCP_TCC_CC_ATOMIC_REQ[n]``", "Req", "Number of coherently cached atomic requests to L2 cache", "0-15"
|
||||
"``TCP_TCC_RW_ATOMIC_REQ[n]``", "Req", "Number of coherently cached with write atomic requests to L2 cache", "0-15"
|
||||
|
||||
Note that:

* ``TCP_TOTAL_READ[n]`` = ``TCP_PERF_SEL_TOTAL_HIT_LRU_READ`` + ``TCP_PERF_SEL_TOTAL_MISS_LRU_READ`` + ``TCP_PERF_SEL_TOTAL_MISS_EVICT_READ``
* ``TCP_TOTAL_WRITE[n]`` = ``TCP_PERF_SEL_TOTAL_MISS_LRU_WRITE`` + ``TCP_PERF_SEL_TOTAL_MISS_EVICT_WRITE``
* ``TCP_TOTAL_WRITEBACK_INVALIDATES[n]`` = ``TCP_PERF_SEL_TOTAL_WBINVL1`` + ``TCP_PERF_SEL_TOTAL_WBINVL1_VOL`` + ``TCP_PERF_SEL_CP_TCP_INVALIDATE`` + ``TCP_PERF_SEL_SQ_TCP_INVALIDATE_VOL``

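Because these counters are reported per vector L1d instance (``n`` from 0 to 15), whole-GPU totals
are usually obtained by summing over the instances; a small illustrative sketch with hypothetical
values (not part of the original text):

.. code-block:: python

   # Illustrative aggregation of per-instance TCP counters (hypothetical values).
   # Counters such as TCP_TOTAL_READ[n] report one value per vector L1d instance, n = 0..15.
   tcp_total_read = [12_500, 11_900, 13_200, 12_800] + [12_000] * 12
   tcp_total_write = [3_100, 2_900, 3_300, 3_000] + [3_000] * 12

   print(f"Total vL1D reads:  {sum(tcp_total_read)}")
   print(f"Total vL1D writes: {sum(tcp_total_write)}")
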
Texture cache arbiter counters
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. csv-table::
|
||||
:header: "Hardware counter", "Unit", "Definition", "Value range for ``n``"
|
||||
|
||||
"``TCA_CYCLE[n]``", "Cycles", "Number of texture cache arbiter cycles", "0-31"
|
||||
"``TCA_BUSY[n]``", "Cycles", "Number of cycles texture cache arbiter has a pending request", "0-31"
|
||||
|
||||
.. _l2-cache-access-counters:
|
||||
|
||||
L2 cache access counters
|
||||
---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
L2 cache is also known as texture cache per channel.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MI300 hardware counter
|
||||
|
||||
.. csv-table::
|
||||
:header: "Hardware counter", "Unit", "Definition", "Value range for ``n``"
|
||||
|
||||
"``TCC_CYCLE[n]``", "Cycles", "Number of L2 cache free-running clocks", "0-31"
|
||||
"``TCC_BUSY[n]``", "Cycles", "Number of L2 cache busy cycles", "0-31"
|
||||
"``TCC_REQ[n]``", "Req", "Number of L2 cache requests of all types (measured at the tag block)", "0-31"
|
||||
"``TCC_STREAMING_REQ[n]``", "Req", "Number of L2 cache streaming requests (measured at the tag block)", "0-31"
|
||||
"``TCC_NC_REQ[n]``", "Req", "Number of non-coherently cached requests (measured at the tag block)", "0-31"
|
||||
"``TCC_UC_REQ[n]``", "Req", "Number of uncached requests. This is measured at the tag block", "0-31"
|
||||
"``TCC_CC_REQ[n]``", "Req", "Number of coherently cached requests. This is measured at the tag block", "0-31"
|
||||
"``TCC_RW_REQ[n]``", "Req", "Number of coherently cached with write requests. This is measured at the tag block", "0-31"
|
||||
"``TCC_PROBE[n]``", "Req", "Number of probe requests", "0-31"
|
||||
"``TCC_PROBE_ALL[n]``", "Req", "Number of external probe requests with ``EA_TCC_preq_all == 1``", "0-31"
|
||||
"``TCC_READ[n]``", "Req", "Number of L2 cache read requests (includes compressed reads but not metadata reads)", "0-31"
|
||||
"``TCC_WRITE[n]``", "Req", "Number of L2 cache write requests", "0-31"
|
||||
"``TCC_ATOMIC[n]``", "Req", "Number of L2 cache atomic requests of all types", "0-31"
|
||||
"``TCC_HIT[n]``", "Req", "Number of L2 cache hits", "0-31"
|
||||
"``TCC_MISS[n]``", "Req", "Number of L2 cache misses", "0-31"
|
||||
"``TCC_WRITEBACK[n]``", "Req", "Number of lines written back to the main memory, including writebacks of dirty lines and uncached write or atomic requests", "0-31"
|
||||
"``TCC_EA0_WRREQ[n]``", "Req", "Number of 32-byte and 64-byte transactions going over the ``TC_EA_wrreq`` interface (doesn't include probe commands)", "0-31"
|
||||
"``TCC_EA0_WRREQ_64B[n]``", "Req", "Total number of 64-byte transactions (write or ``CMPSWAP``) going over the ``TC_EA_wrreq`` interface", "0-31"
"``TCC_EA0_WR_UNCACHED_32B[n]``", "Req", "Number of 32-byte or 64-byte write or atomic requests going over the ``TC_EA_wrreq`` interface due to uncached traffic", "0-31"
|
||||
"``TCC_EA0_WRREQ_STALL[n]``", "Cycles", "Number of cycles a write request is stalled", "0-31"
|
||||
"``TCC_EA0_WRREQ_IO_CREDIT_STALL[n]``", "Cycles", "Number of cycles an efficiency arbiter write request is stalled due to the interface running out of input-output (IO) credits", "0-31"
|
||||
"``TCC_EA0_WRREQ_GMI_CREDIT_STALL[n]``", "Cycles", "Number of cycles an efficiency arbiter write request is stalled due to the interface running out of GMI credits", "0-31"
|
||||
"``TCC_EA0_WRREQ_DRAM_CREDIT_STALL[n]``", "Cycles", "Number of cycles an efficiency arbiter write request is stalled due to the interface running out of DRAM credits", "0-31"
|
||||
"``TCC_TOO_MANY_EA_WRREQS_STALL[n]``", "Cycles", "Number of cycles the L2 cache is unable to send an efficiency arbiter write request due to it reaching its maximum capacity of pending efficiency arbiter write requests", "0-31"
|
||||
"``TCC_EA0_WRREQ_LEVEL[n]``", "Req", "The accumulated number of efficiency arbiter write requests in flight", "0-31"
|
||||
"``TCC_EA0_ATOMIC[n]``", "Req", "Number of 32-byte or 64-byte atomic requests going over the ``TC_EA_wrreq`` interface", "0-31"
|
||||
"``TCC_EA0_ATOMIC_LEVEL[n]``", "Req", "The accumulated number of efficiency arbiter atomic requests in flight", "0-31"
|
||||
"``TCC_EA0_RDREQ[n]``", "Req", "Number of 32-byte or 64-byte read requests to efficiency arbiter", "0-31"
|
||||
"``TCC_EA0_RDREQ_32B[n]``", "Req", "Number of 32-byte read requests to efficiency arbiter", "0-31"
|
||||
"``TCC_EA0_RD_UNCACHED_32B[n]``", "Req", "Number of 32-byte efficiency arbiter reads due to uncached traffic. A 64-byte request is counted as 2", "0-31"
|
||||
"``TCC_EA0_RDREQ_IO_CREDIT_STALL[n]``", "Cycles", "Number of cycles there is a stall due to the read request interface running out of IO credits", "0-31"
|
||||
"``TCC_EA0_RDREQ_GMI_CREDIT_STALL[n]``", "Cycles", "Number of cycles there is a stall due to the read request interface running out of GMI credits", "0-31"
|
||||
"``TCC_EA0_RDREQ_DRAM_CREDIT_STALL[n]``", "Cycles", "Number of cycles there is a stall due to the read request interface running out of DRAM credits", "0-31"
|
||||
"``TCC_EA0_RDREQ_LEVEL[n]``", "Req", "The accumulated number of efficiency arbiter read requests in flight", "0-31"
|
||||
"``TCC_EA0_RDREQ_DRAM[n]``", "Req", "Number of 32-byte or 64-byte efficiency arbiter read requests to High Bandwidth Memory (HBM)", "0-31"
|
||||
"``TCC_EA0_WRREQ_DRAM[n]``", "Req", "Number of 32-byte or 64-byte efficiency arbiter write requests to HBM", "0-31"
|
||||
"``TCC_TAG_STALL[n]``", "Cycles", "Number of cycles the normal request pipeline in the tag is stalled for any reason", "0-31"
|
||||
"``TCC_NORMAL_WRITEBACK[n]``", "Req", "Number of writebacks due to requests that are not writeback requests", "0-31"
|
||||
"``TCC_ALL_TC_OP_WB_WRITEBACK[n]``", "Req", "Number of writebacks due to all ``TC_OP`` writeback requests", "0-31"
|
||||
"``TCC_NORMAL_EVICT[n]``", "Req", "Number of evictions due to requests that are not invalidate or probe requests", "0-31"
|
||||
"``TCC_ALL_TC_OP_INV_EVICT[n]``", "Req", "Number of evictions due to all ``TC_OP`` invalidate requests", "0-31"
|
||||
|
||||
.. tab-item:: MI200 hardware counter
|
||||
|
||||
.. csv-table::
|
||||
:header: "Hardware counter", "Unit", "Definition", "Value range for ``n``"
|
||||
|
||||
"``TCC_CYCLE[n]``", "Cycles", "Number of L2 cache free-running clocks", "0-31"
|
||||
"``TCC_BUSY[n]``", "Cycles", "Number of L2 cache busy cycles", "0-31"
|
||||
"``TCC_REQ[n]``", "Req", "Number of L2 cache requests of all types (measured at the tag block)", "0-31"
|
||||
"``TCC_STREAMING_REQ[n]``", "Req", "Number of L2 cache streaming requests (measured at the tag block)", "0-31"
|
||||
"``TCC_NC_REQ[n]``", "Req", "Number of non-coherently cached requests (measured at the tag block)", "0-31"
|
||||
"``TCC_UC_REQ[n]``", "Req", "Number of uncached requests. This is measured at the tag block", "0-31"
|
||||
"``TCC_CC_REQ[n]``", "Req", "Number of coherently cached requests. This is measured at the tag block", "0-31"
|
||||
"``TCC_RW_REQ[n]``", "Req", "Number of coherently cached with write requests. This is measured at the tag block", "0-31"
|
||||
"``TCC_PROBE[n]``", "Req", "Number of probe requests", "0-31"
|
||||
"``TCC_PROBE_ALL[n]``", "Req", "Number of external probe requests with ``EA_TCC_preq_all == 1``", "0-31"
|
||||
"``TCC_READ[n]``", "Req", "Number of L2 cache read requests (includes compressed reads but not metadata reads)", "0-31"
|
||||
"``TCC_WRITE[n]``", "Req", "Number of L2 cache write requests", "0-31"
|
||||
"``TCC_ATOMIC[n]``", "Req", "Number of L2 cache atomic requests of all types", "0-31"
|
||||
"``TCC_HIT[n]``", "Req", "Number of L2 cache hits", "0-31"
|
||||
"``TCC_MISS[n]``", "Req", "Number of L2 cache misses", "0-31"
|
||||
"``TCC_WRITEBACK[n]``", "Req", "Number of lines written back to the main memory, including writebacks of dirty lines and uncached write or atomic requests", "0-31"
|
||||
"``TCC_EA_WRREQ[n]``", "Req", "Number of 32-byte and 64-byte transactions going over the ``TC_EA_wrreq`` interface (doesn't include probe commands)", "0-31"
|
||||
"``TCC_EA_WRREQ_64B[n]``", "Req", "Total number of 64-byte transactions (write or ``CMPSWAP``) going over the ``TC_EA_wrreq`` interface", "0-31"
"``TCC_EA_WR_UNCACHED_32B[n]``", "Req", "Number of 32-byte write or atomic requests going over the ``TC_EA_wrreq`` interface due to uncached traffic. A 64-byte request will be counted as 2", "0-31"
|
||||
"``TCC_EA_WRREQ_STALL[n]``", "Cycles", "Number of cycles a write request is stalled", "0-31"
|
||||
"``TCC_EA_WRREQ_IO_CREDIT_STALL[n]``", "Cycles", "Number of cycles an efficiency arbiter write request is stalled due to the interface running out of input-output (IO) credits", "0-31"
|
||||
"``TCC_EA_WRREQ_GMI_CREDIT_STALL[n]``", "Cycles", "Number of cycles an efficiency arbiter write request is stalled due to the interface running out of GMI credits", "0-31"
|
||||
"``TCC_EA_WRREQ_DRAM_CREDIT_STALL[n]``", "Cycles", "Number of cycles an efficiency arbiter write request is stalled due to the interface running out of DRAM credits", "0-31"
|
||||
"``TCC_TOO_MANY_EA_WRREQS_STALL[n]``", "Cycles", "Number of cycles the L2 cache is unable to send an efficiency arbiter write request due to it reaching its maximum capacity of pending efficiency arbiter write requests", "0-31"
|
||||
"``TCC_EA_WRREQ_LEVEL[n]``", "Req", "The accumulated number of efficiency arbiter write requests in flight", "0-31"
|
||||
"``TCC_EA_ATOMIC[n]``", "Req", "Number of 32-byte or 64-byte atomic requests going over the ``TC_EA_wrreq`` interface", "0-31"
|
||||
"``TCC_EA_ATOMIC_LEVEL[n]``", "Req", "The accumulated number of efficiency arbiter atomic requests in flight", "0-31"
|
||||
"``TCC_EA_RDREQ[n]``", "Req", "Number of 32-byte or 64-byte read requests to efficiency arbiter", "0-31"
|
||||
"``TCC_EA_RDREQ_32B[n]``", "Req", "Number of 32-byte read requests to efficiency arbiter", "0-31"
|
||||
"``TCC_EA_RD_UNCACHED_32B[n]``", "Req", "Number of 32-byte efficiency arbiter reads due to uncached traffic. A 64-byte request is counted as 2", "0-31"
|
||||
"``TCC_EA_RDREQ_IO_CREDIT_STALL[n]``", "Cycles", "Number of cycles there is a stall due to the read request interface running out of IO credits", "0-31"
|
||||
"``TCC_EA_RDREQ_GMI_CREDIT_STALL[n]``", "Cycles", "Number of cycles there is a stall due to the read request interface running out of GMI credits", "0-31"
|
||||
"``TCC_EA_RDREQ_DRAM_CREDIT_STALL[n]``", "Cycles", "Number of cycles there is a stall due to the read request interface running out of DRAM credits", "0-31"
|
||||
"``TCC_EA_RDREQ_LEVEL[n]``", "Req", "The accumulated number of efficiency arbiter read requests in flight", "0-31"
|
||||
"``TCC_EA_RDREQ_DRAM[n]``", "Req", "Number of 32-byte or 64-byte efficiency arbiter read requests to High Bandwidth Memory (HBM)", "0-31"
|
||||
"``TCC_EA_WRREQ_DRAM[n]``", "Req", "Number of 32-byte or 64-byte efficiency arbiter write requests to HBM", "0-31"
|
||||
"``TCC_TAG_STALL[n]``", "Cycles", "Number of cycles the normal request pipeline in the tag is stalled for any reason", "0-31"
|
||||
"``TCC_NORMAL_WRITEBACK[n]``", "Req", "Number of writebacks due to requests that are not writeback requests", "0-31"
|
||||
"``TCC_ALL_TC_OP_WB_WRITEBACK[n]``", "Req", "Number of writebacks due to all ``TC_OP`` writeback requests", "0-31"
|
||||
"``TCC_NORMAL_EVICT[n]``", "Req", "Number of evictions due to requests that are not invalidate or probe requests", "0-31"
|
||||
"``TCC_ALL_TC_OP_INV_EVICT[n]``", "Req", "Number of evictions due to all ``TC_OP`` invalidate requests", "0-31"
|
||||
|
||||
Note the following:

* ``TCC_REQ[n]`` may be higher than the number of requests arriving at the texture cache per channel,
  but it is a good indication of the total amount of work that needs to be performed.
* For ``TCC_EA0_WRREQ[n]``, atomics may travel over the same interface and are generally classified as
  write requests.
* CC mtypes can produce uncached requests, and those are included in ``TCC_EA0_WR_UNCACHED_32B[n]``.
* ``TCC_EA0_WRREQ_LEVEL[n]`` is primarily intended to measure average efficiency arbiter write latency.

  * Average write latency = ``TCC_PERF_SEL_EA0_WRREQ_LEVEL`` divided by ``TCC_PERF_SEL_EA0_WRREQ``.

* ``TCC_EA0_ATOMIC_LEVEL[n]`` is primarily intended to measure average efficiency arbiter atomic latency.

  * Average atomic latency = ``TCC_PERF_SEL_EA0_WRREQ_ATOMIC_LEVEL`` divided by ``TCC_PERF_SEL_EA0_WRREQ_ATOMIC``.

* ``TCC_EA0_RDREQ_LEVEL[n]`` is primarily intended to measure average efficiency arbiter read latency.

  * Average read latency = ``TCC_PERF_SEL_EA0_RDREQ_LEVEL`` divided by ``TCC_PERF_SEL_EA0_RDREQ``.
  * Stalls can occur regardless of whether a read needs to be performed.

* Normally, stalls are measured at exactly one point in the pipeline. However, in the case of
  ``TCC_TAG_STALL[n]``, probes can stall the pipeline at a variety of places, so there is no single point
  that can accurately measure the total stalls.
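
As a minimal sketch of how these ratios are typically evaluated (the counter values below are placeholders, not measured data), the average latencies follow directly from the level and request counters collected for the same kernel:

.. code-block:: python

   # Minimal sketch: average efficiency arbiter latencies from collected counters.
   # Placeholder values; substitute the counter readings from your profiling run.
   wrreq_level = 1_200_000   # accumulated write requests in flight (WRREQ_LEVEL)
   wrreq = 40_000            # number of write requests (WRREQ)
   rdreq_level = 900_000     # accumulated read requests in flight (RDREQ_LEVEL)
   rdreq = 30_000            # number of read requests (RDREQ)

   def average_latency(level: int, requests: int) -> float:
       """Average time each request spends in flight, in cycles."""
       return level / requests if requests else 0.0

   print(f"average write latency: {average_latency(wrreq_level, wrreq):.1f} cycles")
   print(f"average read latency:  {average_latency(rdreq_level, rdreq):.1f} cycles")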
MI300 and MI200 series derived metrics list
|
||||
==============================================================
|
||||
|
||||
.. csv-table::
|
||||
:header: "Hardware counter", "Definition"
|
||||
|
||||
"``ALUStalledByLDS``", "Percentage of GPU time ALU units are stalled due to the LDS input queue being full or the output queue not being ready (value range: 0% (optimal) to 100%)"
|
||||
"``FetchSize``", "Total kilobytes fetched from the video memory; measured with all extra fetches and any cache or memory effects taken into account"
|
||||
"``FlatLDSInsts``", "Average number of flat instructions that read from or write to LDS, run per work item (affected by flow control)"
|
||||
"``FlatVMemInsts``", "Average number of flat instructions that read from or write to the video memory, run per work item (affected by flow control). Includes flat instructions that read from or write to scratch"
|
||||
"``GDSInsts``", "Average number of global data share read or write instructions run per work item (affected by flow control)"
|
||||
"``GPUBusy``", "Percentage of time GPU is busy"
|
||||
"``L2CacheHit``", "Percentage of fetch, write, atomic, and other instructions that hit the data in L2 cache (value range: 0% (no hit) to 100% (optimal))"
|
||||
"``LDSBankConflict``", "Percentage of GPU time LDS is stalled by bank conflicts (value range: 0% (optimal) to 100%)"
|
||||
"``LDSInsts``", "Average number of LDS read or write instructions run per work item (affected by flow control). Excludes flat instructions that read from or write to LDS."
|
||||
"``MemUnitBusy``", "Percentage of GPU time the memory unit is active, which is measured with all extra fetches and writes and any cache or memory effects taken into account (value range: 0% to 100% (fetch-bound))"
|
||||
"``MemUnitStalled``", "Percentage of GPU time the memory unit is stalled (value range: 0% (optimal) to 100%)"
|
||||
"``MemWrites32B``", "Total number of effective 32B write transactions to the memory"
|
||||
"``TCA_BUSY_sum``", "Total number of cycles texture cache arbiter has a pending request, over all texture cache arbiter instances"
|
||||
"``TCA_CYCLE_sum``", "Total number of cycles over all texture cache arbiter instances"
|
||||
"``SALUBusy``", "Percentage of GPU time scalar ALU instructions are processed (value range: 0% to 100% (optimal))"
|
||||
"``SALUInsts``", "Average number of scalar ALU instructions run per work item (affected by flow control)"
|
||||
"``SFetchInsts``", "Average number of scalar fetch instructions from the video memory run per work item (affected by flow control)"
|
||||
"``VALUBusy``", "Percentage of GPU time vector ALU instructions are processed (value range: 0% to 100% (optimal))"
|
||||
"``VALUInsts``", "Average number of vector ALU instructions run per work item (affected by flow control)"
|
||||
"``VALUUtilization``", "Percentage of active vector ALU threads in a wave, where a lower number can mean either more thread divergence in a wave or that the work-group size is not a multiple of 64 (value range: 0%, 100% (optimal - no thread divergence))"
|
||||
"``VFetchInsts``", "Average number of vector fetch instructions from the video memory run per work-item (affected by flow control); excludes flat instructions that fetch from video memory"
|
||||
"``VWriteInsts``", "Average number of vector write instructions to the video memory run per work-item (affected by flow control); excludes flat instructions that write to video memory"
|
||||
"``Wavefronts``", "Total wavefronts"
|
||||
"``WRITE_REQ_32B``", "Total number of 32-byte effective memory writes"
|
||||
"``WriteSize``", "Total kilobytes written to the video memory; measured with all extra fetches and any cache or memory effects taken into account"
|
||||
"``WriteUnitStalled``", "Percentage of GPU time the write unit is stalled (value range: 0% (optimal) to 100%)"
|
||||
|
||||
You can lower ``ALUStalledByLDS`` by reducing LDS bank conflicts or the number of LDS accesses.
You can lower ``MemUnitStalled`` by reducing the number or size of fetches and writes.
``MemUnitBusy`` includes the stall time (``MemUnitStalled``).
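
As a small illustration of how these percentages relate (the values below are placeholders, not measurements), the stall time can be separated from the useful memory-unit activity:

.. code-block:: python

   # Minimal sketch: split memory-unit activity into useful and stalled time.
   # Both metrics are percentages of GPU time; placeholder values used here.
   mem_unit_busy = 72.0      # MemUnitBusy, includes the stall time
   mem_unit_stalled = 18.0   # MemUnitStalled

   useful = mem_unit_busy - mem_unit_stalled
   print(f"memory unit active and not stalled: {useful:.1f}% of GPU time")
   print(f"memory unit stalled:                {mem_unit_stalled:.1f}% of GPU time")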
|
||||
Hardware counters by and over all texture addressing unit instances
|
||||
---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
The following table shows the hardware counters *by* all texture addressing unit instances.
|
||||
|
||||
.. csv-table::
|
||||
:header: "Hardware counter", "Definition"
|
||||
|
||||
"``TA_BUFFER_WAVEFRONTS_sum``", "Total number of buffer wavefronts processed"
|
||||
"``TA_BUFFER_READ_WAVEFRONTS_sum``", "Total number of buffer read wavefronts processed"
|
||||
"``TA_BUFFER_WRITE_WAVEFRONTS_sum``", "Total number of buffer write wavefronts processed"
|
||||
"``TA_BUFFER_ATOMIC_WAVEFRONTS_sum``", "Total number of buffer atomic wavefronts processed"
|
||||
"``TA_BUFFER_TOTAL_CYCLES_sum``", "Total number of buffer cycles (including read and write) issued to texture cache"
|
||||
"``TA_BUFFER_COALESCED_READ_CYCLES_sum``", "Total number of coalesced buffer read cycles issued to texture cache"
|
||||
"``TA_BUFFER_COALESCED_WRITE_CYCLES_sum``", "Total number of coalesced buffer write cycles issued to texture cache"
|
||||
"``TA_FLAT_READ_WAVEFRONTS_sum``", "Sum of flat opcode reads processed"
|
||||
"``TA_FLAT_WRITE_WAVEFRONTS_sum``", "Sum of flat opcode writes processed"
|
||||
"``TA_FLAT_WAVEFRONTS_sum``", "Total number of flat opcode wavefronts processed"
|
||||
"``TA_FLAT_ATOMIC_WAVEFRONTS_sum``", "Total number of flat opcode atomic wavefronts processed"
|
||||
"``TA_TOTAL_WAVEFRONTS_sum``", "Total number of wavefronts processed"
|
||||
|
||||
The following table shows the hardware counters *over* all texture addressing unit instances.
|
||||
|
||||
.. csv-table::
|
||||
:header: "Hardware counter", "Definition"
|
||||
|
||||
"``TA_ADDR_STALLED_BY_TC_CYCLES_sum``", "Total number of cycles texture addressing unit address path is stalled by texture cache"
|
||||
"``TA_ADDR_STALLED_BY_TD_CYCLES_sum``", "Total number of cycles texture addressing unit address path is stalled by texture data unit"
|
||||
"``TA_BUSY_avr``", "Average number of busy cycles"
|
||||
"``TA_BUSY_max``", "Maximum number of texture addressing unit busy cycles"
|
||||
"``TA_BUSY_min``", "Minimum number of texture addressing unit busy cycles"
|
||||
"``TA_DATA_STALLED_BY_TC_CYCLES_sum``", "Total number of cycles texture addressing unit data path is stalled by texture cache"
|
||||
"``TA_TA_BUSY_sum``", "Total number of texture addressing unit busy cycles"
|
||||
|
||||
Hardware counters over all texture cache per channel instances
|
||||
---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
.. csv-table::
|
||||
:header: "Hardware counter", "Definition"
|
||||
|
||||
"``TCC_ALL_TC_OP_WB_WRITEBACK_sum``", "Total number of writebacks due to all ``TC_OP`` writeback requests."
|
||||
"``TCC_ALL_TC_OP_INV_EVICT_sum``", "Total number of evictions due to all ``TC_OP`` invalidate requests."
|
||||
"``TCC_ATOMIC_sum``", "Total number of L2 cache atomic requests of all types."
|
||||
"``TCC_BUSY_avr``", "Average number of L2 cache busy cycles."
|
||||
"``TCC_BUSY_sum``", "Total number of L2 cache busy cycles."
|
||||
"``TCC_CC_REQ_sum``", "Total number of coherently cached requests."
|
||||
"``TCC_CYCLE_sum``", "Total number of L2 cache free running clocks."
|
||||
"``TCC_EA0_WRREQ_sum``", "Total number of 32-byte and 64-byte transactions going over the ``TC_EA0_wrreq`` interface. Atomics may travel over the same interface and are generally classified as write requests. This does not include probe commands."
|
||||
"``TCC_EA0_WRREQ_64B_sum``", "Total number of 64-byte transactions (write or `CMPSWAP`) going over the ``TC_EA0_wrreq`` interface."
|
||||
"``TCC_EA0_WR_UNCACHED_32B_sum``", "Total Number of 32-byte write or atomic going over the ``TC_EA0_wrreq`` interface due to uncached traffic. Note that coherently cached mtypes can produce uncached requests, and those are included in this. A 64-byte request is counted as 2."
|
||||
"``TCC_EA0_WRREQ_STALL_sum``", "Total Number of cycles a write request is stalled, over all instances."
|
||||
"``TCC_EA0_WRREQ_IO_CREDIT_STALL_sum``", "Total number of cycles an efficiency arbiter write request is stalled due to the interface running out of IO credits, over all instances."
|
||||
"``TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum``", "Total number of cycles an efficiency arbiter write request is stalled due to the interface running out of GMI credits, over all instances."
|
||||
"``TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum``", "Total number of cycles an efficiency arbiter write request is stalled due to the interface running out of DRAM credits, over all instances."
|
||||
"``TCC_EA0_WRREQ_LEVEL_sum``", "Total number of efficiency arbiter write requests in flight."
|
||||
"``TCC_EA0_RDREQ_LEVEL_sum``", "Total number of efficiency arbiter read requests in flight."
|
||||
"``TCC_EA0_ATOMIC_sum``", "Total Number of 32-byte or 64-byte atomic requests going over the ``TC_EA0_wrreq`` interface."
|
||||
"``TCC_EA0_ATOMIC_LEVEL_sum``", "Total number of efficiency arbiter atomic requests in flight."
|
||||
"``TCC_EA0_RDREQ_sum``", "Total number of 32-byte or 64-byte read requests to efficiency arbiter."
|
||||
"``TCC_EA0_RDREQ_32B_sum``", "Total number of 32-byte read requests to efficiency arbiter."
|
||||
"``TCC_EA0_RD_UNCACHED_32B_sum``", "Total number of 32-byte efficiency arbiter reads due to uncached traffic."
|
||||
"``TCC_EA0_RDREQ_IO_CREDIT_STALL_sum``", "Total number of cycles there is a stall due to the read request interface running out of IO credits."
|
||||
"``TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum``", "Total number of cycles there is a stall due to the read request interface running out of GMI credits."
|
||||
"``TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum``", "Total number of cycles there is a stall due to the read request interface running out of DRAM credits."
|
||||
"``TCC_EA0_RDREQ_DRAM_sum``", "Total number of 32-byte or 64-byte efficiency arbiter read requests to HBM."
|
||||
"``TCC_EA0_WRREQ_DRAM_sum``", "Total number of 32-byte or 64-byte efficiency arbiter write requests to HBM."
|
||||
"``TCC_HIT_sum``", "Total number of L2 cache hits."
|
||||
"``TCC_MISS_sum``", "Total number of L2 cache misses."
|
||||
"``TCC_NC_REQ_sum``", "Total number of non-coherently cached requests."
|
||||
"``TCC_NORMAL_WRITEBACK_sum``", "Total number of writebacks due to requests that are not writeback requests."
|
||||
"``TCC_NORMAL_EVICT_sum``", "Total number of evictions due to requests that are not invalidate or probe requests."
|
||||
"``TCC_PROBE_sum``", "Total number of probe requests."
|
||||
"``TCC_PROBE_ALL_sum``", "Total number of external probe requests with ``EA0_TCC_preq_all == 1``."
|
||||
"``TCC_READ_sum``", "Total number of L2 cache read requests (including compressed reads but not metadata reads)."
|
||||
"``TCC_REQ_sum``", "Total number of all types of L2 cache requests."
|
||||
"``TCC_RW_REQ_sum``", "Total number of coherently cached with write requests."
|
||||
"``TCC_STREAMING_REQ_sum``", "Total number of L2 cache streaming requests."
|
||||
"``TCC_TAG_STALL_sum``", "Total number of cycles the normal request pipeline in the tag is stalled for any reason."
|
||||
"``TCC_TOO_MANY_EA0_WRREQS_STALL_sum``", "Total number of cycles L2 cache is unable to send an efficiency arbiter write request due to it reaching its maximum capacity of pending efficiency arbiter write requests."
|
||||
"``TCC_UC_REQ_sum``", "Total number of uncached requests."
|
||||
"``TCC_WRITE_sum``", "Total number of L2 cache write requests."
|
||||
"``TCC_WRITEBACK_sum``", "Total number of lines written back to the main memory including writebacks of dirty lines and uncached write or atomic requests."
|
||||
"``TCC_WRREQ_STALL_max``", "Maximum number of cycles a write request is stalled."
|
||||
|
||||
Hardware counters by, for, or over all texture cache per pipe instances
|
||||
----------------------------------------------------------------------------------------------------------------
|
||||
|
||||
The following table shows the hardware counters *by* all texture cache per pipe instances.
|
||||
|
||||
.. csv-table::
|
||||
:header: "Hardware counter", "Definition"
|
||||
|
||||
"``TCP_TA_TCP_STATE_READ_sum``", "Total number of state reads by ATCPPI"
|
||||
"``TCP_TOTAL_CACHE_ACCESSES_sum``", "Total number of vector L1d accesses (including hits and misses)"
|
||||
"``TCP_UTCL1_PERMISSION_MISS_sum``", "Total number of unified translation cache (L1) permission misses"
|
||||
"``TCP_UTCL1_REQUEST_sum``", "Total number of address translation requests to unified translation cache (L1)"
|
||||
"``TCP_UTCL1_TRANSLATION_MISS_sum``", "Total number of unified translation cache (L1) translation misses"
|
||||
"``TCP_UTCL1_TRANSLATION_HIT_sum``", "Total number of unified translation cache (L1) translation hits"
|
||||
|
||||
The following table shows the hardware counters *for* all texture cache per pipe instances.
|
||||
|
||||
.. csv-table::
|
||||
:header: "Hardware counter", "Definition"
|
||||
|
||||
"``TCP_TCC_READ_REQ_LATENCY_sum``", "Total vector L1d to L2 request latency over all wavefronts for reads and atomics with return"
|
||||
"``TCP_TCC_WRITE_REQ_LATENCY_sum``", "Total vector L1d to L2 request latency over all wavefronts for writes and atomics without return"
|
||||
"``TCP_TCP_LATENCY_sum``", "Total wave access latency to vector L1d over all wavefronts"
|
||||
|
||||
The following table shows the hardware counters *over* all texture cache per pipe instances.
|
||||
|
||||
.. csv-table::
|
||||
:header: "Hardware counter", "Definition"
|
||||
|
||||
"``TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum``", "Total number of cycles tag RAM conflict stalls on an atomic"
|
||||
"``TCP_GATE_EN1_sum``", "Total number of cycles vector L1d interface clocks are turned on"
|
||||
"``TCP_GATE_EN2_sum``", "Total number of cycles vector L1d core clocks are turned on"
|
||||
"``TCP_PENDING_STALL_CYCLES_sum``", "Total number of cycles vector L1d cache is stalled due to data pending from L2 Cache"
|
||||
"``TCP_READ_TAGCONFLICT_STALL_CYCLES_sum``", "Total number of cycles tag RAM conflict stalls on a read"
|
||||
"``TCP_TCC_ATOMIC_WITH_RET_REQ_sum``", "Total number of atomic requests to L2 cache with return"
|
||||
"``TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum``", "Total number of atomic requests to L2 cache without return"
|
||||
"``TCP_TCC_CC_READ_REQ_sum``", "Total number of coherently cached read requests to L2 cache"
|
||||
"``TCP_TCC_CC_WRITE_REQ_sum``", "Total number of coherently cached write requests to L2 cache"
|
||||
"``TCP_TCC_CC_ATOMIC_REQ_sum``", "Total number of coherently cached atomic requests to L2 cache"
|
||||
"``TCP_TCC_NC_READ_REQ_sum``", "Total number of non-coherently cached read requests to L2 cache"
|
||||
"``TCP_TCC_NC_WRITE_REQ_sum``", "Total number of non-coherently cached write requests to L2 cache"
|
||||
"``TCP_TCC_NC_ATOMIC_REQ_sum``", "Total number of non-coherently cached atomic requests to L2 cache"
|
||||
"``TCP_TCC_READ_REQ_sum``", "Total number of read requests to L2 cache"
|
||||
"``TCP_TCC_RW_READ_REQ_sum``", "Total number of coherently cached with write read requests to L2 cache"
|
||||
"``TCP_TCC_RW_WRITE_REQ_sum``", "Total number of coherently cached with write write requests to L2 cache"
|
||||
"``TCP_TCC_RW_ATOMIC_REQ_sum``", "Total number of coherently cached with write atomic requests to L2 cache"
|
||||
"``TCP_TCC_UC_READ_REQ_sum``", "Total number of uncached read requests to L2 cache"
|
||||
"``TCP_TCC_UC_WRITE_REQ_sum``", "Total number of uncached write requests to L2 cache"
|
||||
"``TCP_TCC_UC_ATOMIC_REQ_sum``", "Total number of uncached atomic requests to L2 cache"
|
||||
"``TCP_TCC_WRITE_REQ_sum``", "Total number of write requests to L2 cache"
|
||||
"``TCP_TCR_TCP_STALL_CYCLES_sum``", "Total number of cycles texture cache router stalls vector L1d"
|
||||
"``TCP_TD_TCP_STALL_CYCLES_sum``", "Total number of cycles texture data unit stalls vector L1d"
|
||||
"``TCP_TOTAL_ACCESSES_sum``", "Total number of vector L1d accesses"
|
||||
"``TCP_TOTAL_READ_sum``", "Total number of vector L1d read accesses"
|
||||
"``TCP_TOTAL_WRITE_sum``", "Total number of vector L1d write accesses"
|
||||
"``TCP_TOTAL_ATOMIC_WITH_RET_sum``", "Total number of vector L1d atomic requests with return"
|
||||
"``TCP_TOTAL_ATOMIC_WITHOUT_RET_sum``", "Total number of vector L1d atomic requests without return"
|
||||
"``TCP_TOTAL_WRITEBACK_INVALIDATES_sum``", "Total number of vector L1d writebacks and invalidates"
|
||||
"``TCP_VOLATILE_sum``", "Total number of L1 volatile pixels or buffers from texture addressing unit"
|
||||
"``TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum``", "Total number of cycles tag RAM conflict stalls on a write"
|
||||
|
||||
Hardware counter over all texture data unit instances
|
||||
--------------------------------------------------------
|
||||
|
||||
.. csv-table::
|
||||
:header: "Hardware counter", "Definition"
|
||||
|
||||
"``TD_ATOMIC_WAVEFRONT_sum``", "Total number of atomic wavefront instructions"
|
||||
"``TD_COALESCABLE_WAVEFRONT_sum``", "Total number of coalescable wavefronts according to texture addressing unit"
|
||||
"``TD_LOAD_WAVEFRONT_sum``", "Total number of wavefront instructions (read, write, atomic)"
|
||||
"``TD_SPI_STALL_sum``", "Total number of cycles texture data unit is stalled by shader processor input"
|
||||
"``TD_STORE_WAVEFRONT_sum``", "Total number of write wavefront instructions"
|
||||
"``TD_TC_STALL_sum``", "Total number of cycles texture data unit is stalled waiting for texture cache data"
|
||||
"``TD_TD_BUSY_sum``", "Total number of texture data unit busy cycles while it is processing or waiting for data"
|
||||
@@ -1,129 +0,0 @@
|
||||
---
|
||||
myst:
|
||||
html_meta:
|
||||
"description lang=en": "Learn about the AMD Instinct MI300 series architecture."
|
||||
"keywords": "Instinct, MI300X, MI300A, microarchitecture, AMD, ROCm"
|
||||
---
|
||||
|
||||
# AMD Instinct™ MI300 series microarchitecture
|
||||
|
||||
The AMD Instinct MI300 series accelerators are based on the AMD CDNA 3 architecture, which is designed to deliver leadership performance for HPC, artificial intelligence (AI), and machine learning (ML) workloads. The MI300 series accelerators are well suited for extreme scalability and compute performance, running on everything from individual servers to the world’s largest exascale supercomputers.
|
||||
|
||||
With the MI300 series, AMD is introducing the Accelerator Complex Die (XCD), which contains the
|
||||
GPU computational elements of the processor along with the lower levels of the cache hierarchy.
|
||||
|
||||
The following image depicts the structure of a single XCD in the AMD Instinct MI300 accelerator series.
|
||||
|
||||
```{figure} ../../data/shared/xcd-sys-arch.png
|
||||
---
|
||||
name: mi300-xcd
|
||||
align: center
|
||||
---
|
||||
XCD-level system architecture showing 40 Compute Units, each with 32 KB of L1 cache, a Unified Compute System with 4 ACE Compute Accelerators, a shared 4 MB L2 cache, and an HWS Hardware Scheduler.
|
||||
```
|
||||
|
||||
On the XCD, four Asynchronous Compute Engines (ACEs) send compute shader workgroups to the
|
||||
Compute Units (CUs). The XCD has 40 CUs: 38 active CUs at the aggregate level and 2 disabled CUs for
|
||||
yield management. The CUs all share a 4 MB L2 cache that serves to coalesce all memory traffic for the
|
||||
die. With less than half of the CUs of the AMD Instinct MI200 Series compute die, the AMD CDNA™ 3
|
||||
XCD die is a smaller building block. However, it uses more advanced packaging and the processor
|
||||
can include 6 or 8 XCDs for up to 304 CUs, roughly 40% more than MI250X.
|
||||
|
||||
The MI300 series integrates up to 8 vertically stacked XCDs, 8 stacks of High Bandwidth Memory 3 (HBM3), and 4 I/O dies (containing the system infrastructure), using AMD Infinity Fabric™ technology as the interconnect.
|
||||
|
||||
The Matrix Cores inside the CDNA 3 CUs have significant improvements, emphasizing AI and machine
|
||||
learning, enhancing throughput of existing data types while adding support for new data types.
|
||||
CDNA 2 Matrix Cores support FP16 and BF16, while offering INT8 for inference. Compared to MI250X
|
||||
accelerators, CDNA 3 Matrix Cores triple the performance for FP16 and BF16, while providing a
|
||||
performance gain of 6.8 times for INT8. FP8 has a performance gain of 16 times compared to FP32,
|
||||
while TF32 has a gain of 4 times compared to FP32.
|
||||
|
||||
```{list-table} Peak-performance capabilities of the MI300X for different data types.
|
||||
:header-rows: 1
|
||||
:name: mi300x-perf-table
|
||||
|
||||
*
|
||||
- Computation and Data Type
|
||||
- FLOPS/CLOCK/CU
|
||||
- Peak TFLOPS
|
||||
*
|
||||
- Matrix FP64
|
||||
- 256
|
||||
- 163.4
|
||||
*
|
||||
- Vector FP64
|
||||
- 128
|
||||
- 81.7
|
||||
*
|
||||
- Matrix FP32
|
||||
- 256
|
||||
- 163.4
|
||||
*
|
||||
- Vector FP32
|
||||
- 256
|
||||
- 163.4
|
||||
*
|
||||
- Vector TF32
|
||||
- 1024
|
||||
- 653.7
|
||||
*
|
||||
- Matrix FP16
|
||||
- 2048
|
||||
- 1307.4
|
||||
*
|
||||
- Matrix BF16
|
||||
- 2048
|
||||
- 1307.4
|
||||
*
|
||||
- Matrix FP8
|
||||
- 4096
|
||||
- 2614.9
|
||||
*
|
||||
- Matrix INT8
|
||||
- 4096
|
||||
- 2614.9
|
||||
```
|
||||
|
||||
The preceding table summarizes the aggregated peak performance of the AMD Instinct MI300X Open Compute Platform (OCP) Open Accelerator Modules (OAMs) for different data types and command processors. The middle column lists the peak throughput of a single compute unit (the number of data elements processed per instruction) when a SIMD (or matrix) instruction is submitted in each clock cycle. The third column lists the theoretical peak performance of the whole OAM. The theoretical aggregated peak memory bandwidth of the GPU is 5.3 TB per second.
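
The peak numbers follow from the per-CU throughput once the CU count and the engine clock are fixed. The following sketch reproduces the TFLOPS column assuming 304 active CUs and the roughly 2.1 GHz peak engine clock implied by the table values; it is an illustration, not an official formula.

```{code-block} python
:caption: Sketch reproducing the peak-TFLOPS column (304 CUs and a 2.1 GHz peak engine clock assumed).
cus = 304            # active CUs on an MI300X OAM
clock_hz = 2.1e9     # peak engine clock implied by the table values (assumption)

flops_per_clock_per_cu = {
    "Matrix FP64": 256,
    "Vector FP32": 256,
    "Matrix FP16": 2048,
    "Matrix FP8": 4096,
}

for dtype, per_cu in flops_per_clock_per_cu.items():
    peak_tflops = per_cu * cus * clock_hz / 1e12
    print(f"{dtype}: {peak_tflops:.1f} TFLOPS")
```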
|
||||
|
||||
The following image shows the block diagram of the APU (left) and the OAM package (right) both
|
||||
connected via AMD Infinity Fabric™ network on-chip.
|
||||
|
||||
```{figure} ../../data/conceptual/gpu-arch/image008.png
|
||||
---
|
||||
name: mi300-arch
|
||||
alt:
|
||||
align: center
|
||||
---
|
||||
MI300 series system architecture. The MI300A (left) has 6 XCDs and 3 CCDs, while the MI300X (right) has 8 XCDs.
|
||||
```
|
||||
|
||||
## Node-level architecture
|
||||
|
||||
```{figure} ../../data/shared/mi300-node-level-arch.png
|
||||
---
|
||||
name: mi300-node
|
||||
|
||||
align: center
|
||||
---
|
||||
MI300 series node-level architecture showing 8 fully interconnected MI300X OAM modules connected to (optional) PCIe switches via retimers and HGX connectors.
|
||||
```
|
||||
|
||||
The image above shows the node-level architecture of a system with AMD EPYC processors in a dual-socket configuration and eight AMD Instinct MI300X accelerators. The MI300X OAMs attach to the host system via PCIe Gen 5 x16 links (yellow lines). The GPUs use seven high-bandwidth, low-latency AMD Infinity Fabric™ links (red lines) to form a fully connected 8-GPU system.
|
||||
|
||||
<!---
|
||||
We need performance data about the P2P communication here.
|
||||
-->
|
||||
@@ -1,530 +0,0 @@
|
||||
.. meta::
|
||||
:description: MI355 series performance counters and metrics
|
||||
:keywords: MI355, MI355X, MI3XX
|
||||
|
||||
***********************************
|
||||
MI350 series performance counters
|
||||
***********************************
|
||||
|
||||
This topic lists and describes the hardware performance counters and derived metrics available on the AMD Instinct MI350 and MI355 accelerators. These counters are available for profiling using `ROCprofiler-SDK <https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/index.html>`_ and `ROCm Compute Profiler <https://rocm.docs.amd.com/projects/rocprofiler-compute/en/latest/>`_.
|
||||
|
||||
The following sections list the performance counters based on the IP blocks.
|
||||
|
||||
Command processor packet processor counters (CPC)
|
||||
==================================================
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Hardware counter
|
||||
- Definition
|
||||
|
||||
* - CPC_ALWAYS_COUNT
|
||||
- Always count.
|
||||
|
||||
* - CPC_ADC_VALID_CHUNK_NOT_AVAIL
|
||||
- ADC valid chunk is not available when dispatch walking is in progress in the multi-xcc mode.
|
||||
|
||||
* - CPC_ADC_DISPATCH_ALLOC_DONE
|
||||
- ADC dispatch allocation is done.
|
||||
|
||||
* - CPC_ADC_VALID_CHUNK_END
|
||||
- ADC crawler's valid chunk end in the multi-xcc mode.
|
||||
|
||||
* - CPC_SYNC_FIFO_FULL_LEVEL
|
||||
- SYNC FIFO full last cycles.
|
||||
|
||||
* - CPC_SYNC_FIFO_FULL
|
||||
- SYNC FIFO full times.
|
||||
|
||||
* - CPC_GD_BUSY
|
||||
- ADC busy.
|
||||
|
||||
* - CPC_TG_SEND
|
||||
- ADC thread group send.
|
||||
|
||||
* - CPC_WALK_NEXT_CHUNK
|
||||
- ADC walking next valid chunk in the multi-xcc mode.
|
||||
|
||||
* - CPC_STALLED_BY_SE0_SPI
|
||||
- ADC CSDATA stalled by SE0SPI.
|
||||
|
||||
* - CPC_STALLED_BY_SE1_SPI
|
||||
- ADC CSDATA stalled by SE1SPI.
|
||||
|
||||
* - CPC_STALLED_BY_SE2_SPI
|
||||
- ADC CSDATA stalled by SE2SPI.
|
||||
|
||||
* - CPC_STALLED_BY_SE3_SPI
|
||||
- ADC CSDATA stalled by SE3SPI.
|
||||
|
||||
* - CPC_LTE_ALL
|
||||
- CPC sync counter LteAll. Only Master XCD manages LteAll.
|
||||
|
||||
* - CPC_SYNC_WRREQ_FIFO_BUSY
|
||||
- CPC sync counter request FIFO is not empty.
|
||||
|
||||
* - CPC_CANE_BUSY
|
||||
- CPC CANE bus is busy, which indicates the presence of inflight sync counter requests.
|
||||
|
||||
* - CPC_CANE_STALL
|
||||
- CPC sync counter sending is stalled by CANE.
|
||||
|
||||
Shader pipe interpolators (SPI) counters
|
||||
=========================================
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Hardware counter
|
||||
- Definition
|
||||
|
||||
* - SPI_CS0_WINDOW_VALID
|
||||
- Clock count enabled by PIPE0 perfcounter_start event.
|
||||
|
||||
* - SPI_CS0_BUSY
|
||||
- Number of clocks with outstanding waves for PIPE0 (SPI or SH).
|
||||
|
||||
* - SPI_CS0_NUM_THREADGROUPS
|
||||
- Number of thread groups launched for PIPE0.
|
||||
|
||||
* - SPI_CS0_CRAWLER_STALL
|
||||
- Number of clocks when PIPE0 event or wave order FIFO is full.
|
||||
|
||||
* - SPI_CS0_EVENT_WAVE
|
||||
- Number of PIPE0 events and waves.
|
||||
|
||||
* - SPI_CS0_WAVE
|
||||
- Number of PIPE0 waves.
|
||||
|
||||
* - SPI_CS1_WINDOW_VALID
|
||||
- Clock count enabled by PIPE1 perfcounter_start event.
|
||||
|
||||
* - SPI_CS1_BUSY
|
||||
- Number of clocks with outstanding waves for PIPE1 (SPI or SH).
|
||||
|
||||
* - SPI_CS1_NUM_THREADGROUPS
|
||||
- Number of thread groups launched for PIPE1.
|
||||
|
||||
* - SPI_CS1_CRAWLER_STALL
|
||||
- Number of clocks when PIPE1 event or wave order FIFO is full.
|
||||
|
||||
* - SPI_CS1_EVENT_WAVE
|
||||
- Number of PIPE1 events and waves.
|
||||
|
||||
* - SPI_CS1_WAVE
|
||||
- Number of PIPE1 waves.
|
||||
|
||||
* - SPI_CS2_WINDOW_VALID
|
||||
- Clock count enabled by PIPE2 perfcounter_start event.
|
||||
|
||||
* - SPI_CS2_BUSY
|
||||
- Number of clocks with outstanding waves for PIPE2 (SPI or SH).
|
||||
|
||||
* - SPI_CS2_NUM_THREADGROUPS
|
||||
- Number of thread groups launched for PIPE2.
|
||||
|
||||
* - SPI_CS2_CRAWLER_STALL
|
||||
- Number of clocks when PIPE2 event or wave order FIFO is full.
|
||||
|
||||
* - SPI_CS2_EVENT_WAVE
|
||||
- Number of PIPE2 events and waves.
|
||||
|
||||
* - SPI_CS2_WAVE
|
||||
- Number of PIPE2 waves.
|
||||
|
||||
* - SPI_CS3_WINDOW_VALID
|
||||
- Clock count enabled by PIPE3 perfcounter_start event.
|
||||
|
||||
* - SPI_CS3_BUSY
|
||||
- Number of clocks with outstanding waves for PIPE3 (SPI or SH).
|
||||
|
||||
* - SPI_CS3_NUM_THREADGROUPS
|
||||
- Number of thread groups launched for PIPE3.
|
||||
|
||||
* - SPI_CS3_CRAWLER_STALL
|
||||
- Number of clocks when PIPE3 event or wave order FIFO is full.
|
||||
|
||||
* - SPI_CS3_EVENT_WAVE
|
||||
- Number of PIPE3 events and waves.
|
||||
|
||||
* - SPI_CS3_WAVE
|
||||
- Number of PIPE3 waves.
|
||||
|
||||
* - SPI_CSQ_P0_Q0_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE0 Queue0.
|
||||
|
||||
* - SPI_CSQ_P0_Q1_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE0 Queue1.
|
||||
|
||||
* - SPI_CSQ_P0_Q2_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE0 Queue2.
|
||||
|
||||
* - SPI_CSQ_P0_Q3_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE0 Queue3.
|
||||
|
||||
* - SPI_CSQ_P0_Q4_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE0 Queue4.
|
||||
|
||||
* - SPI_CSQ_P0_Q5_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE0 Queue5.
|
||||
|
||||
* - SPI_CSQ_P0_Q6_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE0 Queue6.
|
||||
|
||||
* - SPI_CSQ_P0_Q7_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE0 Queue7.
|
||||
|
||||
* - SPI_CSQ_P1_Q0_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE1 Queue0.
|
||||
|
||||
* - SPI_CSQ_P1_Q1_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE1 Queue1.
|
||||
|
||||
* - SPI_CSQ_P1_Q2_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE1 Queue2.
|
||||
|
||||
* - SPI_CSQ_P1_Q3_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE1 Queue3.
|
||||
|
||||
* - SPI_CSQ_P1_Q4_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE1 Queue4.
|
||||
|
||||
* - SPI_CSQ_P1_Q5_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE1 Queue5.
|
||||
|
||||
* - SPI_CSQ_P1_Q6_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE1 Queue6.
|
||||
|
||||
* - SPI_CSQ_P1_Q7_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE1 Queue7.
|
||||
|
||||
* - SPI_CSQ_P2_Q0_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE2 Queue0.
|
||||
|
||||
* - SPI_CSQ_P2_Q1_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE2 Queue1.
|
||||
|
||||
* - SPI_CSQ_P2_Q2_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE2 Queue2.
|
||||
|
||||
* - SPI_CSQ_P2_Q3_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE2 Queue3.
|
||||
|
||||
* - SPI_CSQ_P2_Q4_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE2 Queue4.
|
||||
|
||||
* - SPI_CSQ_P2_Q5_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE2 Queue5.
|
||||
|
||||
* - SPI_CSQ_P2_Q6_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE2 Queue6.
|
||||
|
||||
* - SPI_CSQ_P2_Q7_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE2 Queue7.
|
||||
|
||||
* - SPI_CSQ_P3_Q0_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE3 Queue0.
|
||||
|
||||
* - SPI_CSQ_P3_Q1_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE3 Queue1.
|
||||
|
||||
* - SPI_CSQ_P3_Q2_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE3 Queue2.
|
||||
|
||||
* - SPI_CSQ_P3_Q3_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE3 Queue3.
|
||||
|
||||
* - SPI_CSQ_P3_Q4_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE3 Queue4.
|
||||
|
||||
* - SPI_CSQ_P3_Q5_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE3 Queue5.
|
||||
|
||||
* - SPI_CSQ_P3_Q6_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE3 Queue6.
|
||||
|
||||
* - SPI_CSQ_P3_Q7_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE3 Queue7.
|
||||
|
||||
* - SPI_CSQ_P0_OCCUPANCY
|
||||
- Sum of occupancy info for all PIPE0 queues.
|
||||
|
||||
* - SPI_CSQ_P1_OCCUPANCY
|
||||
- Sum of occupancy info for all PIPE1 queues.
|
||||
|
||||
* - SPI_CSQ_P2_OCCUPANCY
|
||||
- Sum of occupancy info for all PIPE2 queues.
|
||||
|
||||
* - SPI_CSQ_P3_OCCUPANCY
|
||||
- Sum of occupancy info for all PIPE3 queues.
|
||||
|
||||
* - SPI_VWC0_VDATA_VALID_WR
|
||||
- Number of clocks VGPR bus_0 writes VGPRs.
|
||||
|
||||
* - SPI_VWC1_VDATA_VALID_WR
|
||||
- Number of clocks VGPR bus_1 writes VGPRs.
|
||||
|
||||
* - SPI_CSC_WAVE_CNT_BUSY
|
||||
- Number of cycles when there is any wave in the pipe.
|
||||
|
||||
Compute unit (SQ) counters
|
||||
===========================
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Hardware counter
|
||||
- Definition
|
||||
|
||||
* - SQ_INSTS_VALU_MFMA_F6F4
|
||||
- Number of VALU V_MFMA_*_F6F4 instructions.
|
||||
|
||||
* - SQ_INSTS_VALU_MFMA_MOPS_F6F4
|
||||
- Number of VALU matrix math operations (add or mul) performed, divided by 512 and assuming a full EXEC mask, for the F6 or F4 data types.
|
||||
|
||||
* - SQ_ACTIVE_INST_VALU2
|
||||
- Number of quad-cycles when two VALU instructions are issued (per-simd, nondeterministic).
|
||||
|
||||
* - SQ_INSTS_LDS_LOAD
|
||||
- Number of LDS load instructions issued (per-simd, emulated).
|
||||
|
||||
* - SQ_INSTS_LDS_STORE
|
||||
- Number of LDS store instructions issued (per-simd, emulated).
|
||||
|
||||
* - SQ_INSTS_LDS_ATOMIC
|
||||
- Number of LDS atomic instructions issued (per-simd, emulated).
|
||||
|
||||
* - SQ_INSTS_LDS_LOAD_BANDWIDTH
|
||||
- Total number of 64-bytes loaded (instrSize * CountOnes(EXEC))/64 (per-simd, emulated).
|
||||
|
||||
* - SQ_INSTS_LDS_STORE_BANDWIDTH
|
||||
- Total number of 64-bytes written (instrSize * CountOnes(EXEC))/64 (per-simd, emulated).
|
||||
|
||||
* - SQ_INSTS_LDS_ATOMIC_BANDWIDTH
|
||||
- Total number of 64-bytes atomic (instrSize * CountOnes(EXEC))/64 (per-simd, emulated).
|
||||
|
||||
* - SQ_INSTS_VALU_FLOPS_FP16
|
||||
- Counts FLOPS per instruction on float 16 excluding MFMA/SMFMA.
|
||||
|
||||
* - SQ_INSTS_VALU_FLOPS_FP32
|
||||
- Counts FLOPS per instruction on float 32 excluding MFMA/SMFMA.
|
||||
|
||||
* - SQ_INSTS_VALU_FLOPS_FP64
|
||||
- Counts FLOPS per instruction on float 64 excluding MFMA/SMFMA.
|
||||
|
||||
* - SQ_INSTS_VALU_FLOPS_FP16_TRANS
|
||||
- Counts FLOPS per instruction on float 16 trans excluding MFMA/SMFMA.
|
||||
|
||||
* - SQ_INSTS_VALU_FLOPS_FP32_TRANS
|
||||
- Counts FLOPS per instruction on float 32 trans excluding MFMA/SMFMA.
|
||||
|
||||
* - SQ_INSTS_VALU_FLOPS_FP64_TRANS
|
||||
- Counts FLOPS per instruction on float 64 trans excluding MFMA/SMFMA.
|
||||
|
||||
* - SQ_INSTS_VALU_IOPS
|
||||
- Counts OPS per instruction on integer or unsigned or bit data (per-simd, emulated).
|
||||
|
||||
* - SQ_LDS_DATA_FIFO_FULL
|
||||
- Number of cycles LDS data FIFO is full (nondeterministic, unwindowed).
|
||||
|
||||
* - SQ_LDS_CMD_FIFO_FULL
|
||||
- Number of cycles LDS command FIFO is full (nondeterministic, unwindowed).
|
||||
|
||||
* - SQ_VMEM_TA_ADDR_FIFO_FULL
|
||||
- Number of cycles texture requests are stalled due to full address FIFO in TA (nondeterministic, unwindowed).
|
||||
|
||||
* - SQ_VMEM_TA_CMD_FIFO_FULL
|
||||
- Number of cycles texture requests are stalled due to full cmd FIFO in TA (nondeterministic, unwindowed).
|
||||
|
||||
* - SQ_VMEM_WR_TA_DATA_FIFO_FULL
|
||||
- Number of cycles texture writes are stalled due to full data FIFO in TA (nondeterministic, unwindowed).
|
||||
|
||||
* - SQC_ICACHE_MISSES_DUPLICATE
|
||||
- Number of duplicate misses (access to a non-resident, miss pending CL) (per-SQ, per-Bank, nondeterministic).
|
||||
|
||||
* - SQC_DCACHE_MISSES_DUPLICATE
|
||||
- Number of duplicate misses (access to a non-resident, miss pending CL) (per-SQ, per-Bank, nondeterministic).
|
||||
|
||||
Texture addressing (TA) unit counters
|
||||
======================================
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Hardware counter
|
||||
- Definition
|
||||
|
||||
* - TA_BUFFER_READ_LDS_WAVEFRONTS
|
||||
- Number of buffer read wavefronts for LDS return processed by the TA.
|
||||
|
||||
* - TA_FLAT_READ_LDS_WAVEFRONTS
|
||||
- Number of flat opcode reads for LDS return processed by the TA.
|
||||
|
||||
Texture data (TD) unit counters
|
||||
================================
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Hardware counter
|
||||
- Definition
|
||||
|
||||
* - TD_WRITE_ACKT_WAVEFRONT
|
||||
- Number of write acknowledgments, sent to SQ and not to SP.
|
||||
|
||||
* - TD_TD_SP_TRAFFIC
|
||||
- Number of times this TD sends data to the SP.
|
||||
|
||||
Texture cache per pipe (TCP) counters
|
||||
======================================
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Hardware counter
|
||||
- Definition
|
||||
|
||||
* - TCP_TCP_TA_ADDR_STALL_CYCLES
|
||||
- TCP stalls TA addr interface.
|
||||
|
||||
* - TCP_TCP_TA_DATA_STALL_CYCLES
|
||||
- TCP stalls TA data interface. Now windowed.
|
||||
|
||||
* - TCP_LFIFO_STALL_CYCLES
|
||||
- Memory latency FIFOs full stall.
|
||||
|
||||
* - TCP_RFIFO_STALL_CYCLES
|
||||
- Memory Request FIFOs full stall.
|
||||
|
||||
* - TCP_TCR_RDRET_STALL
|
||||
- Write into cache stalled by read return from TCR.
|
||||
|
||||
* - TCP_PENDING_STALL_CYCLES
|
||||
- Stall due to data pending from L2.
|
||||
|
||||
* - TCP_UTCL1_SERIALIZATION_STALL
|
||||
- Total number of stalls caused due to serializing translation requests through the UTCL1.
|
||||
|
||||
* - TCP_UTCL1_THRASHING_STALL
|
||||
- Stall caused by thrashing feature in any probe. Lacks accuracy when the stall signal overlaps between probe0 and probe1, which is worse with MECO of thrashing deadlock. Some probe0 events could miss being counted in with MECO on. This perf count provides a rough thrashing estimate.
|
||||
|
||||
* - TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS
|
||||
- Translation miss_under_miss.
|
||||
|
||||
* - TCP_UTCL1_STALL_INFLIGHT_MAX
|
||||
- Total UTCL1 stalls due to inflight counter saturation.
|
||||
|
||||
* - TCP_UTCL1_STALL_LRU_INFLIGHT
|
||||
- Total UTCL1 stalls due to LRU cache line with inflight traffic.
|
||||
|
||||
* - TCP_UTCL1_STALL_MULTI_MISS
|
||||
- Total UTCL1 stalls due to arbitrated multiple misses.
|
||||
|
||||
* - TCP_UTCL1_LFIFO_FULL
|
||||
- Total UTCL1 and UTCL2 latency, which hides FIFO full cycles.
|
||||
|
||||
* - TCP_UTCL1_STALL_LFIFO_NOT_RES
|
||||
- Total UTCL1 stalls due to UTCL2 latency, which hides FIFO output (not resident).
|
||||
|
||||
* - TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS
|
||||
- Total UTCL1 stalls due to UTCL2_req being out of credits.
|
||||
|
||||
* - TCP_CLIENT_UTCL1_INFLIGHT
|
||||
- The sum of inflight client to UTCL1 requests per cycle.
|
||||
|
||||
* - TCP_TAGRAM0_REQ
|
||||
- Total L2 requests mapping to TagRAM 0 from this TCP to all TCCs.
|
||||
|
||||
* - TCP_TAGRAM1_REQ
|
||||
- Total L2 requests mapping to TagRAM 1 from this TCP to all TCCs.
|
||||
|
||||
* - TCP_TAGRAM2_REQ
|
||||
- Total L2 requests mapping to TagRAM 2 from this TCP to all TCCs.
|
||||
|
||||
* - TCP_TAGRAM3_REQ
|
||||
- Total L2 requests mapping to TagRAM 3 from this TCP to all TCCs.
|
||||
|
||||
* - TCP_TCP_LATENCY
- Total TCP wave latency (from the first clock of a wave entering to the first clock of the wave leaving). Divide by TA_TCP_STATE_READ to find the average wave latency (see the sketch after this table).
|
||||
|
||||
* - TCP_TCC_READ_REQ_LATENCY
|
||||
- Total TCP to TCC request latency for reads and atomics with return. Not Windowed.
|
||||
|
||||
* - TCP_TCC_WRITE_REQ_LATENCY
|
||||
- Total TCP to TCC request latency for writes and atomics without return. Not Windowed.
|
||||
|
||||
* - TCP_TCC_WRITE_REQ_HOLE_LATENCY
|
||||
- Total TCP req to TCC hole latency for writes and atomics. Not Windowed.
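
As a minimal sketch of the average-wave-latency derivation mentioned for ``TCP_TCP_LATENCY`` (the values are placeholders, not measured data):

.. code-block:: python

   # Minimal sketch: average wave latency in the TCP, per the note above.
   tcp_tcp_latency = 5_000_000    # TCP_TCP_LATENCY, total wave latency in cycles
   ta_tcp_state_read = 25_000     # TA_TCP_STATE_READ, number of waves observed

   avg_wave_latency = tcp_tcp_latency / ta_tcp_state_read if ta_tcp_state_read else 0.0
   print(f"average wave latency: {avg_wave_latency:.1f} cycles")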
|
||||
|
||||
Texture cache per channel (TCC) counters
|
||||
=========================================
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Hardware counter
|
||||
- Definition
|
||||
|
||||
* - TCC_READ_SECTORS
|
||||
- Total number of 32B data sectors in read requests.
|
||||
|
||||
* - TCC_WRITE_SECTORS
|
||||
- Total number of 32B data sectors in write requests.
|
||||
|
||||
* - TCC_ATOMIC_SECTORS
|
||||
- Total number of 32B data sectors in atomic requests.
|
||||
|
||||
* - TCC_BYPASS_REQ
|
||||
- Number of bypass requests. This is measured at the tag block.
|
||||
|
||||
* - TCC_LATENCY_FIFO_FULL
|
||||
- Number of cycles when the latency FIFO is full.
|
||||
|
||||
* - TCC_SRC_FIFO_FULL
|
||||
- Number of cycles when the SRC FIFO is assumed to be full as measured at the IB block.
|
||||
|
||||
* - TCC_EA0_RDREQ_64B
|
||||
- Number of 64-byte TCC/EA read requests.
|
||||
|
||||
* - TCC_EA0_RDREQ_128B
|
||||
- Number of 128-byte TCC/EA read requests.
|
||||
|
||||
* - TCC_IB_REQ
|
||||
- Number of requests through the IB. This measures the number of raw requests from graphics clients to this TCC.
|
||||
|
||||
* - TCC_IB_STALL
|
||||
- Number of cycles when the IB output is stalled.
|
||||
|
||||
* - TCC_EA0_WRREQ_WRITE_DRAM
|
||||
- Number of TCC/EA write requests (32-byte or 64-byte) destined for DRAM (MC).
|
||||
|
||||
* - TCC_EA0_WRREQ_ATOMIC_DRAM
|
||||
- Number of TCC/EA atomic requests (32-byte or 64-byte) destined for DRAM (MC).
|
||||
|
||||
* - TCC_EA0_RDREQ_DRAM_32B
|
||||
- Number of 32-byte TCC/EA read requests due to DRAM traffic. One 64-byte request is counted as two and one 128-byte as four.
|
||||
|
||||
* - TCC_EA0_RDREQ_GMI_32B
|
||||
- Number of 32-byte TCC/EA read requests due to GMI traffic. One 64-byte request is counted as two and one 128-byte as four.
|
||||
|
||||
* - TCC_EA0_RDREQ_IO_32B
|
||||
- Number of 32-byte TCC/EA read requests due to IO traffic. One 64-byte request is counted as two and one 128-byte as four.
|
||||
|
||||
* - TCC_EA0_WRREQ_WRITE_DRAM_32B
|
||||
- Number of 32-byte TCC/EA write requests due to DRAM traffic. One 64-byte request is counted as two.
|
||||
|
||||
* - TCC_EA0_WRREQ_ATOMIC_DRAM_32B
|
||||
- Number of 32-byte TCC/EA atomic requests due to DRAM traffic. One 64-byte request is counted as two.
|
||||
|
||||
* - TCC_EA0_WRREQ_WRITE_GMI_32B
|
||||
- Number of 32-byte TCC/EA write requests due to GMI traffic. One 64-byte request is counted as two.
|
||||
|
||||
* - TCC_EA0_WRREQ_ATOMIC_GMI_32B
|
||||
- Number of 32-byte TCC/EA atomic requests due to GMI traffic. One 64-byte request is counted as two.
|
||||
|
||||
* - TCC_EA0_WRREQ_WRITE_IO_32B
|
||||
- Number of 32-byte TCC/EA write requests due to IO traffic. One 64-byte request is counted as two.
|
||||
|
||||
* - TCC_EA0_WRREQ_ATOMIC_IO_32B
|
||||
- Number of 32-byte TCC/EA atomic requests due to IO traffic. One 64-byte request is counted as two.
|
||||
@@ -1,116 +0,0 @@
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="description" content="GPU isolation techniques">
|
||||
<meta name="keywords" content="GPU isolation techniques, UUID, universally unique identifier,
|
||||
environment variables, virtual machines, AMD, ROCm">
|
||||
</head>
|
||||
|
||||
# GPU isolation techniques
|
||||
|
||||
Restricting applications to a subset of GPUs, also known as isolating GPUs, lets users hide GPU resources from programs. By default, programs only use the "exposed" GPUs and ignore the other (hidden) GPUs in the system.

There are multiple ways to achieve GPU isolation in the ROCm software stack, differing in which applications they apply to and in the security they provide. This page serves as an overview of these techniques.
|
||||
|
||||
## Environment variables
|
||||
|
||||
The runtimes in the ROCm software stack read these environment variables to select the exposed or default devices to present to the applications that use them.

Environment variables shouldn't be used to isolate untrusted applications, because an application can reset them before initializing the runtime.
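
As a minimal sketch of why this is not a security boundary (it assumes `libamdhip64.so` is on the loader path and uses the real `hipGetDeviceCount` entry point), a process can simply clear the restriction before the HIP runtime initializes:

```{code-block} python
:caption: Sketch of an application discarding the caller's device restriction.
import ctypes
import os

# Re-expose every device, discarding whatever the parent process set.
os.environ.pop("ROCR_VISIBLE_DEVICES", None)
os.environ.pop("HIP_VISIBLE_DEVICES", None)

hip = ctypes.CDLL("libamdhip64.so")          # loads the HIP runtime library
count = ctypes.c_int(0)
hip.hipGetDeviceCount(ctypes.byref(count))   # runtime initializes here and sees all GPUs
print(f"Visible HIP devices: {count.value}")
```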
|
||||
|
||||
### `ROCR_VISIBLE_DEVICES`
|
||||
|
||||
A list of device indices or {abbr}`UUID (universally unique identifier)`s
|
||||
that will be exposed to applications.
|
||||
|
||||
Runtime
|
||||
: ROCm Software Runtime. Applies to all applications using the user mode ROCm
|
||||
software stack.
|
||||
|
||||
```{code-block} shell
|
||||
:caption: Example exposing the first device and a device selected by UUID.
|
||||
export ROCR_VISIBLE_DEVICES="0,GPU-DEADBEEFDEADBEEF"
|
||||
```
|
||||
|
||||
### `GPU_DEVICE_ORDINAL`
|
||||
|
||||
Device indices exposed to OpenCL and HIP applications.
|
||||
|
||||
Runtime
|
||||
: ROCm Compute Language Runtime (`ROCclr`). Applies to applications and runtimes
|
||||
using the `ROCclr` abstraction layer including HIP and OpenCL applications.
|
||||
|
||||
```{code-block} shell
|
||||
:caption: Example exposing the first and third devices in the system.
|
||||
export GPU_DEVICE_ORDINAL="0,2"
|
||||
```
|
||||
|
||||
(hip_visible_devices)=
|
||||
|
||||
### `HIP_VISIBLE_DEVICES`
|
||||
|
||||
Device indices exposed to HIP applications.
|
||||
|
||||
Runtime
: HIP runtime. Applies only to applications using HIP on the AMD platform.
|
||||
|
||||
```{code-block} shell
|
||||
:caption: Example exposing the first and third devices in the system.
|
||||
export HIP_VISIBLE_DEVICES="0,2"
|
||||
```
|
||||
|
||||
### `CUDA_VISIBLE_DEVICES`
|
||||
|
||||
Provided for CUDA compatibility, this variable has the same effect as `HIP_VISIBLE_DEVICES` on the AMD platform.
|
||||
|
||||
Runtime
|
||||
: HIP or CUDA Runtime. Applies to HIP applications on the AMD or NVIDIA platform
|
||||
and CUDA applications.
|
||||
|
||||
### `OMP_DEFAULT_DEVICE`
|
||||
|
||||
Default device used for OpenMP target offloading.
|
||||
|
||||
Runtime
|
||||
: OpenMP Runtime. Applies only to applications using OpenMP offloading.
|
||||
|
||||
```{code-block} shell
|
||||
:caption: Example setting the default device to the third device.
|
||||
export OMP_DEFAULT_DEVICE="2"
|
||||
```
|
||||
|
||||
## Docker
|
||||
|
||||
Docker uses Linux kernel namespaces to provide isolated environments for applications. This isolation applies to most devices by default, including GPUs. To access them in containers, explicit access must be granted; see {ref}`docker-access-gpus-in-container` for details. In particular, refer to {ref}`docker-restrict-gpus` for exposing just a subset of all GPUs.
|
||||
|
||||
Docker isolation is more secure than environment variables, and applies
|
||||
to all programs that use the `amdgpu` kernel module interfaces.
|
||||
Even programs that don't use the ROCm runtime, like graphics applications
|
||||
using OpenGL or Vulkan, can only access the GPUs exposed to the container.
|
||||
|
||||
## GPU passthrough to virtual machines
|
||||
|
||||
Virtual machines achieve the highest level of isolation, because even the kernel of the virtual machine is isolated from the host. Devices physically installed in the host system can be passed to the virtual machine using PCIe passthrough. This allows the GPU to be used with a different operating system, such as a Windows guest running on a Linux host.
|
||||
|
||||
Setting up PCIe passthrough is specific to the hypervisor used. ROCm officially
|
||||
supports [VMware ESXi](https://www.vmware.com/products/esxi-and-esx.html)
|
||||
for select GPUs.
|
||||
|
||||
<!--
|
||||
TODO: This should link to a page about virtualization that explains
|
||||
pass-through and SR-IOV and how-tos for maybe `libvirt` and `VMWare`
|
||||
-->
|
||||
341
docs/conf.py
@@ -9,66 +9,85 @@ import shutil
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
gh_release_path = os.path.join("..", "RELEASE.md")
|
||||
gh_changelog_path = os.path.join("..", "CHANGELOG.md")
|
||||
sphinx_release_path = os.path.join("about", "release-notes.md")
|
||||
sphinx_changelog_path = os.path.join("release", "changelog.md")
|
||||
shutil.copy2(gh_release_path, sphinx_release_path)
|
||||
shutil.copy2(gh_changelog_path, sphinx_changelog_path)
|
||||
ROCM_VERSION = "7.9.0"
|
||||
GA_DATE = "2025-10-20"
|
||||
|
||||
DOCS_DIR = Path(__file__).parent.resolve()
|
||||
ROOT_DIR = DOCS_DIR.parent
|
||||
|
||||
|
||||
def copy_rtd_file(src_path: Path, dest_path: Path):
|
||||
dest_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
shutil.copy2(src_path, dest_path)
|
||||
print(f"📁 Copied {src_path} → {dest_path}")
|
||||
|
||||
|
||||
# compat_matrix_src = DOCS_DIR / "compatibility" / "compatibility-matrix-historical-6.0.csv" # fmt: skip
|
||||
# compat_matrix_dest = ROOT_DIR / "_readthedocs" / "html" / "downloads" / "compatibility-matrix-historical-6.0.csv" # fmt: skip
|
||||
# copy_rtd_file(compat_matrix_src, compat_matrix_dest)
|
||||
|
||||
gh_release_path = ROOT_DIR / "RELEASE.md"
|
||||
rtd_release_path = DOCS_DIR / "about" / "release-notes.md"
|
||||
copy_rtd_file(gh_release_path, rtd_release_path)
|
||||
|
||||
# gh_changelog_path = ROOT_DIR / "CHANGELOG.md"
|
||||
# rtd_changelog_path = DOCS_DIR / "release" / "changelog.md"
|
||||
# copy_rtd_file(gh_changelog_path, rtd_changelog_path)
|
||||
|
||||
|
||||
# Mark the consolidated changelog as orphan to prevent Sphinx from warning about missing toctree entries
|
||||
with open(sphinx_changelog_path, "r+", encoding="utf-8") as file:
|
||||
content = file.read()
|
||||
file.seek(0)
|
||||
file.write(":orphan:\n" + content)
|
||||
# with open(rtd_changelog_path, "r+", encoding="utf-8") as file:
|
||||
# content = file.read()
|
||||
# file.seek(0)
|
||||
# file.write(":orphan:\n" + content)
|
||||
#
|
||||
# # Replace GitHub-style [!ADMONITION]s with Sphinx-compatible ```{admonition} blocks
|
||||
# with open(rtd_changelog_path, "r", encoding="utf-8") as file:
|
||||
# lines = file.readlines()
|
||||
#
|
||||
# modified_lines = []
|
||||
# in_admonition_section = False
|
||||
#
|
||||
# # Map for matching the specific admonition type to its corresponding Sphinx markdown syntax
|
||||
# admonition_types = {
|
||||
# "> [!NOTE]": "```{note}",
|
||||
# "> [!TIP]": "```{tip}",
|
||||
# "> [!IMPORTANT]": "```{important}",
|
||||
# "> [!WARNING]": "```{warning}",
|
||||
# "> [!CAUTION]": "```{caution}",
|
||||
# }
|
||||
#
|
||||
# for line in lines:
|
||||
# if any(line.startswith(k) for k in admonition_types):
|
||||
# for key in admonition_types:
|
||||
# if line.startswith(key):
|
||||
# modified_lines.append(admonition_types[key] + "\n")
|
||||
# break
|
||||
# in_admonition_section = True
|
||||
# elif in_admonition_section:
|
||||
# if line.strip() == "":
|
||||
# # If we encounter an empty line, close the admonition section
|
||||
# modified_lines.append("```\n\n") # Close the admonition block
|
||||
# in_admonition_section = False
|
||||
# else:
|
||||
# modified_lines.append(line.lstrip("> "))
|
||||
# else:
|
||||
# modified_lines.append(line)
|
||||
#
|
||||
# # In case the file ended while still in a admonition section, close it
|
||||
# if in_admonition_section:
|
||||
# modified_lines.append("```")
|
||||
#
|
||||
# file.close()
|
||||
#
|
||||
# with open(rtd_changelog_path, "w", encoding="utf-8") as file:
|
||||
# file.writelines(modified_lines)
|
||||
|
||||
# Replace GitHub-style [!ADMONITION]s with Sphinx-compatible ```{admonition} blocks
|
||||
with open(sphinx_changelog_path, "r", encoding="utf-8") as file:
|
||||
lines = file.readlines()
|
||||
|
||||
modified_lines = []
|
||||
in_admonition_section = False
|
||||
|
||||
# Map for matching the specific admonition type to its corresponding Sphinx markdown syntax
|
||||
admonition_types = {
|
||||
'> [!NOTE]': '```{note}',
|
||||
'> [!TIP]': '```{tip}',
|
||||
'> [!IMPORTANT]': '```{important}',
|
||||
'> [!WARNING]': '```{warning}',
|
||||
'> [!CAUTION]': '```{caution}'
|
||||
}
|
||||
|
||||
for line in lines:
|
||||
if any(line.startswith(k) for k in admonition_types):
|
||||
for key in admonition_types:
|
||||
if(line.startswith(key)):
|
||||
modified_lines.append(admonition_types[key] + '\n')
|
||||
break
|
||||
in_admonition_section = True
|
||||
elif in_admonition_section:
|
||||
if line.strip() == '':
|
||||
# If we encounter an empty line, close the admonition section
|
||||
modified_lines.append('```\n\n') # Close the admonition block
|
||||
in_admonition_section = False
|
||||
else:
|
||||
modified_lines.append(line.lstrip('> '))
|
||||
else:
|
||||
modified_lines.append(line)
|
||||
|
||||
# In case the file ended while still in a admonition section, close it
|
||||
if in_admonition_section:
|
||||
modified_lines.append('```')
|
||||
|
||||
file.close()
|
||||
|
||||
with open(sphinx_changelog_path, "w", encoding="utf-8") as file:
|
||||
file.writelines(modified_lines)
|
||||
|
||||
matrix_path = os.path.join("compatibility", "compatibility-matrix-historical-6.0.csv")
|
||||
rtd_path = os.path.join("..", "_readthedocs", "html", "downloads")
|
||||
if not os.path.exists(rtd_path):
|
||||
os.makedirs(rtd_path)
|
||||
shutil.copy2(matrix_path, rtd_path)
|
||||
# matrix_path = os.path.join("compatibility", "compatibility-matrix-historical-6.0.csv")
|
||||
# rtd_path = os.path.join("..", "_readthedocs", "html", "downloads")
|
||||
# if not os.path.exists(rtd_path):
|
||||
# os.makedirs(rtd_path)
|
||||
# shutil.copy2(matrix_path, rtd_path)
|
||||
|
||||
latex_engine = "xelatex"
|
||||
latex_elements = {
|
||||
@@ -81,154 +100,108 @@ latex_elements = {
|
||||
|
||||
html_baseurl = os.environ.get("READTHEDOCS_CANONICAL_URL", "rocm.docs.amd.com")
|
||||
html_context = {}
|
||||
if os.environ.get("READTHEDOCS", "") == "True":
|
||||
html_context["READTHEDOCS"] = True
|
||||
|
||||
# configurations for PDF output by Read the Docs
|
||||
project = "ROCm Documentation"
|
||||
project_path = os.path.abspath(".").replace("\\", "/")
|
||||
project_path = str(DOCS_DIR).replace("\\", "/")
|
||||
author = "Advanced Micro Devices, Inc."
|
||||
copyright = "Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved."
|
||||
version = "7.0.0"
|
||||
release = "7.0.0"
|
||||
copyright = "Copyright (c) %Y Advanced Micro Devices, Inc. All rights reserved."
|
||||
version = ROCM_VERSION
|
||||
release = ROCM_VERSION
|
||||
setting_all_article_info = True
|
||||
all_article_info_os = ["linux", "windows"]
|
||||
all_article_info_author = ""
|
||||
|
||||
# pages with specific settings
|
||||
article_pages = [
|
||||
{"file": "about/release-notes", "os": ["linux"], "date": "2025-09-16"},
|
||||
{"file": "release/changelog", "os": ["linux"],},
|
||||
{"file": "compatibility/compatibility-matrix", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/pytorch-compatibility", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/tensorflow-compatibility", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/jax-compatibility", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/verl-compatibility", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/stanford-megatron-lm-compatibility", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/dgl-compatibility", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/megablocks-compatibility", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/taichi-compatibility", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/ray-compatibility", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/llama-cpp-compatibility", "os": ["linux"]},
|
||||
{"file": "how-to/deep-learning-rocm", "os": ["linux"]},
|
||||
|
||||
{"file": "how-to/rocm-for-ai/index", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/install", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/system-health-check", "os": ["linux"]},
|
||||
|
||||
{"file": "how-to/rocm-for-ai/training/index", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/train-a-model", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/prerequisite-system-validation", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/scale-model-training", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/megatron-lm", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-history", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v24.12-dev", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.3", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.4", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.5", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.6", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-primus-migration-guide", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/primus-megatron", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.3", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.4", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.5", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.6", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.4", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.5", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry", "os": ["linux"]},
|
||||
|
||||
{"file": "how-to/rocm-for-ai/fine-tuning/index", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/fine-tuning/overview", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/fine-tuning/single-gpu-fine-tuning-and-inference", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/fine-tuning/multi-gpu-fine-tuning-and-inference", "os": ["linux"]},
|
||||
|
||||
{"file": "how-to/rocm-for-ai/inference/index", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/hugging-face-models", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/llm-inference-frameworks", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/vllm", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.4.3", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.6.4", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.6.6", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.7.3-20250325", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.3-20250415", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250513", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250521", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.0.1-20250605", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.0.1-20250702", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250702", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.0-20250812", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/sglang-history", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/deploy-your-model", "os": ["linux"]},
|
||||
|
||||
{"file": "how-to/rocm-for-ai/inference-optimization/index", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference-optimization/model-quantization", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference-optimization/model-acceleration-libraries", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference-optimization/optimizing-with-composable-kernel", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference-optimization/optimizing-triton-kernel", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference-optimization/profiling-and-debugging", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference-optimization/workload", "os": ["linux"]},
|
||||
|
||||
{"file": "how-to/system-optimization/index", "os": ["linux"]},
|
||||
{"file": "how-to/system-optimization/mi300x", "os": ["linux"]},
|
||||
{"file": "how-to/system-optimization/mi200", "os": ["linux"]},
|
||||
{"file": "how-to/system-optimization/mi100", "os": ["linux"]},
|
||||
{"file": "how-to/system-optimization/w6000-v620", "os": ["linux"]},
|
||||
{"file": "how-to/tuning-guides/mi300x/index", "os": ["linux"]},
|
||||
{"file": "how-to/tuning-guides/mi300x/system", "os": ["linux"]},
|
||||
{"file": "how-to/tuning-guides/mi300x/workload", "os": ["linux"]},
|
||||
{"file": "how-to/system-debugging", "os": ["linux"]},
|
||||
{"file": "how-to/gpu-enabled-mpi", "os": ["linux"]},
|
||||
{"file": "about/release-notes", "date": GA_DATE},
|
||||
{"file": "rocm-for-ai/xdit-diffusion-inference", "os": ["linux"]},
|
||||
]
|
||||
|
||||
external_toc_path = "./sphinx/_toc.yml"
|
||||
|
||||
# Add the _extensions directory to Python's search path
|
||||
sys.path.append(str(Path(__file__).parent / 'extension'))
|
||||
# Register Sphinx extensions and static assets
|
||||
sys.path.append(str(DOCS_DIR / "extension"))
|
||||
|
||||
extensions = ["rocm_docs", "sphinx_reredirects", "sphinx_sitemap", "sphinxcontrib.datatemplates", "version-ref", "csv-to-list-table"]
|
||||
# "sphinx/static/css", "extension/how-to/rocm-for-ai/inference"]
|
||||
# html_css_files = [
|
||||
# "rocm_custom.css",
|
||||
# "rocm_rn.css",
|
||||
# "dynamic_picker.css",
|
||||
# "vllm-benchmark.css",
|
||||
# ]
|
||||
templates_path = ["extension/rocm_docs_custom/templates", "extension/templates"]
|
||||
|
||||
compatibility_matrix_file = str(Path(__file__).parent / 'compatibility/compatibility-matrix-historical-6.0.csv')
|
||||
extensions = [
|
||||
"rocm_docs",
|
||||
"rocm_docs_custom.selector",
|
||||
"rocm_docs_custom.table",
|
||||
"rocm_docs_custom.icon",
|
||||
"sphinxcontrib.datatemplates",
|
||||
# "sphinx_reredirects",
|
||||
# "sphinx_sitemap",
|
||||
# "version-ref",
|
||||
# "csv-to-list-table",
|
||||
]
|
||||
|
||||
# compatibility_matrix_file = str(
|
||||
# DOCS_DIR / "compatibility/compatibility-matrix-historical-6.0.csv"
|
||||
# )
|
||||
|
||||
external_projects_current_project = "rocm"
|
||||
|
||||
# Uncomment if facing rate limit exceed issue with local build
|
||||
# external_projects_remote_repository = ""
|
||||
|
||||
html_baseurl = os.environ.get("READTHEDOCS_CANONICAL_URL", "https://rocm-stg.amd.com/")
|
||||
html_context = {}
|
||||
if os.environ.get("READTHEDOCS", "") == "True":
|
||||
html_context["READTHEDOCS"] = True
|
||||
|
||||
html_theme = "rocm_docs_theme"
|
||||
html_theme_options = {"flavor": "rocm-docs-home"}
|
||||
|
||||
html_static_path = ["sphinx/static/css", "extension/how-to/rocm-for-ai/inference"]
|
||||
html_css_files = ["rocm_custom.css", "rocm_rn.css", "vllm-benchmark.css"]
|
||||
html_theme_options = {
|
||||
"announcement": "This is ROCm 7.9.0 technology preview release documentation. For the latest production stream release, refer to <a id='rocm-banner' href='https://rocm.docs.amd.com/en/latest/'>ROCm documentation</a>.",
|
||||
"flavor": "generic",
|
||||
"header_title": "ROCm™ 7.9.0 Preview",
|
||||
"header_link": "https://rocm.docs.amd.com/en/7.9.0-preview/index.html",
|
||||
"version_list_link": "https://rocm.docs.amd.com/en/7.10.0-preview/release/versions.html",
|
||||
"nav_secondary_items": {
|
||||
"GitHub": "https://github.com/ROCm/ROCm",
|
||||
"Community": "https://github.com/ROCm/ROCm/discussions",
|
||||
"Blogs": "https://rocm.blogs.amd.com/",
|
||||
"Instinct™ Docs": "https://instinct.docs.amd.com/",
|
||||
"Support": "https://github.com/ROCm/ROCm/issues/new/choose",
|
||||
},
|
||||
"link_main_doc": False,
|
||||
"secondary_sidebar_items": {
|
||||
"**": ["page-toc"],
|
||||
"install/rocm": ["selector-toc2"],
|
||||
"compatibility/compatibility-matrix": ["selector-toc2"],
|
||||
}
|
||||
}
|
||||
html_title = f"AMD ROCm {ROCM_VERSION} preview"
|
||||
html_static_path = ["sphinx/static/css", "sphinx/static/js"]
|
||||
html_css_files = ["vllm-benchmark.css"]
|
||||
html_js_files = ["vllm-benchmark.js"]
|
||||
|
||||
html_title = "ROCm Documentation"
|
||||
|
||||
html_theme_options = {"link_main_doc": False}
|
||||
|
||||
redirects = {"reference/openmp/openmp": "../../about/compatibility/openmp.html"}
|
||||
|
||||
numfig = False
|
||||
suppress_warnings = ["autosectionlabel.*"]
|
||||
|
||||
html_context = {
|
||||
"project_path" : {project_path},
|
||||
"gpu_type" : [('AMD Instinct accelerators', 'intrinsic'), ('AMD gfx families', 'gfx'), ('NVIDIA families', 'nvidia') ],
|
||||
"atomics_type" : [('HW atomics', 'hw-atomics'), ('CAS emulation', 'cas-atomics')],
|
||||
"pcie_type" : [('No PCIe atomics', 'nopcie'), ('PCIe atomics', 'pcie')],
|
||||
"memory_type" : [('Device DRAM', 'device-dram'), ('Migratable Host DRAM', 'migratable-host-dram'), ('Pinned Host DRAM', 'pinned-host-dram')],
|
||||
"granularity_type" : [('Coarse-grained', 'coarse-grained'), ('Fine-grained', 'fine-grained')],
|
||||
"scope_type" : [('Device', 'device'), ('System', 'system')]
|
||||
}
|
||||
# html_baseurl = os.environ.get("READTHEDOCS_CANONICAL_URL", "https://rocm-stg.amd.com/")
|
||||
# html_context = {
|
||||
# "project_path": {project_path},
|
||||
# "gpu_type": [
|
||||
# ("AMD Instinct accelerators", "intrinsic"),
|
||||
# ("AMD gfx families", "gfx"),
|
||||
# ("NVIDIA families", "nvidia"),
|
||||
# ],
|
||||
# "atomics_type": [("HW atomics", "hw-atomics"), ("CAS emulation", "cas-atomics")],
|
||||
# "pcie_type": [("No PCIe atomics", "nopcie"), ("PCIe atomics", "pcie")],
|
||||
# "memory_type": [
|
||||
# ("Device DRAM", "device-dram"),
|
||||
# ("Migratable Host DRAM", "migratable-host-dram"),
|
||||
# ("Pinned Host DRAM", "pinned-host-dram"),
|
||||
# ],
|
||||
# "granularity_type": [
|
||||
# ("Coarse-grained", "coarse-grained"),
|
||||
# ("Fine-grained", "fine-grained"),
|
||||
# ],
|
||||
# "scope_type": [("Device", "device"), ("System", "system")],
|
||||
# }
|
||||
if os.environ.get("READTHEDOCS", "") == "True":
|
||||
html_context["READTHEDOCS"] = True
|
||||
|
||||
# temporary settings to speed up docs build for faster iteration
|
||||
# external_projects_remote_repository = ""
|
||||
# external_toc_exclude_missing = True
|
||||
|
||||
@@ -1,168 +0,0 @@
<head>
  <meta charset="UTF-8">
  <meta name="description" content="Building ROCm documentation">
  <meta name="keywords" content="documentation, Visual Studio Code, GitHub, command line,
  AMD, ROCm">
</head>

# Building documentation

## GitHub

If you open a pull request and scroll down to the summary panel,
there is a commit status section. Next to the line
`docs/readthedocs.com:advanced-micro-devices-demo`, there is a `Details` link.
Clicking this link takes you to the Read the Docs build for your pull request.



If you don't see this line, click `Show all checks` to get an itemized view.

## Command line

You can build our documentation via the command line using Python.

See the `build.tools.python` setting in the [Read the Docs configuration file](https://github.com/ROCm/ROCm/blob/develop/.readthedocs.yaml) for the Python version used by Read the Docs to build the documentation.

See the [Python requirements file](https://github.com/ROCm/ROCm/blob/develop/docs/sphinx/requirements.txt) for the Python packages needed to build the documentation.

Use a Python virtual environment (`venv`) and run the following commands from the project root:

::::{tab-set}
:::{tab-item} Linux and WSL
:sync: linux

```sh
python3 -m venv .venv

.venv/bin/python -m pip install -r docs/sphinx/requirements.txt
.venv/bin/python -m sphinx -T -E -b html -d _build/doctrees -D language=en docs _build/html
```

:::
:::{tab-item} Windows
:sync: windows

```powershell
python -m venv .venv

.venv\Scripts\python.exe -m pip install -r docs/sphinx/requirements.txt
.venv\Scripts\python.exe -m sphinx -T -E -b html -d _build/doctrees -D language=en docs _build/html
```

:::
::::

Navigate to `_build/html/index.html` and open this file in a web browser.

## Visual Studio Code

With the help of a few extensions, you can create a productive environment to author and test
documentation locally using Visual Studio (VS) Code. Follow these steps to configure VS Code:

1. Install the required extensions:

   * Python: `(ms-python.python)`
   * Live Server: `(ritwickdey.LiveServer)`

2. Add the following entries to `.vscode/settings.json`.

   ```json
   {
     "liveServer.settings.root": "/.vscode/build/html",
     "liveServer.settings.wait": 1000,
     "python.terminal.activateEnvInCurrentTerminal": true
   }
   ```

   * `liveServer.settings.root`: Sets the root of the output website for live previews. Must be changed
     alongside the `tasks.json` command.
   * `liveServer.settings.wait`: Tells the live server to delay its refresh, giving Sphinx time to
     regenerate the site contents so the preview doesn't reload before the build is complete.
   * `python.terminal.activateEnvInCurrentTerminal`: Automatically activates the virtual environment, so you
     can build the site from the integrated terminal.

3. Add the following tasks to `.vscode/tasks.json`.

   ```json
   {
     "version": "2.0.0",
     "tasks": [
       {
         "label": "Build Docs",
         "type": "process",
         "windows": {
           "command": "${workspaceFolder}/.venv/Scripts/python.exe"
         },
         "command": "${workspaceFolder}/.venv/bin/python3",
         "args": [
           "-m",
           "sphinx",
           "-j",
           "auto",
           "-T",
           "-b",
           "html",
           "-d",
           "${workspaceFolder}/.vscode/build/doctrees",
           "-D",
           "language=en",
           "${workspaceFolder}/docs",
           "${workspaceFolder}/.vscode/build/html"
         ],
         "problemMatcher": [
           {
             "owner": "sphinx",
             "fileLocation": "absolute",
             "pattern": {
               "regexp": "^(?:.*\\.{3}\\s+)?(\\/[^:]*|[a-zA-Z]:\\\\[^:]*):(\\d+):\\s+(WARNING|ERROR):\\s+(.*)$",
               "file": 1,
               "line": 2,
               "severity": 3,
               "message": 4
             }
           },
           {
             "owner": "sphinx",
             "fileLocation": "absolute",
             "pattern": {
               "regexp": "^(?:.*\\.{3}\\s+)?(\\/[^:]*|[a-zA-Z]:\\\\[^:]*):{1,2}\\s+(WARNING|ERROR):\\s+(.*)$",
               "file": 1,
               "severity": 2,
               "message": 3
             }
           }
         ],
         "group": {
           "kind": "build",
           "isDefault": true
         }
       }
     ]
   }
   ```

   > Implementation detail: two problem matchers had to be defined
   > because VS Code doesn't tolerate problem information that is potentially
   > absent. While a single regex could match all types of errors, if a capture
   > group remains empty (the line number doesn't show up in all warning/error
   > messages) but the `pattern` references that empty capture group, VS Code
   > discards the message completely. The sketch below shows how the first
   > pattern parses a typical Sphinx warning line.

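For illustration, here is a minimal sketch that applies the first matcher's regular expression to a sample Sphinx warning line. The file path and message below are invented for the example; only the expression itself is taken from the `tasks.json` above.

```python
import re

# Same expression as the first problem matcher above, written as a raw Python string.
SPHINX_WARNING = re.compile(
    r"^(?:.*\.{3}\s+)?(\/[^:]*|[a-zA-Z]:\\[^:]*):(\d+):\s+(WARNING|ERROR):\s+(.*)$"
)

# Hypothetical log line of the kind Sphinx prints during a build.
sample = "/home/user/ROCm/docs/about/release-notes.md:42: WARNING: undefined label: 'rocm-install'"

match = SPHINX_WARNING.match(sample)
if match:
    file, line, severity, message = match.groups()
    print(file, line, severity, message, sep=" | ")
```

A warning without a line number (for example, one reported against a whole file) leaves the `(\d+)` group empty, which is exactly the case the second, line-less matcher exists to catch.
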
4. Configure the Python virtual environment (`venv`).

   From the Command Palette, run `Python: Create Environment`. Select the `venv` environment and
   `docs/sphinx/requirements.txt`.

5. Build the docs.

   Launch the default build task using one of the following options:

   * A hotkey (the default is `Ctrl+Shift+B`)
   * Running `Tasks: Run Build Task` from the Command Palette

6. Open the live preview.

   Navigate to the site output within VS Code: right-click on `.vscode/build/html/index.html` and
   select `Open with Live Server`. The contents should update on every rebuild without having to
   refresh the browser.

@@ -1,77 +0,0 @@
<head>
  <meta charset="UTF-8">
  <meta name="description" content="Contributing to ROCm">
  <meta name="keywords" content="ROCm, contributing, contribute, maintainer, contributor">
</head>

# Contributing to the ROCm documentation

The ROCm documentation, like all of ROCm, is open source and available on GitHub. You can contribute to the ROCm documentation by forking the appropriate repository, making your changes, and opening a pull request.

To provide feedback on the ROCm documentation, including submitting an issue or suggesting a feature, see [Providing feedback about the ROCm documentation](./feedback.md).

## The ROCm repositories

The repositories for ROCm and all ROCm components are available on GitHub.

| Module | Documentation location |
| --- | --- |
| ROCm framework | [https://github.com/ROCm/ROCm/tree/develop/docs](https://github.com/ROCm/ROCm/tree/develop/docs) |
| ROCm installation for Linux | [https://github.com/ROCm/rocm-install-on-linux/tree/develop/docs](https://github.com/ROCm/rocm-install-on-linux/tree/develop/docs) |
| ROCm HIP SDK installation for Windows | [https://github.com/ROCm/rocm-install-on-windows/tree/develop/docs](https://github.com/ROCm/rocm-install-on-windows/tree/develop/docs) |

Individual components have their own repositories, each with its own documentation in a `docs` folder.

The sub-folders within the `docs` folders across ROCm are typically structured as follows:

| Sub-folder name | Documentation type |
|-------|----------|
| `install` | Installation instructions, build instructions, and prerequisites |
| `conceptual` | Important concepts |
| `how-to` | How to implement specific use cases |
| `tutorials` | Tutorials |
| `reference` | API references and other reference resources |

## Editing and adding to the documentation

ROCm documentation follows the [Google developer documentation style guide](https://developers.google.com/style/highlights).

Most topics in the ROCm documentation are written in [reStructuredText (rst)](https://www.sphinx-doc.org/en/master/usage/restructuredtext/index.html), with some topics written in Markdown. Use reStructuredText when adding new topics; use Markdown only if the topic you are editing is already written in Markdown.

To edit or add to the documentation:

1. Fork the repository you want to add to or edit.
2. Clone your fork locally.
3. Create a new local branch cut from the `develop` branch of the repository.
4. Make your changes to the documentation.

5. Optionally, build the documentation locally before creating a pull request by running the following commands from within the `docs` folder:

   ```bash
   pip3 install -r sphinx/requirements.txt # You only need to run this command once
   python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
   ```

   The output files are located in the `docs/_build` folder. Open `docs/_build/html/index.html` to view the documentation.

   For more information on the ROCm build tools, see [Documentation toolchain](toolchain.md).
6. Push your changes. A GitHub link is returned in the output of the `git push` command. Open this link in a browser to create the pull request.

   The documentation is built as part of the checks on the pull request, along with spell checking and linting. Scroll to the bottom of your pull request to view all the checks.

   Verify that the linting and spell checking have passed, and that the documentation was built successfully. New words or acronyms can be added to the [wordlist file](https://github.com/ROCm/rocm-docs-core/blob/develop/.wordlist.txt). The wordlist is subject to approval by the ROCm documentation team.

   The Read the Docs build of your pull request can be accessed by clicking the Details link next to the Read the Docs build check. Verify that your changes are in the build and look as expected.

   

   

Your pull request will be reviewed by a member of the ROCm documentation team.

See the [GitHub documentation](https://docs.github.com/en) for information on how to fork and clone a repository, and how to create and push a local branch.

```{important}
By creating a pull request (PR), you agree to allow your contribution to be licensed under the terms of the
LICENSE.txt file in the corresponding repository. Different repositories can use different licenses.
```

@@ -1,27 +0,0 @@
<head>
  <meta charset="UTF-8">
  <meta name="description" content="Providing feedback for ROCm documentation">
  <meta name="keywords" content="documentation, pull request, GitHub, AMD, ROCm">
</head>

# Providing feedback about the ROCm documentation

Feedback about the ROCm documentation is welcome. You can provide feedback either through GitHub Discussions or GitHub Issues.

## Participating in discussions through GitHub Discussions

You can ask questions, view announcements, suggest new features, and communicate with other members of the community through [GitHub Discussions](https://github.com/ROCm/ROCm/discussions).

## Submitting issues through GitHub Issues

You can submit issues through [GitHub Issues](https://github.com/ROCm/ROCm/issues).

When creating a new issue, follow these guidelines:

1. Always search to see if the same issue already exists. If it does, upvote it, and comment to provide any additional details you might have.
2. If you find an issue that is similar to yours, log your issue, then add a comment that links to the similar issue and includes its issue number.
3. Always provide as much information as possible. This helps reduce the time required to reproduce the issue.

After creating your issue, check it regularly for any requests for additional information.

For information about contributing content to the ROCm documentation, see [Contributing to the ROCm documentation](./contributing.md).

@@ -1,46 +0,0 @@
<head>
  <meta charset="UTF-8">
  <meta name="description" content="ROCm documentation toolchain">
  <meta name="keywords" content="documentation, toolchain, Sphinx, Doxygen, MyST, AMD, ROCm">
</head>

# ROCm documentation toolchain

The ROCm documentation relies on several open source toolchains and sites.

## rocm-docs-core

[rocm-docs-core](https://github.com/ROCm/rocm-docs-core) is an AMD-maintained
project that applies customizations for the ROCm documentation. It is the tool most ROCm repositories use as part of their documentation build pipeline, and it is available as a [pip package on PyPI](https://pypi.org/project/rocm-docs-core/). A short configuration sketch follows.

See the user and developer guides for rocm-docs-core in the
{doc}`rocm-docs-core documentation<rocm-docs-core:index>`.

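As a minimal, illustrative sketch (mirroring settings that appear in the `docs/conf.py` changes earlier in this compare), a repository typically enables rocm-docs-core from its Sphinx `conf.py`. The option values below are examples, not a prescribed configuration:

```python
# Sketch of a Sphinx conf.py using rocm-docs-core; values are illustrative.
project = "ROCm Documentation"
author = "Advanced Micro Devices, Inc."

extensions = ["rocm_docs"]            # registers the rocm-docs-core Sphinx extension
html_theme = "rocm_docs_theme"        # theme shipped with rocm-docs-core
html_theme_options = {"flavor": "generic"}
external_toc_path = "./sphinx/_toc.yml"
```
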
## Sphinx

[Sphinx](https://www.sphinx-doc.org/en/master/) is a documentation generator originally created for Python projects. It is now widely used in the open source community.

### Sphinx External ToC

[Sphinx External ToC](https://sphinx-external-toc.readthedocs.io/en/latest/intro.html) is a Sphinx extension used for ROCm documentation navigation. It generates the left-hand navigation menu
from a YAML file (`_toc.yml.in`) that contains the table of contents.

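In `conf.py`, the extension is pointed at the generated ToC file. A minimal sketch, matching the `external_toc_path` setting used by the ROCm docs (other projects may use a different path):

```python
# Sketch: wiring the external table of contents into Sphinx.
# _toc.yml is generated from _toc.yml.in during the rocm-docs-core build.
external_toc_path = "./sphinx/_toc.yml"

# When using sphinx-external-toc directly (without rocm-docs-core),
# the extension itself must also be listed:
# extensions = ["sphinx_external_toc"]
```
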
### Sphinx-book-theme

[Sphinx-book-theme](https://sphinx-book-theme.readthedocs.io/en/latest/) is a Sphinx theme that defines the base appearance for ROCm documentation. ROCm documentation applies some customization, such as a custom header and footer, on top of the Sphinx Book Theme.

### Sphinx Design

[Sphinx design](https://sphinx-design.readthedocs.io/en/latest/index.html) is a Sphinx extension that adds design functionality. ROCm documentation uses Sphinx Design for grids, cards, and synchronized tabs.

## Doxygen

[Doxygen](https://www.doxygen.nl/) is a documentation generator that extracts information from in-code comments. It is used for API documentation.

## Breathe

[Breathe](https://www.breathe-doc.org/) is a Sphinx plugin for integrating Doxygen content.

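As a rough, hypothetical sketch of how a component repository might expose Doxygen output to Sphinx through Breathe (the project name and XML path below are placeholders, not taken from any ROCm repository):

```python
# Sketch: exposing Doxygen XML output to Sphinx via Breathe.
extensions = ["breathe"]

# Map a Breathe project name to the directory holding Doxygen's XML output.
breathe_projects = {"my_component": "../doxygen/xml"}  # placeholder name and path
breathe_default_project = "my_component"
```
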
## Read the Docs

[Read the Docs](https://docs.readthedocs.io/en/stable/) is the service that builds and hosts the HTML version of the ROCm documentation.
BIN
docs/data/comfyui/chroma1-radiance-tti-card.png
Normal file
|
After Width: | Height: | Size: 167 KiB |
BIN
docs/data/comfyui/chroma1-radiance-tti-missing-models.png
Normal file
|
After Width: | Height: | Size: 47 KiB |
BIN
docs/data/comfyui/comfyui-main.png
Normal file
|
After Width: | Height: | Size: 778 KiB |
BIN
docs/data/comfyui/sd3_5-missing-models.png
Normal file
|
After Width: | Height: | Size: 39 KiB |
BIN
docs/data/comfyui/sd3_5-simple-card.png
Normal file
|
After Width: | Height: | Size: 171 KiB |
@@ -0,0 +1,188 @@
|
||||
dockers:
|
||||
- pull_tag: rocm/vllm:rocm6.4.1_vllm_0.10.1_20250909
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.1_20250909/images/sha256-1113268572e26d59b205792047bea0e61e018e79aeadceba118b7bf23cb3715c
|
||||
components:
|
||||
ROCm: 6.4.1
|
||||
vLLM: 0.10.1 (0.10.1rc2.dev409+g0b6bf6691.rocm641)
|
||||
PyTorch: 2.7.0+gitf717b2a
|
||||
hipBLASLt: 0.15
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: pyt_vllm_llama-3.1-8b
|
||||
model_repo: meta-llama/Llama-3.1-8B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_seq_len_to_capture: 131072
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: pyt_vllm_llama-3.1-70b
|
||||
model_repo: meta-llama/Llama-3.1-70B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_seq_len_to_capture: 131072
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 405B
|
||||
mad_tag: pyt_vllm_llama-3.1-405b
|
||||
model_repo: meta-llama/Llama-3.1-405B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_seq_len_to_capture: 131072
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 2 70B
|
||||
mad_tag: pyt_vllm_llama-2-70b
|
||||
model_repo: meta-llama/Llama-2-70b-chat-hf
|
||||
url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_seq_len_to_capture: 4096
|
||||
max_num_batched_tokens: 4096
|
||||
max_model_len: 4096
|
||||
- model: Llama 3.1 8B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-8b_fp8
|
||||
model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_seq_len_to_capture: 131072
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 70B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-70b_fp8
|
||||
model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_seq_len_to_capture: 131072
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 405B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-405b_fp8
|
||||
model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_seq_len_to_capture: 131072
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- group: Mistral AI
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral MoE 8x7B
|
||||
mad_tag: pyt_vllm_mixtral-8x7b
|
||||
model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_seq_len_to_capture: 32768
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- model: Mixtral MoE 8x22B
|
||||
mad_tag: pyt_vllm_mixtral-8x22b
|
||||
model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_seq_len_to_capture: 65536
|
||||
max_num_batched_tokens: 65536
|
||||
max_model_len: 8192
|
||||
- model: Mixtral MoE 8x7B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x7b_fp8
|
||||
model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_seq_len_to_capture: 32768
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- model: Mixtral MoE 8x22B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x22b_fp8
|
||||
model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_seq_len_to_capture: 65536
|
||||
max_num_batched_tokens: 65536
|
||||
max_model_len: 8192
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: QwQ-32B
|
||||
mad_tag: pyt_vllm_qwq-32b
|
||||
model_repo: Qwen/QwQ-32B
|
||||
url: https://huggingface.co/Qwen/QwQ-32B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_seq_len_to_capture: 131072
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 30B A3B
|
||||
mad_tag: pyt_vllm_qwen3-30b-a3b
|
||||
model_repo: Qwen/Qwen3-30B-A3B
|
||||
url: https://huggingface.co/Qwen/Qwen3-30B-A3B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_seq_len_to_capture: 32768
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- group: Microsoft Phi
|
||||
tag: phi
|
||||
models:
|
||||
- model: Phi-4
|
||||
mad_tag: pyt_vllm_phi-4
|
||||
model_repo: microsoft/phi-4
|
||||
url: https://huggingface.co/microsoft/phi-4
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_seq_len_to_capture: 16384
|
||||
max_num_batched_tokens: 16384
|
||||
max_model_len: 8192
|
||||
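The entries in this data file are rendered into the vLLM benchmark documentation pages. As a rough illustration of how a model's `config` block maps onto vLLM server flags, here is a minimal sketch that reads the file and prints a launch command per model. The file path, the assumption that `model_groups` is a top-level key, and the exact flag mapping are illustrative only, not part of the documented workflow.

```python
import yaml  # requires PyYAML

# Hypothetical path to this data file within the docs tree.
with open("docs/data/how-to/rocm-for-ai/inference/vllm-benchmark.yaml") as f:
    data = yaml.safe_load(f)

# Assumes model_groups sits at the top level of the file, alongside dockers.
for group in data.get("model_groups", []):
    for m in group["models"]:
        cfg = m.get("config", {})
        # Assumed mapping of config keys onto common vLLM CLI flags.
        cmd = (
            f"vllm serve {m.get('model_repo', m['model'])}"
            f" --tensor-parallel-size {cfg.get('tp', 1)}"
            f" --dtype {cfg.get('dtype', 'auto')}"
            f" --kv-cache-dtype {cfg.get('kv_cache_dtype', 'auto')}"
            f" --max-model-len {cfg.get('max_model_len', 8192)}"
            f" --max-num-batched-tokens {cfg.get('max_num_batched_tokens', 8192)}"
        )
        print(m["model"], "->", cmd)
```
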
@@ -0,0 +1,32 @@
|
||||
dockers:
|
||||
- pull_tag: lmsysorg/sglang:v0.5.2rc1-rocm700-mi30x
|
||||
docker_hub_url: https://hub.docker.com/layers/lmsysorg/sglang/v0.5.2rc1-rocm700-mi30x/images/sha256-10c4ee502ddba44dd8c13325e6e03868bfe7f43d23d0a44780a8ee8b393f4729
|
||||
components:
|
||||
ROCm: 7.0.0
|
||||
SGLang: v0.5.2rc1
|
||||
pytorch-triton-rocm: 3.4.0+rocm7.0.0.gitf9e5bf54
|
||||
model_groups:
|
||||
- group: Dense models
|
||||
tag: dense-models
|
||||
models:
|
||||
- model: Llama 3.1 8B Instruct
|
||||
model_repo: Llama-3.1-8B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
|
||||
- model: Llama 3.1 405B FP8 KV
|
||||
model_repo: Llama-3.1-405B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
- model: Llama 3.3 70B FP8 KV
|
||||
model_repo: amd-Llama-3.3-70B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.3-70B-Instruct-FP8-KV
|
||||
- model: Qwen3 32B
|
||||
model_repo: Qwen3-32B
|
||||
url: https://huggingface.co/Qwen/Qwen3-32B
|
||||
- group: Small experts models
|
||||
tag: small-experts-models
|
||||
models:
|
||||
- model: DeepSeek V3
|
||||
model_repo: DeepSeek-V3
|
||||
url: https://huggingface.co/deepseek-ai/DeepSeek-V3
|
||||
- model: Mixtral 8x7B v0.1
|
||||
model_repo: Mixtral-8x7B-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x7B-v0.1
|
||||
@@ -1,188 +1,316 @@
|
||||
dockers:
|
||||
- pull_tag: rocm/vllm:rocm6.4.1_vllm_0.10.1_20250909
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.1_20250909/images/sha256-1113268572e26d59b205792047bea0e61e018e79aeadceba118b7bf23cb3715c
|
||||
- pull_tag: rocm/vllm:rocm7.0.0_vllm_0.10.2_20251006
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm7.0.0_vllm_0.10.2_20251006/images/sha256-94fd001964e1cf55c3224a445b1fb5be31a7dac302315255db8422d813edd7f5
|
||||
components:
|
||||
ROCm: 6.4.1
|
||||
vLLM: 0.10.1 (0.10.1rc2.dev409+g0b6bf6691.rocm641)
|
||||
PyTorch: 2.7.0+gitf717b2a
|
||||
hipBLASLt: 0.15
|
||||
ROCm: 7.0.0
|
||||
vLLM: 0.10.2 (0.11.0rc2.dev160+g790d22168.rocm700)
|
||||
PyTorch: 2.9.0a0+git1c57644
|
||||
hipBLASLt: 1.0.0
|
||||
dockerfile:
|
||||
commit: 790d22168820507f3105fef29596549378cfe399
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: pyt_vllm_llama-3.1-8b
|
||||
model_repo: meta-llama/Llama-3.1-8B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_seq_len_to_capture: 131072
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: pyt_vllm_llama-3.1-70b
|
||||
model_repo: meta-llama/Llama-3.1-70B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_seq_len_to_capture: 131072
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 405B
|
||||
mad_tag: pyt_vllm_llama-3.1-405b
|
||||
model_repo: meta-llama/Llama-3.1-405B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_seq_len_to_capture: 131072
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 2 70B
|
||||
mad_tag: pyt_vllm_llama-2-70b
|
||||
model_repo: meta-llama/Llama-2-70b-chat-hf
|
||||
url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_seq_len_to_capture: 4096
|
||||
max_num_batched_tokens: 4096
|
||||
max_model_len: 4096
|
||||
- model: Llama 3.1 8B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-8b_fp8
|
||||
model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_seq_len_to_capture: 131072
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 70B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-70b_fp8
|
||||
model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_seq_len_to_capture: 131072
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 405B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-405b_fp8
|
||||
model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_seq_len_to_capture: 131072
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 2 70B
|
||||
mad_tag: pyt_vllm_llama-2-70b
|
||||
model_repo: meta-llama/Llama-2-70b-chat-hf
|
||||
url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 4096
|
||||
max_model_len: 4096
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: pyt_vllm_llama-3.1-8b
|
||||
model_repo: meta-llama/Llama-3.1-8B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 8B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-8b_fp8
|
||||
model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 405B
|
||||
mad_tag: pyt_vllm_llama-3.1-405b
|
||||
model_repo: meta-llama/Llama-3.1-405B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 405B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-405b_fp8
|
||||
model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 405B MXFP4
|
||||
mad_tag: pyt_vllm_llama-3.1-405b_fp4
|
||||
model_repo: amd/Llama-3.1-405B-Instruct-MXFP4-Preview
|
||||
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-MXFP4-Preview
|
||||
precision: float4
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.3 70B
|
||||
mad_tag: pyt_vllm_llama-3.3-70b
|
||||
model_repo: meta-llama/Llama-3.3-70B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.3 70B FP8
|
||||
mad_tag: pyt_vllm_llama-3.3-70b_fp8
|
||||
model_repo: amd/Llama-3.3-70B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.3-70B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.3 70B MXFP4
|
||||
mad_tag: pyt_vllm_llama-3.3-70b_fp4
|
||||
model_repo: amd/Llama-3.3-70B-Instruct-MXFP4-Preview
|
||||
url: https://huggingface.co/amd/Llama-3.3-70B-Instruct-MXFP4-Preview
|
||||
precision: float4
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 4 Scout 17Bx16E
|
||||
mad_tag: pyt_vllm_llama-4-scout-17b-16e
|
||||
model_repo: meta-llama/Llama-4-Scout-17B-16E-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- model: Llama 4 Maverick 17Bx128E
|
||||
mad_tag: pyt_vllm_llama-4-maverick-17b-128e
|
||||
model_repo: meta-llama/Llama-4-Maverick-17B-128E-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- model: Llama 4 Maverick 17Bx128E FP8
|
||||
mad_tag: pyt_vllm_llama-4-maverick-17b-128e_fp8
|
||||
model_repo: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
|
||||
url: https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek R1 0528 FP8
|
||||
mad_tag: pyt_vllm_deepseek-r1
|
||||
model_repo: deepseek-ai/DeepSeek-R1-0528
|
||||
url: https://huggingface.co/deepseek-ai/DeepSeek-R1-0528
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_seqs: 1024
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- group: OpenAI GPT OSS
|
||||
tag: gpt-oss
|
||||
models:
|
||||
- model: GPT OSS 20B
|
||||
mad_tag: pyt_vllm_gpt-oss-20b
|
||||
model_repo: openai/gpt-oss-20b
|
||||
url: https://huggingface.co/openai/gpt-oss-20b
|
||||
precision: bfloat16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 8192
|
||||
max_model_len: 8192
|
||||
- model: GPT OSS 120B
|
||||
mad_tag: pyt_vllm_gpt-oss-120b
|
||||
model_repo: openai/gpt-oss-120b
|
||||
url: https://huggingface.co/openai/gpt-oss-120b
|
||||
precision: bfloat16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 8192
|
||||
max_model_len: 8192
|
||||
- group: Mistral AI
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral MoE 8x7B
|
||||
mad_tag: pyt_vllm_mixtral-8x7b
|
||||
model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_seq_len_to_capture: 32768
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- model: Mixtral MoE 8x22B
|
||||
mad_tag: pyt_vllm_mixtral-8x22b
|
||||
model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_seq_len_to_capture: 65536
|
||||
max_num_batched_tokens: 65536
|
||||
max_model_len: 8192
|
||||
- model: Mixtral MoE 8x7B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x7b_fp8
|
||||
model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_seq_len_to_capture: 32768
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- model: Mixtral MoE 8x22B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x22b_fp8
|
||||
model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_seq_len_to_capture: 65536
|
||||
max_num_batched_tokens: 65536
|
||||
max_model_len: 8192
|
||||
- model: Mixtral MoE 8x7B
|
||||
mad_tag: pyt_vllm_mixtral-8x7b
|
||||
model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- model: Mixtral MoE 8x7B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x7b_fp8
|
||||
model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- model: Mixtral MoE 8x22B
|
||||
mad_tag: pyt_vllm_mixtral-8x22b
|
||||
model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 65536
|
||||
max_model_len: 8192
|
||||
- model: Mixtral MoE 8x22B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x22b_fp8
|
||||
model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 65536
|
||||
max_model_len: 8192
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: QwQ-32B
|
||||
mad_tag: pyt_vllm_qwq-32b
|
||||
model_repo: Qwen/QwQ-32B
|
||||
url: https://huggingface.co/Qwen/QwQ-32B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_seq_len_to_capture: 131072
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 30B A3B
|
||||
mad_tag: pyt_vllm_qwen3-30b-a3b
|
||||
model_repo: Qwen/Qwen3-30B-A3B
|
||||
url: https://huggingface.co/Qwen/Qwen3-30B-A3B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_seq_len_to_capture: 32768
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 8B
|
||||
mad_tag: pyt_vllm_qwen3-8b
|
||||
model_repo: Qwen/Qwen3-8B
|
||||
url: https://huggingface.co/Qwen/Qwen3-8B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 32B
|
||||
mad_tag: pyt_vllm_qwen3-32b
|
||||
model_repo: Qwen/Qwen3-32b
|
||||
url: https://huggingface.co/Qwen/Qwen3-32B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 30B A3B
|
||||
mad_tag: pyt_vllm_qwen3-30b-a3b
|
||||
model_repo: Qwen/Qwen3-30B-A3B
|
||||
url: https://huggingface.co/Qwen/Qwen3-30B-A3B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 30B A3B FP8
|
||||
mad_tag: pyt_vllm_qwen3-30b-a3b_fp8
|
||||
model_repo: Qwen/Qwen3-30B-A3B-FP8
|
||||
url: https://huggingface.co/Qwen/Qwen3-30B-A3B-FP8
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 235B A22B
|
||||
mad_tag: pyt_vllm_qwen3-235b-a22b
|
||||
model_repo: Qwen/Qwen3-235B-A22B
|
||||
url: https://huggingface.co/Qwen/Qwen3-235B-A22B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 235B A22B FP8
|
||||
mad_tag: pyt_vllm_qwen3-235b-a22b_fp8
|
||||
model_repo: Qwen/Qwen3-235B-A22B-FP8
|
||||
url: https://huggingface.co/Qwen/Qwen3-235B-A22B-FP8
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- group: Microsoft Phi
|
||||
tag: phi
|
||||
models:
|
||||
- model: Phi-4
|
||||
mad_tag: pyt_vllm_phi-4
|
||||
model_repo: microsoft/phi-4
|
||||
url: https://huggingface.co/microsoft/phi-4
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_seq_len_to_capture: 16384
|
||||
max_num_batched_tokens: 16384
|
||||
max_model_len: 8192
|
||||
- model: Phi-4
|
||||
mad_tag: pyt_vllm_phi-4
|
||||
model_repo: microsoft/phi-4
|
||||
url: https://huggingface.co/microsoft/phi-4
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 16384
|
||||
max_model_len: 8192
|
||||
|
||||
@@ -1,12 +1,4 @@
|
||||
dockers:
|
||||
- pull_tag: rocm/jax-training:maxtext-v25.7
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025
|
||||
components:
|
||||
ROCm: 6.4.1
|
||||
JAX: 0.5.0
|
||||
Python: 3.10.12
|
||||
Transformer Engine: 2.1.0+90d703dd
|
||||
hipBLASLt: 1.x.x
|
||||
- pull_tag: rocm/jax-training:maxtext-v25.7-jax060
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025
|
||||
components:
|
||||
@@ -15,6 +7,14 @@ dockers:
|
||||
Python: 3.10.12
|
||||
Transformer Engine: 2.1.0+90d703dd
|
||||
hipBLASLt: 1.1.0-499ece1c21
|
||||
- pull_tag: rocm/jax-training:maxtext-v25.7
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025
|
||||
components:
|
||||
ROCm: 6.4.1
|
||||
JAX: 0.5.0
|
||||
Python: 3.10.12
|
||||
Transformer Engine: 2.1.0+90d703dd
|
||||
hipBLASLt: 1.x.x
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
|
||||
@@ -1,13 +1,12 @@
|
||||
dockers:
|
||||
- pull_tag: rocm/megatron-lm:v25.7_py310
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a
|
||||
- pull_tag: rocm/megatron-lm:v25.8_py310
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.8_py310/images/sha256-50fc824361054e445e86d5d88d5f58817f61f8ec83ad4a7e43ea38bbc4a142c0
|
||||
components:
|
||||
ROCm: 6.4.2
|
||||
Primus: v0.1.0-rc1
|
||||
ROCm: 6.4.3
|
||||
PyTorch: 2.8.0a0+gitd06a406
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.1.0.dev0+ba586519
|
||||
hipBLASLt: 37ba1d36
|
||||
Transformer Engine: 2.2.0.dev0+54dd2bdc
|
||||
hipBLASLt: d1b517fc7a
|
||||
Triton: 3.3.0
|
||||
RCCL: 2.22.3
|
||||
model_groups:
|
||||
|
||||
@@ -0,0 +1,49 @@
|
||||
dockers:
|
||||
- pull_tag: rocm/megatron-lm:v25.7_py310
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a
|
||||
components:
|
||||
ROCm: 6.4.2
|
||||
Primus: v0.1.0-rc1
|
||||
PyTorch: 2.8.0a0+gitd06a406
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.1.0.dev0+ba586519
|
||||
hipBLASLt: 37ba1d36
|
||||
Triton: 3.3.0
|
||||
RCCL: 2.22.3
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 3.3 70B
|
||||
mad_tag: pyt_megatron_lm_train_llama-3.3-70b
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: pyt_megatron_lm_train_llama-3.1-8b
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: pyt_megatron_lm_train_llama-3.1-70b
|
||||
- model: Llama 3.1 70B (proxy)
|
||||
mad_tag: pyt_megatron_lm_train_llama-3.1-70b-proxy
|
||||
- model: Llama 2 7B
|
||||
mad_tag: pyt_megatron_lm_train_llama-2-7b
|
||||
- model: Llama 2 70B
|
||||
mad_tag: pyt_megatron_lm_train_llama-2-70b
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek-V3 (proxy)
|
||||
mad_tag: pyt_megatron_lm_train_deepseek-v3-proxy
|
||||
- model: DeepSeek-V2-Lite
|
||||
mad_tag: pyt_megatron_lm_train_deepseek-v2-lite-16b
|
||||
- group: Mistral AI
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral 8x7B
|
||||
mad_tag: pyt_megatron_lm_train_mixtral-8x7b
|
||||
- model: Mixtral 8x22B (proxy)
|
||||
mad_tag: pyt_megatron_lm_train_mixtral-8x22b-proxy
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: Qwen 2.5 7B
|
||||
mad_tag: pyt_megatron_lm_train_qwen2.5-7b
|
||||
- model: Qwen 2.5 72B
|
||||
mad_tag: pyt_megatron_lm_train_qwen2.5-72b
|
||||
@@ -0,0 +1,58 @@
|
||||
dockers:
|
||||
- pull_tag: rocm/megatron-lm:v25.7_py310
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a
|
||||
components:
|
||||
ROCm: 6.4.2
|
||||
Primus: v0.1.0-rc1
|
||||
PyTorch: 2.8.0a0+gitd06a406
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.1.0.dev0+ba586519
|
||||
hipBLASLt: 37ba1d36
|
||||
Triton: 3.3.0
|
||||
RCCL: 2.22.3
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 3.3 70B
|
||||
mad_tag: primus_pyt_megatron_lm_train_llama-3.3-70b
|
||||
config_name: llama3.3_70B-pretrain.yaml
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: primus_pyt_megatron_lm_train_llama-3.1-70b
|
||||
config_name: llama3.1_70B-pretrain.yaml
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: primus_pyt_megatron_lm_train_llama-3.1-8b
|
||||
config_name: llama3.1_8B-pretrain.yaml
|
||||
- model: Llama 2 7B
|
||||
mad_tag: primus_pyt_megatron_lm_train_llama-2-7b
|
||||
config_name: llama2_7B-pretrain.yaml
|
||||
- model: Llama 2 70B
|
||||
mad_tag: primus_pyt_megatron_lm_train_llama-2-70b
|
||||
config_name: llama2_70B-pretrain.yaml
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek-V3 (proxy)
|
||||
mad_tag: primus_pyt_megatron_lm_train_deepseek-v3-proxy
|
||||
config_name: deepseek_v3-pretrain.yaml
|
||||
- model: DeepSeek-V2-Lite
|
||||
mad_tag: primus_pyt_megatron_lm_train_deepseek-v2-lite-16b
|
||||
config_name: deepseek_v2_lite-pretrain.yaml
|
||||
- group: Mistral AI
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral 8x7B
|
||||
mad_tag: primus_pyt_megatron_lm_train_mixtral-8x7b
|
||||
config_name: mixtral_8x7B_v0.1-pretrain.yaml
|
||||
- model: Mixtral 8x22B (proxy)
|
||||
mad_tag: primus_pyt_megatron_lm_train_mixtral-8x22b-proxy
|
||||
config_name: mixtral_8x22B_v0.1-pretrain.yaml
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: Qwen 2.5 7B
|
||||
mad_tag: primus_pyt_megatron_lm_train_qwen2.5-7b
|
||||
config_name: primus_qwen2.5_7B-pretrain.yaml
|
||||
- model: Qwen 2.5 72B
|
||||
mad_tag: primus_pyt_megatron_lm_train_qwen2.5-72b
|
||||
config_name: qwen2.5_72B-pretrain.yaml
|
||||
@@ -0,0 +1,162 @@
|
||||
dockers:
|
||||
- pull_tag: rocm/pytorch-training:v25.7
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.7/images/sha256-cc6fd840ab89cb81d926fc29eca6d075aee9875a55a522675a4b9231c9a0a712
|
||||
components:
|
||||
ROCm: 6.4.2
|
||||
PyTorch: 2.8.0a0+gitd06a406
|
||||
Python: 3.10.18
|
||||
Transformer Engine: 2.2.0.dev0+94e53dd8
|
||||
Flash Attention: 3.0.0.post1
|
||||
hipBLASLt: 1.1.0-4b9a52edfc
|
||||
Triton: 3.3.0
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 4 Scout 17B-16E
|
||||
mad_tag: pyt_train_llama-4-scout-17b-16e
|
||||
model_repo: Llama-4-17B_16E
|
||||
url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Llama 3.3 70B
|
||||
mad_tag: pyt_train_llama-3.3-70b
|
||||
model_repo: Llama-3.3-70B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora, finetune_qlora]
|
||||
- model: Llama 3.2 1B
|
||||
mad_tag: pyt_train_llama-3.2-1b
|
||||
model_repo: Llama-3.2-1B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.2-1B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Llama 3.2 3B
|
||||
mad_tag: pyt_train_llama-3.2-3b
|
||||
model_repo: Llama-3.2-3B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.2-3B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Llama 3.2 Vision 11B
|
||||
mad_tag: pyt_train_llama-3.2-vision-11b
|
||||
model_repo: Llama-3.2-Vision-11B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw]
|
||||
- model: Llama 3.2 Vision 90B
|
||||
mad_tag: pyt_train_llama-3.2-vision-90b
|
||||
model_repo: Llama-3.2-Vision-90B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.2-90B-Vision
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw]
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: pyt_train_llama-3.1-8b
|
||||
model_repo: Llama-3.1-8B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B
|
||||
precision: BF16
|
||||
training_modes: [pretrain, finetune_fw, finetune_lora, HF_pretrain]
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: pyt_train_llama-3.1-70b
|
||||
model_repo: Llama-3.1-70B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
|
||||
precision: BF16
|
||||
training_modes: [pretrain, finetune_fw, finetune_lora]
|
||||
- model: Llama 3.1 405B
|
||||
mad_tag: pyt_train_llama-3.1-405b
|
||||
model_repo: Llama-3.1-405B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-405B
|
||||
precision: BF16
|
||||
training_modes: [finetune_qlora]
|
||||
- model: Llama 3 8B
|
||||
mad_tag: pyt_train_llama-3-8b
|
||||
model_repo: Llama-3-8B
|
||||
url: https://huggingface.co/meta-llama/Meta-Llama-3-8B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Llama 3 70B
|
||||
mad_tag: pyt_train_llama-3-70b
|
||||
model_repo: Llama-3-70B
|
||||
url: https://huggingface.co/meta-llama/Meta-Llama-3-70B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Llama 2 7B
|
||||
mad_tag: pyt_train_llama-2-7b
|
||||
model_repo: Llama-2-7B
|
||||
url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora, finetune_qlora]
|
||||
- model: Llama 2 13B
|
||||
mad_tag: pyt_train_llama-2-13b
|
||||
model_repo: Llama-2-13B
|
||||
url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Llama 2 70B
|
||||
mad_tag: pyt_train_llama-2-70b
|
||||
model_repo: Llama-2-70B
|
||||
url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
|
||||
precision: BF16
|
||||
training_modes: [finetune_lora, finetune_qlora]
|
||||
- group: OpenAI
|
||||
tag: openai
|
||||
models:
|
||||
- model: GPT OSS 20B
|
||||
mad_tag: pyt_train_gpt_oss_20b
|
||||
model_repo: GPT-OSS-20B
|
||||
url: https://huggingface.co/openai/gpt-oss-20b
|
||||
precision: BF16
|
||||
training_modes: [HF_finetune_lora]
|
||||
- model: GPT OSS 120B
|
||||
mad_tag: pyt_train_gpt_oss_120b
|
||||
model_repo: GPT-OSS-120B
|
||||
url: https://huggingface.co/openai/gpt-oss-120b
|
||||
precision: BF16
|
||||
training_modes: [HF_finetune_lora]
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: Qwen 3 8B
|
||||
mad_tag: pyt_train_qwen3-8b
|
||||
model_repo: Qwen3-8B
|
||||
url: https://huggingface.co/Qwen/Qwen3-8B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Qwen 3 32B
|
||||
mad_tag: pyt_train_qwen3-32b
|
||||
model_repo: Qwen3-32B
|
||||
url: https://huggingface.co/Qwen/Qwen3-32B
|
||||
precision: BF16
|
||||
training_modes: [finetune_lora]
|
||||
- model: Qwen 2.5 32B
|
||||
mad_tag: pyt_train_qwen2.5-32b
|
||||
model_repo: Qwen2.5-32B
|
||||
url: https://huggingface.co/Qwen/Qwen2.5-32B
|
||||
precision: BF16
|
||||
training_modes: [finetune_lora]
|
||||
- model: Qwen 2.5 72B
|
||||
mad_tag: pyt_train_qwen2.5-72b
|
||||
model_repo: Qwen2.5-72B
|
||||
url: https://huggingface.co/Qwen/Qwen2.5-72B
|
||||
precision: BF16
|
||||
training_modes: [finetune_lora]
|
||||
- model: Qwen 2 1.5B
|
||||
mad_tag: pyt_train_qwen2-1.5b
|
||||
model_repo: Qwen2-1.5B
|
||||
url: https://huggingface.co/Qwen/Qwen2-1.5B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Qwen 2 7B
|
||||
mad_tag: pyt_train_qwen2-7b
|
||||
model_repo: Qwen2-7B
|
||||
url: https://huggingface.co/Qwen/Qwen2-7B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- group: Flux
|
||||
tag: flux
|
||||
models:
|
||||
- model: FLUX.1-dev
|
||||
mad_tag: pyt_train_flux
|
||||
model_repo: Flux
|
||||
url: https://huggingface.co/black-forest-labs/FLUX.1-dev
|
||||
precision: BF16
|
||||
training_modes: [pretrain]
|
||||
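The training_modes values above determine which workflows (pretraining, full-weight fine-tuning, LoRA, QLoRA) each model entry is documented for. As a rough sketch of how this catalog could be consumed, the Python below filters models by mode; the file path, the PyYAML dependency, and the models_with_mode helper are assumptions for illustration only and are not part of this change.

import yaml  # PyYAML, assumed available in the docs build environment

# Placeholder path: this diff view does not show where the YAML file lands.
with open("docs/data/rocm-for-ai/pytorch-training-models.yaml") as f:
    catalog = yaml.safe_load(f)

def models_with_mode(catalog, mode):
    # Yield (group, model, mad_tag) for every model that lists `mode`.
    for group in catalog["model_groups"]:
        for model in group["models"]:
            if mode in model.get("training_modes", []):
                yield group["group"], model["model"], model["mad_tag"]

for group_name, model_name, mad_tag in models_with_mode(catalog, "finetune_lora"):
    print(f"{group_name}: {model_name} ({mad_tag})")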
@@ -1,13 +1,13 @@
|
||||
dockers:
|
||||
- pull_tag: rocm/megatron-lm:v25.7_py310
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a
|
||||
- pull_tag: rocm/megatron-lm:v25.8_py310
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.8_py310/images/sha256-50fc824361054e445e86d5d88d5f58817f61f8ec83ad4a7e43ea38bbc4a142c0
|
||||
components:
|
||||
ROCm: 6.4.2
|
||||
Primus: v0.1.0-rc1
|
||||
ROCm: 6.4.3
|
||||
Primus: 927a717
|
||||
PyTorch: 2.8.0a0+gitd06a406
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.1.0.dev0+ba586519
|
||||
hipBLASLt: 37ba1d36
|
||||
Transformer Engine: 2.2.0.dev0+54dd2bdc
|
||||
hipBLASLt: d1b517fc7a
|
||||
Triton: 3.3.0
|
||||
RCCL: 2.22.3
|
||||
model_groups:
|
||||
|
||||
@@ -0,0 +1,24 @@
|
||||
dockers:
|
||||
- pull_tag: rocm/pytorch-training:v25.8
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.8/images/sha256-5082ae01d73fec6972b0d84e5dad78c0926820dcf3c19f301d6c8eb892e573c5
|
||||
components:
|
||||
ROCm: 6.4.3
|
||||
PyTorch: 2.8.0a0+gitd06a406
|
||||
Python: 3.10.18
|
||||
Transformer Engine: 2.2.0.dev0+a1e66aae
|
||||
Flash Attention: 3.0.0.post1
|
||||
hipBLASLt: 1.1.0-d1b517fc7a
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: primus_pyt_train_llama-3.1-8b
|
||||
model_repo: Llama-3.1-8B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B
|
||||
precision: BF16
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: primus_pyt_train_llama-3.1-70b
|
||||
model_repo: Llama-3.1-70B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-70B
|
||||
precision: BF16
|
||||
@@ -1,14 +1,13 @@
|
||||
dockers:
|
||||
- pull_tag: rocm/pytorch-training:v25.7
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.7/images/sha256-cc6fd840ab89cb81d926fc29eca6d075aee9875a55a522675a4b9231c9a0a712
|
||||
- pull_tag: rocm/pytorch-training:v25.8
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.8/images/sha256-5082ae01d73fec6972b0d84e5dad78c0926820dcf3c19f301d6c8eb892e573c5
|
||||
components:
|
||||
ROCm: 6.4.2
|
||||
ROCm: 6.4.3
|
||||
PyTorch: 2.8.0a0+gitd06a406
|
||||
Python: 3.10.18
|
||||
Transformer Engine: 2.2.0.dev0+94e53dd8
|
||||
Transformer Engine: 2.2.0.dev0+a1e66aae
|
||||
Flash Attention: 3.0.0.post1
|
||||
hipBLASLt: 1.1.0-4b9a52edfc
|
||||
Triton: 3.3.0
|
||||
hipBLASLt: 1.1.0-d1b517fc7a
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
@@ -151,6 +150,15 @@ model_groups:
|
||||
url: https://huggingface.co/Qwen/Qwen2-7B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- group: Stable Diffusion
|
||||
tag: sd
|
||||
models:
|
||||
- model: Stable Diffusion XL
|
||||
mad_tag: pyt_huggingface_stable_diffusion_xl_2k_lora_finetuning
|
||||
model_repo: SDXL
|
||||
url: https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0
|
||||
precision: BF16
|
||||
training_modes: [finetune_lora]
|
||||
- group: Flux
|
||||
tag: flux
|
||||
models:
|
||||
@@ -160,3 +168,11 @@ model_groups:
|
||||
url: https://huggingface.co/black-forest-labs/FLUX.1-dev
|
||||
precision: BF16
|
||||
training_modes: [pretrain]
|
||||
- group: NCF
|
||||
tag: ncf
|
||||
models:
|
||||
- model: NCF
|
||||
mad_tag: pyt_ncf_training
|
||||
model_repo:
|
||||
url: https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Recommendation/NCF
|
||||
precision: FP32
|
||||
|
||||
55
docs/data/rocm-for-ai/xdit-inference-models.yaml
Normal file
@@ -0,0 +1,55 @@
|
||||
xdit_diffusion_inference:
|
||||
docker:
|
||||
pull_tag: rocm/pytorch-xdit:v25.10
|
||||
docker_hub_url: https://hub.docker.com/r/rocm/pytorch-xdit
|
||||
ROCm: 7.9.0
|
||||
components:
|
||||
TheRock: 7afbe45
|
||||
rccl: 9b04b2a
|
||||
composable_kernel: b7a806f
|
||||
rocm-libraries: f104555
|
||||
rocm-systems: 25922d0
|
||||
torch: 2.10.0a0+gite9c9017
|
||||
torchvision: 0.22.0a0+966da7e
|
||||
triton: 3.5.0+git52e49c12
|
||||
accelerate: 1.11.0.dev0
|
||||
aiter: 0.1.5.post4.dev20+ga25e55e79
|
||||
diffusers: 0.36.0.dev0
|
||||
xfuser: 0.4.4
|
||||
yunchang: 0.6.3.post1
|
||||
|
||||
model_groups:
|
||||
- group: Hunyuan Video
|
||||
tag: hunyuan
|
||||
models:
|
||||
- model: Hunyuan Video
|
||||
model_name: hunyuanvideo
|
||||
model_repo: tencent/HunyuanVideo
|
||||
revision: refs/pr/18
|
||||
url: https://huggingface.co/tencent/HunyuanVideo
|
||||
github: https://github.com/Tencent-Hunyuan/HunyuanVideo
|
||||
mad_tag: pyt_xdit_hunyuanvideo
|
||||
- group: Wan-AI
|
||||
tag: wan
|
||||
models:
|
||||
- model: Wan2.1
|
||||
model_name: wan2_1-i2v-14b-720p
|
||||
model_repo: Wan-AI/Wan2.1-I2V-14B-720P
|
||||
url: https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P
|
||||
github: https://github.com/Wan-Video/Wan2.1
|
||||
mad_tag: pyt_xdit_wan_2_1
|
||||
- model: Wan2.2
|
||||
model_name: wan2_2-i2v-a14b
|
||||
model_repo: Wan-AI/Wan2.2-I2V-A14B
|
||||
url: https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B
|
||||
github: https://github.com/Wan-Video/Wan2.2
|
||||
mad_tag: pyt_xdit_wan_2_2
|
||||
- group: FLUX
|
||||
tag: flux
|
||||
models:
|
||||
- model: FLUX.1
|
||||
model_name: FLUX.1-dev
|
||||
model_repo: black-forest-labs/FLUX.1-dev
|
||||
url: https://huggingface.co/black-forest-labs/FLUX.1-dev
|
||||
github: https://github.com/black-forest-labs/flux
|
||||
mad_tag: pyt_xdit_flux
|
||||
BIN
docs/data/rocm-ontology.png
Normal file
|
After: 108 KiB
BIN
docs/data/rocm-sdk-arch.png
Normal file
|
After: 46 KiB
25
docs/extension/rocm_docs_custom/icon.py
Normal file
@@ -0,0 +1,25 @@
|
||||
from docutils import nodes
|
||||
|
||||
|
||||
def icon_role(name, rawtext, text, lineno, inliner, options=None, content=None):
|
||||
"""
|
||||
Inline role for Font Awesome icons. See
|
||||
https://fontawesome.com/search?ip=brands&o=r to find available icons.
|
||||
|
||||
Example rST usage:
|
||||
|
||||
:icon:`fa-brands fa-redhat fa-lg`
|
||||
|
||||
Example MyST Markdown usage:
|
||||
|
||||
{icon}`fa-brands fa-redhat fa-lg`
|
||||
"""
|
||||
html = f'<i class="{text}"></i>'
|
||||
node = nodes.raw("", html, format="html")
|
||||
return [node], []
|
||||
|
||||
|
||||
def setup(app):
|
||||
app.add_role("icon", icon_role)
|
||||
|
||||
return {"version": "6.9", "parallel_read_safe": True}
|
||||
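The setup() hook above is what makes the module usable as a Sphinx extension. A minimal sketch of enabling it (together with the sibling selector and table modules shown later in this diff) from the documentation's conf.py follows; the sys.path handling and module names are assumptions based on the docs/extension/rocm_docs_custom/ layout, since the real conf.py is not part of this excerpt.

# docs/conf.py (sketch only)
import os
import sys

# Make docs/extension/ importable so rocm_docs_custom.* can be loaded.
sys.path.insert(0, os.path.abspath("extension"))

extensions = [
    "rocm_docs_custom.icon",      # registers the :icon: role defined above
    "rocm_docs_custom.selector",  # selector / selector-option / selected-content
    "rocm_docs_custom.table",     # injects table.css
]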
279
docs/extension/rocm_docs_custom/selector.py
Normal file
@@ -0,0 +1,279 @@
|
||||
from sphinx.util.docutils import SphinxDirective, directives, nodes
|
||||
from pathlib import Path
|
||||
from .utils import kv_to_data_attr, normalize_key
|
||||
import random
|
||||
import string
|
||||
|
||||
|
||||
class SelectorGroup(nodes.General, nodes.Element):
|
||||
"""
|
||||
A row within a selector container.
|
||||
|
||||
rST usage:
|
||||
|
||||
.. selector:: Heading
|
||||
:key:
|
||||
:show-when: os=ubuntu (list of key=value pairs separated by spaces)
|
||||
:heading-width: 4 (defaults to 3)
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def visit_html(translator, node):
|
||||
label = node["label"]
|
||||
key = node["key"]
|
||||
show_when_attr = kv_to_data_attr("show-when", node["show-when"])
|
||||
heading_width = node["heading-width"]
|
||||
icon = node["icon"]
|
||||
|
||||
icon_html = ""
|
||||
if icon:
|
||||
icon_html = f'<i class="rocm-docs-selector-icon {icon}"></i>'
|
||||
|
||||
translator.body.append(
|
||||
"<!-- start selector-group row -->"
|
||||
f"""
|
||||
<div id="{nodes.make_id(label)}" class="rocm-docs-selector-group row gx-0 pt-2"
|
||||
data-selector-key="{key}"
|
||||
{show_when_attr}
|
||||
role="radiogroup"
|
||||
aria-label="{label}"
|
||||
>
|
||||
<div class="col-{heading_width} me-1 px-2 rocm-docs-selector-group-heading">
|
||||
<span class="rocm-docs-selector-group-heading-text">{label}{icon_html}</span>
|
||||
</div>
|
||||
<div class="row col-{12 - heading_width} pe-0">
|
||||
""".strip()
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def depart_html(translator, _):
|
||||
translator.body.append(
|
||||
"""
|
||||
</div>
|
||||
</div>
|
||||
"""
|
||||
"<!-- end selector-group row -->"
|
||||
)
|
||||
|
||||
|
||||
class SelectorGroupDirective(SphinxDirective):
|
||||
required_arguments = 1  # group heading text
|
||||
final_argument_whitespace = True
|
||||
has_content = True
|
||||
option_spec = {
|
||||
"key": directives.unchanged,
|
||||
"show-when": directives.unchanged,
|
||||
"heading-width": directives.nonnegative_int,
|
||||
"icon": directives.unchanged,
|
||||
}
|
||||
|
||||
def run(self):
|
||||
label = self.arguments[0]
|
||||
node = SelectorGroup()
|
||||
node["label"] = label
|
||||
node["key"] = normalize_key(self.options.get("key", label))
|
||||
node["show-when"] = self.options.get("show-when", "")
|
||||
node["heading-width"] = self.options.get("heading-width", 3)
|
||||
node["icon"] = self.options.get("icon")
|
||||
|
||||
# Parse nested content
|
||||
self.state.nested_parse(self.content, self.content_offset, node)
|
||||
|
||||
# Find all SelectorOption descendants
|
||||
option_nodes = list(node.findall(SelectorOption))
|
||||
|
||||
if option_nodes:
|
||||
# Set the group key on all options
|
||||
for opt in option_nodes:
|
||||
opt["group_key"] = node["key"]
|
||||
|
||||
# Find all options marked as default
|
||||
default_options = [opt for opt in option_nodes if opt["default"]]
|
||||
|
||||
if default_options:
|
||||
# Multiple options marked :default: - only keep first as default
|
||||
for i, opt in enumerate(default_options):
|
||||
if i > 0:
|
||||
opt["default"] = False
|
||||
else:
|
||||
# No explicit default - make first option default
|
||||
option_nodes[0]["default"] = True
|
||||
|
||||
return [node]
|
||||
|
||||
|
||||
class SelectorOption(nodes.General, nodes.Element):
|
||||
"""
|
||||
A selectable tile within a selector group.
|
||||
|
||||
rST usage:
|
||||
|
||||
.. selector-option:: Label text
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def visit_html(translator, node):
|
||||
label = node["label"]
|
||||
value = node["value"]
|
||||
disable_when_attr = kv_to_data_attr("disable-when", node["disable-when"])
|
||||
default = node["default"]
|
||||
width = node["width"]
|
||||
group_key = node.get("group_key", "")
|
||||
|
||||
default_class = "rocm-docs-selector-option-default" if default else ""
|
||||
|
||||
translator.body.append(
|
||||
"<!-- start selector-option tile -->"
|
||||
f"""
|
||||
<div class="rocm-docs-selector-option {default_class} col-{width} px-2"
|
||||
data-selector-key="{group_key}"
|
||||
data-selector-value="{value}"
|
||||
{disable_when_attr}
|
||||
tabindex="0"
|
||||
role="radio"
|
||||
aria-checked="false"
|
||||
>
|
||||
<span>{label}</span>
|
||||
""".strip()
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def depart_html(translator, node):
|
||||
icon = node["icon"]
|
||||
if icon:
|
||||
translator.body.append(f'<i class="rocm-docs-selector-icon {icon}"></i>')
|
||||
translator.body.append("</div>" "<!-- end selector-option tile -->")
|
||||
|
||||
|
||||
class SelectorOptionDirective(SphinxDirective):
|
||||
required_arguments = 1 # text of tile
|
||||
final_argument_whitespace = True
|
||||
option_spec = {
|
||||
"value": directives.unchanged,
|
||||
"disable-when": directives.unchanged,
|
||||
"default": directives.flag,
|
||||
"width": directives.nonnegative_int,
|
||||
"icon": directives.unchanged,
|
||||
}
|
||||
has_content = True
|
||||
|
||||
def run(self):
|
||||
label = self.arguments[0]
|
||||
node = SelectorOption()
|
||||
node["label"] = label
|
||||
node["value"] = normalize_key(self.options.get("value", label))
|
||||
# node["show-when"] = self.options.get("show-when", "")
|
||||
node["disable-when"] = self.options.get("disable-when", "")
|
||||
node["default"] = self.options.get("default", False) is not False
|
||||
node["width"] = self.options.get("width", 6)
|
||||
node["icon"] = self.options.get("icon")
|
||||
|
||||
# Content replaces label if provided
|
||||
# if self.content:
|
||||
# self.state.nested_parse(self.content, self.content_offset, node)
|
||||
# else:
|
||||
# node += nodes.Text(label)
|
||||
return [node]
|
||||
|
||||
class SelectedContent(nodes.General, nodes.Element):
|
||||
"""
|
||||
A container to hold conditional content.
|
||||
|
||||
rST usage::
|
||||
|
||||
.. selected-content:: os=ubuntu
|
||||
:heading: Ubuntu Notes
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def visit_html(translator, node):
|
||||
show_when_attr = kv_to_data_attr("show-when", node["show-when"])
|
||||
classes = " ".join(node.get("class", []))
|
||||
heading = node.get("heading", "")
|
||||
heading_level = node.get("heading-level") or (SelectedContent._get_depth(node) + 1)
|
||||
heading_level = min(heading_level, 6)
|
||||
|
||||
heading_elem = ""
|
||||
if heading:
|
||||
# HACK to fix secondary sidebar observer
|
||||
suffix = "".join(random.choices(string.ascii_lowercase + string.digits, k=3))
|
||||
id_attr = nodes.make_id(f"{heading}-{suffix}")
|
||||
|
||||
heading_elem = (
|
||||
f'<h{heading_level} id="{id_attr}" class="rocm-docs-custom-heading">'
|
||||
f'{heading}<a class="headerlink" href="#{id_attr}" title="Link to this heading">#</a>'
|
||||
f'</h{heading_level}>'
|
||||
)
|
||||
|
||||
translator.body.append(
|
||||
f"""
|
||||
<!-- start selected-content -->
|
||||
<div class="rocm-docs-selected-content {classes}" {show_when_attr} aria-hidden="true">
|
||||
{heading_elem}
|
||||
""".strip()
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def depart_html(translator, _):
|
||||
translator.body.append("</div><!-- end selected-content -->")
|
||||
|
||||
@staticmethod
|
||||
def _get_depth(node):
|
||||
depth = 1
|
||||
parent = node.parent
|
||||
while parent is not None:
|
||||
if isinstance(parent, SelectedContent) and parent.get("heading"):
|
||||
depth += 1
|
||||
parent = getattr(parent, "parent", None)
|
||||
return depth
|
||||
|
||||
|
||||
class SelectedContentDirective(SphinxDirective):
|
||||
required_arguments = 1 # condition (e.g., os=ubuntu)
|
||||
final_argument_whitespace = True
|
||||
has_content = True
|
||||
option_spec = {
|
||||
"id": directives.unchanged,
|
||||
"class": directives.unchanged,
|
||||
"heading": directives.unchanged,
|
||||
"heading-level": directives.nonnegative_int,
|
||||
}
|
||||
|
||||
def run(self):
|
||||
node = SelectedContent()
|
||||
node["show-when"] = self.arguments[0]
|
||||
node["id"] = self.options.get("id", "")
|
||||
node["class"] = self.options.get("class", "")
|
||||
node["heading"] = self.options.get("heading", "") # empty string if none
|
||||
node["heading-level"] = self.options.get("heading-level", None)
|
||||
|
||||
# Parse nested content
|
||||
self.state.nested_parse(self.content, self.content_offset, node)
|
||||
return [node]
|
||||
|
||||
|
||||
def setup(app):
|
||||
app.add_node(
|
||||
SelectorGroup,
|
||||
html=(SelectorGroup.visit_html, SelectorGroup.depart_html),
|
||||
)
|
||||
app.add_node(
|
||||
SelectorOption,
|
||||
html=(SelectorOption.visit_html, SelectorOption.depart_html),
|
||||
)
|
||||
app.add_node(
|
||||
SelectedContent,
|
||||
html=(SelectedContent.visit_html, SelectedContent.depart_html),
|
||||
)
|
||||
|
||||
app.add_directive("selector", SelectorGroupDirective)
|
||||
app.add_directive("selector-option", SelectorOptionDirective)
|
||||
app.add_directive("selected-content", SelectedContentDirective)
|
||||
app.add_directive("selected", SelectedContentDirective)
|
||||
|
||||
static_assets_dir = Path(__file__).parent / "static"
|
||||
app.config.html_static_path.append(str(static_assets_dir))
|
||||
app.add_css_file("selector.css")
|
||||
app.add_js_file("selector.js", type="module", defer="defer")
|
||||
|
||||
return {"version": "1.0", "parallel_read_safe": True}
|
||||
103
docs/extension/rocm_docs_custom/static/selector.css
Normal file
@@ -0,0 +1,103 @@
|
||||
html {
|
||||
--rocm-docs-selector-border-radius: 2px;
|
||||
--rocm-docs-selector-accent-color: var(--pst-color-primary);
|
||||
--rocm-docs-selector-bg-color: var(--pst-color-on-background);
|
||||
--rocm-docs-selector-fg-color: var(--pst-color-primary-text);
|
||||
--rocm-docs-selector-group-heading-bg-color: var(--pst-color-surface);
|
||||
--rocm-docs-selector-option-hover-color: var(--pst-color-link-hover);
|
||||
--rocm-docs-selector-option-selected-color: var(--pst-color-primary);
|
||||
--rocm-docs-selector-tile-padding: 0.2rem;
|
||||
}
|
||||
|
||||
html[data-theme="light"] {
|
||||
--rocm-docs-selector-border-color: var(--pst-gray-400);
|
||||
--rocm-docs-selector-shadow: 0 1px 3px rgba(0, 0, 0, 0.1);
|
||||
--rocm-docs-selector-shadow-hover: 0 2px 8px rgba(0, 0, 0, 0.15);
|
||||
}
|
||||
|
||||
html[data-theme="dark"] {
|
||||
--rocm-docs-selector-border-color: var(--pst-color-border);
|
||||
--rocm-docs-selector-shadow: 0 1px 3px rgba(0, 0, 0, 0.3);
|
||||
--rocm-docs-selector-shadow-hover: 0 2px 8px rgba(0, 0, 0, 0.4);
|
||||
}
|
||||
|
||||
/* Selector container */
|
||||
.rocm-docs-selector-container {
|
||||
padding: 0 0 1rem 0;
|
||||
}
|
||||
|
||||
/* Selector group heading when one of its options is hovered */
|
||||
.rocm-docs-selector-group:has(.rocm-docs-selector-option:hover)
|
||||
.rocm-docs-selector-group-heading {
|
||||
border-right-color: var(--rocm-docs-selector-option-hover-color);
|
||||
}
|
||||
|
||||
/* Selector group heading box */
|
||||
.rocm-docs-selector-group-heading {
|
||||
background-color: var(--rocm-docs-selector-group-heading-bg-color);
|
||||
padding: var(--rocm-docs-selector-tile-padding);
|
||||
font-weight: 600;
|
||||
border-right: solid 3px var(--rocm-docs-selector-accent-color);
|
||||
border-radius: var(--rocm-docs-selector-border-radius);
|
||||
transition: border-right-color 0.25s ease;
|
||||
box-shadow: var(--rocm-docs-selector-shadow);
|
||||
}
|
||||
|
||||
/* Selector group heading text */
|
||||
.rocm-docs-selector-group-heading > span {
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
align-items: center;
|
||||
gap: 0.5rem;
|
||||
}
|
||||
|
||||
/* Selector option */
|
||||
.rocm-docs-selector-option {
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
align-items: center;
|
||||
gap: 0.5rem;
|
||||
background-color: var(--rocm-docs-selector-bg-color);
|
||||
padding: var(--rocm-docs-selector-tile-padding);
|
||||
border: solid 2px var(--rocm-docs-selector-border-color);
|
||||
cursor: pointer;
|
||||
transition: all 0.2s ease;
|
||||
border-radius: var(--rocm-docs-selector-border-radius);
|
||||
box-shadow: var(--rocm-docs-selector-shadow);
|
||||
}
|
||||
|
||||
/* Selector option when hovered */
|
||||
.rocm-docs-selector-option:hover:not(.rocm-docs-disabled) {
|
||||
background-color: var(--rocm-docs-selector-option-hover-color);
|
||||
color: var(--rocm-docs-selector-fg-color);
|
||||
transform: translateY(-2px);
|
||||
box-shadow: var(--rocm-docs-selector-shadow-hover);
|
||||
}
|
||||
|
||||
.rocm-docs-selector-option:focus:not(.rocm-docs-disabled) {
|
||||
z-index: 69;
|
||||
}
|
||||
|
||||
/* Selector option when selected */
|
||||
.rocm-docs-selector-option.rocm-docs-selected {
|
||||
background-color: var(--rocm-docs-selector-option-selected-color);
|
||||
color: var(--rocm-docs-selector-fg-color);
|
||||
}
|
||||
|
||||
/* Prevent hover effect on selected */
|
||||
.rocm-docs-selector-option.rocm-docs-selected:hover {
|
||||
transform: none;
|
||||
}
|
||||
|
||||
/* Selector option when disabled */
|
||||
.rocm-docs-selector-option.rocm-docs-disabled {
|
||||
background-color: var(--rocm-docs-selector-border-color);
|
||||
color: var(--rocm-docs-selector-fg-color);
|
||||
cursor: not-allowed;
|
||||
pointer-events: none;
|
||||
}
|
||||
|
||||
/* Hidden state */
|
||||
.rocm-docs-hidden {
|
||||
display: none;
|
||||
}
|
||||
305
docs/extension/rocm_docs_custom/static/selector.js
Normal file
@@ -0,0 +1,305 @@
|
||||
const READY_EVENT = "ROCmDocsSelectorsReady";
|
||||
const STATE_CHANGE_EVENT = "ROCmDocsSelectorStateChanged";
|
||||
|
||||
const GROUP_QUERY = ".rocm-docs-selector-group";
|
||||
const OPTION_QUERY = ".rocm-docs-selector-option";
|
||||
const COND_QUERY = "[data-show-when]";
|
||||
const TOC2_OPTIONS_LIST_QUERY = ".rocm-docs-selector-toc2-options";
|
||||
const TOC2_CONTENTS_LIST_QUERY = ".rocm-docs-selector-toc2-contents";
|
||||
const HEADING_QUERY = ".rocm-docs-selected-content h1,h2,h3,h4,h5,h6[id]";
|
||||
|
||||
const isDefaultOption = (elem) =>
|
||||
elem.classList.contains("rocm-docs-selector-option-default");
|
||||
|
||||
const DISABLED_CLASS = "rocm-docs-disabled";
|
||||
const disable = (elem) => {
|
||||
elem.classList.add(DISABLED_CLASS);
|
||||
elem.setAttribute("aria-disabled", "true");
|
||||
elem.setAttribute("tabindex", "-1");
|
||||
};
|
||||
// const enable = (elem) => {
|
||||
// elem.classList.remove(DISABLED_CLASS);
|
||||
// elem.setAttribute("aria-disabled", "false");
|
||||
// elem.setAttribute("tabindex", "0");
|
||||
// };
|
||||
|
||||
const HIDDEN_CLASS = "rocm-docs-hidden";
|
||||
const hide = (elem) => {
|
||||
elem.classList.add(HIDDEN_CLASS);
|
||||
elem.setAttribute("aria-hidden", "true");
|
||||
};
|
||||
const show = (elem) => {
|
||||
elem.classList.remove(HIDDEN_CLASS);
|
||||
elem.setAttribute("aria-hidden", "false");
|
||||
};
|
||||
|
||||
const SELECTED_CLASS = "rocm-docs-selected";
|
||||
const select = (elem) => {
|
||||
elem.classList.add(SELECTED_CLASS);
|
||||
elem.setAttribute("aria-checked", "true");
|
||||
};
|
||||
const deselect = (elem) => {
|
||||
elem.classList.remove(SELECTED_CLASS);
|
||||
elem.setAttribute("aria-checked", "false");
|
||||
};
|
||||
|
||||
const state = {};
|
||||
|
||||
function getState() {
|
||||
return { ...state };
|
||||
}
|
||||
|
||||
function setState(updates) {
|
||||
const previousState = getState();
|
||||
Object.assign(state, updates);
|
||||
|
||||
const event = new CustomEvent(STATE_CHANGE_EVENT, {
|
||||
detail: {
|
||||
previousState,
|
||||
currentState: getState(),
|
||||
changes: updates,
|
||||
},
|
||||
});
|
||||
document.dispatchEvent(event);
|
||||
}
|
||||
|
||||
function validateOptionElem(optionElem) {
|
||||
const key = optionElem.dataset.selectorKey;
|
||||
const value = optionElem.dataset.selectorValue;
|
||||
|
||||
const errors = [];
|
||||
if (!key) errors.push("Missing 'data-selector-key'");
|
||||
if (!value) errors.push("Missing 'data-selector-value'");
|
||||
|
||||
if (errors.length === 0) return;
|
||||
|
||||
const label = optionElem.textContent.trim() || "<unnamed option>";
|
||||
console.error(
|
||||
`[ROCmDocsSelector] Invalid selector option '${label}': ${
|
||||
errors.join(", ")
|
||||
}!`,
|
||||
);
|
||||
disable(optionElem);
|
||||
}
|
||||
|
||||
function handleOptionSelect(e) {
|
||||
const option = e.currentTarget;
|
||||
const parentGroup = option.closest(GROUP_QUERY);
|
||||
const siblingOptions = parentGroup.querySelectorAll(OPTION_QUERY);
|
||||
|
||||
siblingOptions.forEach((elem) => deselect(elem));
|
||||
select(option);
|
||||
|
||||
// Update global state
|
||||
const key = option.dataset.selectorKey;
|
||||
const value = option.dataset.selectorValue;
|
||||
if (key && value) setState({ [key]: value });
|
||||
|
||||
updateVisibility();
|
||||
}
|
||||
|
||||
function shouldBeShown(elem) {
|
||||
const conditionsData = elem.dataset.showWhen;
|
||||
if (!conditionsData) return true; // Default visible
|
||||
|
||||
try {
|
||||
const conditions = JSON.parse(conditionsData);
|
||||
// Ensure it's an object
|
||||
if (typeof conditions !== "object" || Array.isArray(conditions)) {
|
||||
console.warn(
|
||||
"[ROCmDocsSelector] Invalid 'show-when' format (must be key/value object):",
|
||||
conditionsData,
|
||||
);
|
||||
return true;
|
||||
}
|
||||
|
||||
for (const [key, value] of Object.entries(conditions)) {
|
||||
const currentValue = state[key];
|
||||
|
||||
if (currentValue === undefined) return false;
|
||||
|
||||
if (Array.isArray(value)) {
|
||||
if (!value.includes(currentValue)) return false;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (state[key] !== value) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
} catch (err) {
|
||||
console.error(
|
||||
"[ROCmDocsSelector] Couldn't parse 'show-when' conditions:",
|
||||
err,
|
||||
);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
function updateTOC2OptionsList() {
|
||||
const tocOptionsList = document.querySelector(TOC2_OPTIONS_LIST_QUERY);
|
||||
if (!tocOptionsList) return;
|
||||
|
||||
// Clear previous entries
|
||||
tocOptionsList.innerHTML = "";
|
||||
|
||||
// Get only visible selector groups
|
||||
const groups = Array.from(document.querySelectorAll(GROUP_QUERY)).filter(
|
||||
(g) => g.offsetParent !== null,
|
||||
);
|
||||
|
||||
if (groups.length === 0) {
|
||||
const li = document.createElement("li");
|
||||
li.className =
|
||||
"nav-item toc-entry toc-h3 rocm-docs-selector-toc2-item empty";
|
||||
const span = document.createElement("span");
|
||||
span.textContent = "(no visible selectors)";
|
||||
li.appendChild(span);
|
||||
tocOptionsList.appendChild(li);
|
||||
return;
|
||||
}
|
||||
|
||||
groups.forEach((group) => {
|
||||
// ✅ Find group heading span
|
||||
const headingSpan = group.querySelector(
|
||||
".rocm-docs-selector-group-heading-text",
|
||||
);
|
||||
const headingText = headingSpan
|
||||
? headingSpan.textContent.trim()
|
||||
: "(Unnamed Selector)";
|
||||
|
||||
// Find currently selected option
|
||||
const selectedOption = group.querySelector(`.${SELECTED_CLASS}`);
|
||||
let optionText = "(none selected)";
|
||||
if (selectedOption) {
|
||||
const clone = selectedOption.cloneNode(true);
|
||||
// Remove all <i> elements
|
||||
clone.querySelectorAll("i, svg").forEach((el) => el.remove());
|
||||
optionText = clone.innerHTML.trim();
|
||||
}
|
||||
|
||||
// Build list item
|
||||
const li = document.createElement("li");
|
||||
li.className = "nav-item toc-entry toc-h3 rocm-docs-selector-toc2-item";
|
||||
|
||||
const link = document.createElement("a");
|
||||
link.className = "nav-link";
|
||||
link.href = `#${group.id}`;
|
||||
link.innerHTML = `<strong>${headingText}</strong>: ${optionText}`;
|
||||
|
||||
li.appendChild(link);
|
||||
tocOptionsList.appendChild(li);
|
||||
});
|
||||
}
|
||||
|
||||
function updateTOC2ContentsList() {
|
||||
const tocOptionsList = document.querySelector(TOC2_OPTIONS_LIST_QUERY);
|
||||
const tocContentsList = document.querySelector(TOC2_CONTENTS_LIST_QUERY);
|
||||
if (!tocContentsList || !tocOptionsList) return;
|
||||
|
||||
const visibleHeaders = [...document.querySelectorAll(HEADING_QUERY)]
|
||||
.filter((h) => h.offsetParent !== null); // only visible headings
|
||||
|
||||
tocContentsList
|
||||
.querySelectorAll("li.toc-entry.rocm-docs-selector-toc2-item")
|
||||
.forEach((node) => node.remove());
|
||||
|
||||
if (visibleHeaders.length === 0) return;
|
||||
|
||||
let lastH2Li = null;
|
||||
|
||||
visibleHeaders.forEach((h) => {
|
||||
const level = parseInt(h.tagName.substring(1), 10);
|
||||
const li = document.createElement("li");
|
||||
li.className =
|
||||
`nav-item toc-entry toc-${h.tagName.toLowerCase()} rocm-docs-selector-toc2-item`;
|
||||
|
||||
const a = document.createElement("a");
|
||||
a.className = "reference internal nav-link";
|
||||
const section = h.closest("section");
|
||||
const fallbackId = section ? section.id : "";
|
||||
a.href = h.id ? `#${h.id}` : fallbackId ? `#${fallbackId}` : "#";
|
||||
a.textContent = h.cloneNode(true).childNodes[0].textContent.trim();
|
||||
li.appendChild(a);
|
||||
|
||||
// Nest logic: h3+ belong to last h2's inner list
|
||||
if (level === 2) {
|
||||
tocContentsList.appendChild(li);
|
||||
lastH2Li = li;
|
||||
} else if (level === 3 && lastH2Li) {
|
||||
// ensure nested UL exists
|
||||
let innerUl = lastH2Li.querySelector("ul");
|
||||
if (!innerUl) {
|
||||
innerUl = document.createElement("ul");
|
||||
innerUl.className = "nav section-nav flex-column";
|
||||
lastH2Li.appendChild(innerUl);
|
||||
}
|
||||
innerUl.appendChild(li);
|
||||
} else {
|
||||
tocContentsList.appendChild(li);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
function updateVisibility() {
|
||||
document.querySelectorAll(COND_QUERY).forEach((elem) => {
|
||||
if (shouldBeShown(elem)) {
|
||||
show(elem);
|
||||
} else {
|
||||
hide(elem);
|
||||
}
|
||||
});
|
||||
|
||||
updateTOC2OptionsList();
|
||||
updateTOC2ContentsList();
|
||||
}
|
||||
|
||||
function init() {
|
||||
const selectorOptions = document.querySelectorAll(OPTION_QUERY);
|
||||
|
||||
const initialState = {};
|
||||
|
||||
// Attach listeners and gather defaults
|
||||
selectorOptions.forEach((option) => {
|
||||
validateOptionElem(option);
|
||||
|
||||
option.addEventListener("click", handleOptionSelect);
|
||||
option.addEventListener("keydown", (e) => {
|
||||
if (e.key === "Enter" || e.key === " ") {
|
||||
e.preventDefault();
|
||||
handleOptionSelect(e);
|
||||
}
|
||||
});
|
||||
|
||||
if (isDefaultOption(option)) {
|
||||
select(option);
|
||||
const { selectorKey: key, selectorValue: value } = option.dataset;
|
||||
if (key && value) {
|
||||
initialState[key] = value;
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
setState(initialState);
|
||||
updateVisibility();
|
||||
|
||||
document.dispatchEvent(new CustomEvent(READY_EVENT));
|
||||
}
|
||||
|
||||
function domReady(callback) {
|
||||
if (document.readyState !== "loading") {
|
||||
callback();
|
||||
} else {
|
||||
document.addEventListener("DOMContentLoaded", callback, { once: true });
|
||||
}
|
||||
}
|
||||
|
||||
// window.rocmDocsSelector = {
|
||||
// setState,
|
||||
// getState,
|
||||
// };
|
||||
|
||||
// Initialize when DOM is ready
|
||||
domReady(init);
|
||||
35
docs/extension/rocm_docs_custom/static/table.css
Normal file
@@ -0,0 +1,35 @@
|
||||
html {
|
||||
--rocm-docs-table-bg-color: var(--pst-color-background);
|
||||
--rocm-docs-table-head-bg-color: var(--pst-color-surface);
|
||||
/* --rocm-docs-table-border-radius: 2rem; */
|
||||
}
|
||||
|
||||
html[data-theme="light"] {
|
||||
--rocm-docs-table-border-color: var(--pst-gray-400);
|
||||
--rocm-docs-table-shadow: 0 1px 3px rgba(0, 0, 0, 0.1);
|
||||
}
|
||||
|
||||
html[data-theme="dark"] {
|
||||
--rocm-docs-table-border-color: var(--pst-color-border);
|
||||
--rocm-docs-table-shadow: 0 1px 3px rgba(0, 0, 0, 0.3);
|
||||
}
|
||||
|
||||
.rocm-docs-table {
|
||||
/* border-radius: var(--rocm-docs-table-border-radius); */
|
||||
box-shadow: var(--rocm-docs-table-shadow);
|
||||
}
|
||||
|
||||
.rocm-docs-table thead th {
|
||||
position: sticky;
|
||||
top: 47px; /* 3rem (== 48px) causes a tiny gap */
|
||||
box-shadow: inset 0 -2px var(--rocm-docs-table-border-color);
|
||||
}
|
||||
|
||||
.rocm-docs-table th {
|
||||
background-color: var(--rocm-docs-table-head-bg-color);
|
||||
border: 2px solid var(--rocm-docs-table-border-color);
|
||||
}
|
||||
.rocm-docs-table td {
|
||||
background-color: var(--rocm-docs-table-bg-color);
|
||||
border: 2px solid var(--rocm-docs-table-border-color) !important;
|
||||
}
|
||||
9
docs/extension/rocm_docs_custom/table.py
Normal file
@@ -0,0 +1,9 @@
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
def setup(app):
|
||||
static_assets_dir = Path(__file__).parent / "static"
|
||||
app.config.html_static_path.append(str(static_assets_dir))
|
||||
app.add_css_file("table.css")
|
||||
|
||||
return {"version": "1.0", "parallel_read_safe": True}
|
||||
20
docs/extension/rocm_docs_custom/templates/selector-toc2.html
Normal file
@@ -0,0 +1,20 @@
|
||||
<div class="tocsection onthispage">
|
||||
<i class="fa-solid fa-filter"></i>
|
||||
Options
|
||||
</div>
|
||||
<nav class="page-toc rocm-docs-selector-toc2">
|
||||
<ul
|
||||
class="visible nav section-nav flex-column rocm-docs-selector-toc2-options"
|
||||
>
|
||||
</ul>
|
||||
</nav>
|
||||
<div class="page-toc tocsection onthispage">
|
||||
<i class="fa-solid fa-list"></i>
|
||||
Contents
|
||||
</div>
|
||||
<nav class="bd-toc-nav page-toc rocm-docs-selector-toc2">
|
||||
<ul
|
||||
class="visible nav section-nav flex-column rocm-docs-selector-toc2-contents"
|
||||
>
|
||||
</ul>
|
||||
</nav>
|
||||
30
docs/extension/rocm_docs_custom/utils.py
Normal file
@@ -0,0 +1,30 @@
|
||||
import json
|
||||
import html
|
||||
|
||||
def normalize_key(key):
|
||||
return key.replace(" ", "_").lower().strip()
|
||||
|
||||
def kv_to_data_attr(name, kv_str, separator="="):
|
||||
"""
|
||||
Convert key=value pairs delimited by spaces to stringified JSON.
|
||||
Format it as an HTML data attribute.
|
||||
|
||||
Args:
|
||||
name: Name of the data attribute; it will be prefixed with "data-".
|
||||
kv_str: String of space-separated pairs, for example "os=ubuntu device=mi300".
|
||||
|
||||
Example output:
|
||||
'data-show-when="{&quot;os&quot;: [&quot;ubuntu&quot;]}"'
|
||||
"""
|
||||
pairs = {}
|
||||
for token in kv_str.split():
|
||||
token = token.strip()
|
||||
if not token or separator not in token:
|
||||
continue
|
||||
|
||||
key, value = token.split(separator, 1)
|
||||
if key and value:
|
||||
pairs.setdefault(key, []).append(value.strip())
|
||||
|
||||
return f'data-{name}="{html.escape(json.dumps(pairs))}"' if pairs else ""
|
||||
|
||||
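As an illustration of what the helper above emits, the snippet below runs it on a made-up show-when condition. The import path is an assumption based on the file layout in this diff. Repeated keys are accumulated into lists and the JSON is HTML-escaped, which is the shape that selector.js later parses back out of the data-show-when attribute.

from rocm_docs_custom.utils import kv_to_data_attr  # import path assumed

attr = kv_to_data_attr("show-when", "os=ubuntu os=rhel")
# attr == 'data-show-when="{&quot;os&quot;: [&quot;ubuntu&quot;, &quot;rhel&quot;]}"'
print(attr)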
@@ -1,99 +0,0 @@
|
||||
.. meta::
|
||||
:description: Learn about BAR configuration in AMD GPUs and ways to troubleshoot physical addressing limit
|
||||
:keywords: BAR memory, MMIO, GPU memory, Physical Addressing Limit, AMD, ROCm
|
||||
|
||||
**************************************
|
||||
Troubleshoot BAR access limitation
|
||||
**************************************
|
||||
Direct Memory Access (DMA) to PCIe devices using Base Address Registers (BARs) can be restricted due to physical addressing limits. These restrictions can result in data access failures between the system components. Peer-to-peer (P2P) DMA is used to access resources such as registers and memory between devices. PCIe devices need memory-mapped input/output (MMIO) space for DMA, and these MMIO spaces are defined in the PCIe BARs.
|
||||
|
||||
These BARs are a set of 32-bit or 64-bit registers that are used to define the resources that PCIe devices provide. The CPU and other system devices also use these to access the resources of the PCIe devices. P2P DMA only works when one device can directly access the local BAR memory of another. If the memory address of a BAR memory exceeds the physical addressing limit of a device, the device will not be able to access that BAR. This could be the device's own BAR or the BAR of another device in the system.
|
||||
|
||||
If the BAR memory address exceeds the physical addressing limit of the device, the device cannot access that remote BAR.
|
||||
|
||||
To handle any BAR access issues that might occur, you need to be aware of the physical address limitations of the devices and understand the :ref:`BAR configuration of AMD GPUs <bar-configuration>`. This information is important when setting up additional MMIO apertures for PCIe devices in the system's physical address space.
|
||||
|
||||
Handling physical address limitation
|
||||
=============================================
|
||||
When a system boots, the system BIOS allocates the physical address space for the components in the system, including system memory and MMIO apertures. On modern 64-bit platforms, there are generally two or more MMIO apertures: one located below 4 GB of physical address space for 32-bit compatibility, and one or more above 4 GB for devices needing more space.
|
||||
|
||||
You can control the memory address of the high MMIO aperture through the system BIOS configuration options. This lets you place the additional MMIO space within the physical addressing limit of the devices and allows P2P DMA between them. For example, if a PCIe device is limited to 44 bits of physical addressing, ensure that the MMIO aperture is set below the 44-bit boundary in the system physical address space.
|
||||
|
||||
There are two ways to handle this:
|
||||
|
||||
* Ensure that the high MMIO aperture is within the physical addressing limits of the devices in the system. For example, if the devices have a 44-bit physical addressing limit, set the ``MMIO High Base`` and ``MMIO High size`` options in the BIOS such that the aperture is within the 44-bit address range, and ensure that the ``Above 4G Decoding`` option is Enabled.
|
||||
|
||||
* Enable the Input-Output Memory Management Unit (IOMMU). When the IOMMU is enabled in non-passthrough mode, it will create a virtual I/O address space for each device on the system. It also ensures that all virtual addresses created in that space are within the physical addressing limits of the device. For more information on IOMMU, see `Input-Output Memory Management Unit (IOMMU) <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/conceptual/iommu.html>`_.
|
||||
|
||||
.. _bar-configuration:
|
||||
|
||||
BAR configuration for AMD GPUs
|
||||
================================================
|
||||
|
||||
The following table shows how the BARs are configured for AMD GPUs.
|
||||
|
||||
|
||||
.. list-table::
|
||||
:widths: 25 25 50
|
||||
:header-rows: 1
|
||||
|
||||
* - BAR Type
|
||||
- Value
|
||||
- Description
|
||||
* - BAR0-1 registers
|
||||
- 64-bit, Prefetchable, GPU memory
|
||||
- 8 GB or 16 GB depending on the GPU. Set to less than 2^44 to support P2P access from other GPUs with a 44-bit physical address limit. Prefetchable memory speeds up reads in high-performance computing (HPC) workloads by fetching contiguous data from the same source before it is requested, in anticipation of future requests.
|
||||
* - BAR2-3 registers
|
||||
- 64-bit, Prefetchable, Doorbell
|
||||
- Set to less than 2^44 to support P2P access from other GPUs with a 44-bit physical address limit. As a Doorbell BAR, it indicates to the GPU that a new operation is in its queue to be processed.
|
||||
* - BAR4 register
|
||||
- Optional
|
||||
- Not a boot device
|
||||
* - BAR5 register
|
||||
- 32-bit, Non-prefetchable, MMIO
|
||||
- Is set to less than 4 GB.
|
||||
|
||||
Example of BAR usage on AMD GPUs
|
||||
-------------------------------------
|
||||
The following is an example configuration of BARs set by the system BIOS on GFX8 GPUs with a 40-bit physical addressing limit:
|
||||
|
||||
.. code:: shell
|
||||
|
||||
11:00.0 Display controller: Advanced Micro Devices, Inc. [AMD/ATI] Fiji [Radeon R9 FURY / NANO
|
||||
Series] (rev c1)
|
||||
|
||||
Subsystem: Advanced Micro Devices, Inc. [AMD/ATI] Device 0b35
|
||||
|
||||
Flags: bus master, fast devsel, latency 0, IRQ 119
|
||||
|
||||
Memory at bf40000000 (64-bit, prefetchable) [size=256M]
|
||||
|
||||
Memory at bf50000000 (64-bit, prefetchable) [size=2M]
|
||||
|
||||
I/O ports at 3000 [size=256]
|
||||
|
||||
Memory at c7400000 (32-bit, non-prefetchable) [size=256K]
|
||||
|
||||
Expansion ROM at c7440000 [disabled] [size=128K]
|
||||
|
||||
Details of the BARs configured in the example are:
|
||||
|
||||
**GPU Frame Buffer BAR:** ``Memory at bf40000000 (64-bit, prefetchable) [size=256M]``
|
||||
|
||||
The size of the BAR in the example is 256 MB. Generally, it will be the size of the GPU memory (typically 4 GB+). Depending upon the physical address limit and generation of AMD GPUs, the BAR can be set below 2^40, 2^44, or 2^48.
|
||||
|
||||
**Doorbell BAR:** ``Memory at bf50000000 (64-bit, prefetchable) [size=2M]``
|
||||
|
||||
The size of the BAR should typically be less than 10 MB for this generation of GPUs and has been set to 2 MB in the example. This BAR is placed below 2^40 to allow peer-to-peer access from other generations of AMD GPUs.
|
||||
|
||||
**I/O BAR:** ``I/O ports at 3000 [size=256]``
|
||||
|
||||
This is for legacy VGA and boot device support. Because these GPUs are not used as display (VGA) devices, this BAR is not a concern even if the system BIOS does not set it up.
|
||||
|
||||
**MMIO BAR:** ``Memory at c7400000 (32-bit, non-prefetchable) [size=256K]``
|
||||
|
||||
The AMD Driver requires this BAR to access the configuration registers. Because the remaining BAR register available is only one DWORD (32-bit), it is set below 4 GB. In the example, it is fixed at 256 KB.
|
||||
|
||||
**Expansion ROM:** ``Expansion ROM at c7440000 [disabled] [size=128K]``
|
||||
|
||||
This is required by the AMD Driver to access the GPU video-BIOS. In the example, it is fixed at 128 KB.
|
||||
@@ -1,24 +0,0 @@
|
||||
.. meta::
|
||||
:description: Build ROCm from source
|
||||
:keywords: build ROCm, source, ROCm source, ROCm, repo, make, makefile
|
||||
|
||||
|
||||
.. _building-rocm:
|
||||
|
||||
*************************************************************
|
||||
Build ROCm from source
|
||||
*************************************************************
|
||||
|
||||
ROCm is an open-source software stack that you can build from source code. The source code is available from `<https://github.com/ROCm/ROCm>`__.
|
||||
|
||||
|
||||
The general steps to build ROCm are:
|
||||
|
||||
#. Clone the ROCm source code
|
||||
#. Prepare the build environment
|
||||
#. Run the build command
|
||||
|
||||
Because the ROCm stack is constantly evolving, the most current instructions are stored with the source code in GitHub.
|
||||
For detailed build instructions, see `Getting and Building ROCm from Source <https://github.com/ROCm/ROCm?tab=readme-ov-file#getting-and-building-rocm-from-source>`_.
|
||||
|
||||
|
||||
@@ -1,147 +0,0 @@
|
||||
.. meta::
|
||||
:description: How to install deep learning frameworks for ROCm
|
||||
:keywords: deep learning, frameworks, ROCm, install, PyTorch, TensorFlow, JAX, MAGMA, DeepSpeed, ML, AI
|
||||
|
||||
**********************************
|
||||
Deep learning frameworks for ROCm
|
||||
**********************************
|
||||
|
||||
Deep learning frameworks provide environments for machine learning, training, fine-tuning, inference, and performance optimization.
|
||||
|
||||
ROCm offers a complete ecosystem for developing and running deep learning applications efficiently. It also provides ROCm-compatible versions of popular frameworks and libraries, such as PyTorch, TensorFlow, JAX, and others.
|
||||
|
||||
The AMD ROCm organization actively contributes to open-source development and collaborates closely with framework organizations. This collaboration ensures that framework-specific optimizations effectively leverage AMD GPUs and accelerators.
|
||||
|
||||
The table below summarizes information about ROCm-enabled deep learning frameworks. It includes details on ROCm compatibility and third-party tool support, installation steps and options, and links to GitHub resources. For a complete list of supported framework versions on ROCm, see the :doc:`Compatibility matrix <../compatibility/compatibility-matrix>` topic.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:widths: 5 3 6 3
|
||||
|
||||
* - Framework
|
||||
- Installation
|
||||
- Installation options
|
||||
- GitHub
|
||||
|
||||
* - `PyTorch <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/pytorch-compatibility.html>`__
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html"><i class="fas fa-link fa-lg"></i></a>
|
||||
-
|
||||
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-a-docker-image-with-pytorch-pre-installed>`__
|
||||
- `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-a-wheels-package>`__
|
||||
- `ROCm Base Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-the-pytorch-rocm-base-docker-image>`__
|
||||
- `Upstream Docker file <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-the-pytorch-upstream-dockerfile>`__
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://github.com/ROCm/pytorch"><i class="fab fa-github fa-lg"></i></a>
|
||||
|
||||
* - `TensorFlow <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/tensorflow-compatibility.html>`__
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html"><i class="fas fa-link fa-lg"></i></a>
|
||||
-
|
||||
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html#using-a-docker-image-with-tensorflow-pre-installed>`__
|
||||
- `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html#using-a-wheels-package>`__
|
||||
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://github.com/ROCm/tensorflow-upstream"><i class="fab fa-github fa-lg"></i></a>
|
||||
|
||||
* - `JAX <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/jax-compatibility.html>`__
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/jax-install.html"><i class="fas fa-link fa-lg"></i></a>
|
||||
-
|
||||
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/jax-install.html#using-a-prebuilt-docker-image>`__
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://github.com/ROCm/jax"><i class="fab fa-github fa-lg"></i></a>
|
||||
|
||||
* - `verl <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/verl-compatibility.html>`__
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/verl-install.html"><i class="fas fa-link fa-lg"></i></a>
|
||||
-
|
||||
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/verl-install.html#use-a-prebuilt-docker-image-with-verl-pre-installed>`__
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://github.com/ROCm/verl"><i class="fab fa-github fa-lg"></i></a>
|
||||
|
||||
* - `Stanford Megatron-LM <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.html>`__
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/stanford-megatron-lm-install.html"><i class="fas fa-link fa-lg"></i></a>
|
||||
-
|
||||
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/stanford-megatron-lm-install.html#use-a-prebuilt-docker-image-with-stanford-megatron-lm-pre-installed>`__
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://github.com/ROCm/Stanford-Megatron-LM"><i class="fab fa-github fa-lg"></i></a>
|
||||
|
||||
* - `DGL <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/dgl-compatibility.html>`__
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html"><i class="fas fa-link fa-lg"></i></a>
|
||||
-
|
||||
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html#use-a-prebuilt-docker-image-with-dgl-pre-installed>`__
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://github.com/ROCm/dgl"><i class="fab fa-github fa-lg"></i></a>
|
||||
|
||||
* - `Megablocks <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/megablocks-compatibility.html>`__
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/megablocks-install.html"><i class="fas fa-link fa-lg"></i></a>
|
||||
-
|
||||
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/megablocks-install.html#using-a-prebuilt-docker-image-with-megablocks-pre-installed>`__
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://github.com/ROCm/megablocks"><i class="fab fa-github fa-lg"></i></a>
|
||||
|
||||
* - `Taichi <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/taichi-compatibility.html>`__
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html"><i class="fas fa-link fa-lg"></i></a>
|
||||
-
|
||||
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html#use-a-prebuilt-docker-image-with-taichi-pre-installed>`__
|
||||
- `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html#use-a-wheels-package>`__
|
||||
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://github.com/ROCm/taichi"><i class="fab fa-github fa-lg"></i></a>
|
||||
|
||||
* - `Ray <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/ray-compatibility.html>`__
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/ray-install.html"><i class="fas fa-link fa-lg"></i></a>
|
||||
-
|
||||
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/ray-install.html#using-a-prebuilt-docker-image-with-ray-pre-installed>`__
|
||||
- `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/ray-install.html#install-ray-on-bare-metal-or-a-custom-container>`__
|
||||
- `ROCm Base Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/ray-install.html#build-your-own-docker-image>`__
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://github.com/ROCm/ray"><i class="fab fa-github fa-lg"></i></a>
|
||||
|
||||
* - `llama.cpp <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/llama-cpp-compatibility.html>`__
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/llama-cpp-install.html"><i class="fas fa-link fa-lg"></i></a>
|
||||
-
|
||||
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/llama-cpp-install.html#use-a-prebuilt-docker-image-with-llama-cpp-pre-installed>`__
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://github.com/ROCm/llama.cpp"><i class="fab fa-github fa-lg"></i></a>
|
||||
|
||||
Learn how to use your ROCm deep learning environment for training, fine-tuning, inference, and performance optimization
|
||||
through the following guides.
|
||||
|
||||
* :doc:`rocm-for-ai/index`
|
||||
|
||||
* :doc:`Use ROCm for training <rocm-for-ai/training/index>`
|
||||
|
||||
* :doc:`Use ROCm for fine-tuning LLMs <rocm-for-ai/fine-tuning/index>`
|
||||
|
||||
* :doc:`Use ROCm for AI inference <rocm-for-ai/inference/index>`
|
||||
|
||||
* :doc:`Use ROCm for AI inference optimization <rocm-for-ai/inference-optimization/index>`
|
||||
|
||||
@@ -1,27 +0,0 @@
|
||||
.. meta::
|
||||
:description: How to configure MI300X accelerators to fully leverage their capabilities and achieve optimal performance.
|
||||
:keywords: ROCm, AI, machine learning, MI300X, LLM, usage, tutorial, optimization, tuning
|
||||
|
||||
**************************************
|
||||
AMD Instinct MI300X performance guides
|
||||
**************************************
|
||||
|
||||
The following performance guides provide essential guidance on the necessary
|
||||
steps to properly `configure your system for AMD Instinct™ MI300X accelerators
|
||||
<https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
|
||||
They include detailed instructions on system settings and application
|
||||
:doc:`workload tuning </how-to/rocm-for-ai/inference-optimization/workload>` to
|
||||
help you leverage the maximum capabilities of these accelerators and achieve
|
||||
superior performance.
|
||||
|
||||
* `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`__
|
||||
covers essential system settings and system management practices to configure
|
||||
your AMD Instinct MI300X system for performance.
|
||||
|
||||
* :doc:`/how-to/rocm-for-ai/inference-optimization/workload` covers steps to
|
||||
optimize the performance of AMD Instinct MI300X series accelerators for HPC
|
||||
and deep learning operations.
|
||||
|
||||
* :doc:`/how-to/rocm-for-ai/inference/benchmark-docker/vllm` introduces a preconfigured
|
||||
environment for LLM inference, designed to help you test performance with
|
||||
popular models on AMD Instinct MI300X series accelerators.
|
||||
@@ -1,39 +0,0 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: Programming guide
|
||||
:keywords: HIP, programming guide, heterogeneous programming, AMD GPU programming
|
||||
|
||||
.. _hip-programming-guide:
|
||||
|
||||
********************************************************************************
|
||||
Programming guide
|
||||
********************************************************************************
|
||||
|
||||
ROCm provides a robust environment for heterogeneous programs running on CPUs
|
||||
and AMD GPUs. ROCm supports various programming languages and frameworks to
|
||||
help developers access the power of AMD GPUs. The natively supported programming
|
||||
languages are HIP (Heterogeneous-Compute Interface for Portability) and
|
||||
OpenCL, but HIP bindings are available for Python and Fortran.
|
||||
|
||||
HIP is an API based on C++ that provides a runtime and kernel language for GPU
|
||||
programming and is the essential ROCm programming language. HIP is also designed
|
||||
to be a marshalling language, allowing code written for NVIDIA CUDA to be
|
||||
easily ported to run on AMD GPUs. Developers can use HIP to write kernels that
|
||||
execute on AMD GPUs while maintaining compatibility with CUDA-based systems.
|
||||
|
||||
OpenCL (Open Computing Language) is an open standard for cross-platform,
|
||||
parallel programming of diverse processors. ROCm supports OpenCL for developers
|
||||
who want to use standard frameworks across different hardware platforms,
|
||||
including CPUs, GPUs, and other accelerators. For more information, see
|
||||
`OpenCL <https://www.khronos.org/opencl/>`_.
|
||||
|
||||
Python bindings can be found at https://github.com/ROCm/hip-python.
|
||||
Python is popular in AI and machine learning applications due to available
|
||||
frameworks like TensorFlow and PyTorch.
|
||||
|
||||
Fortran bindings can be found at https://github.com/ROCm/hipfort.
|
||||
It enables scientific, academic, and legacy applications, particularly those in
|
||||
high-performance computing, to run on AMD GPUs via HIP.
|
||||
|
||||
For a complete description of the HIP programming language, see the :doc:`HIP programming guide<hip:index>`.
|
||||
@@ -1,20 +0,0 @@
|
||||
.. meta::
|
||||
:description: How to fine-tune models with ROCm
|
||||
:keywords: ROCm, LLM, fine-tuning, inference, usage, tutorial, deep learning, PyTorch, TensorFlow, JAX
|
||||
|
||||
*************************
|
||||
Fine-tuning and inference
|
||||
*************************
|
||||
|
||||
Fine-tuning using ROCm involves leveraging AMD's GPU-accelerated :doc:`libraries <rocm:reference/api-libraries>` and
|
||||
:doc:`tools <rocm:reference/rocm-tools>` to optimize and train deep learning models. ROCm provides a comprehensive
|
||||
ecosystem for deep learning development, including open-source libraries for optimized deep learning operations and
|
||||
ROCm-aware versions of :doc:`deep learning frameworks <../../deep-learning-rocm>` such as PyTorch, TensorFlow, and JAX.
|
||||
|
||||
Single-accelerator systems, such as a machine equipped with a single accelerator or GPU, are commonly used for
|
||||
smaller-scale deep learning tasks, including fine-tuning pre-trained models and running inference on moderately
|
||||
sized datasets. See :doc:`single-gpu-fine-tuning-and-inference`.
|
||||
|
||||
Multi-accelerator systems, on the other hand, consist of multiple accelerators working in parallel. These systems are
|
||||
typically used in LLMs and other large-scale deep learning tasks where performance, scalability, and the handling of
|
||||
massive datasets are crucial. See :doc:`multi-gpu-fine-tuning-and-inference`.
|
||||
@@ -1,26 +0,0 @@
|
||||
.. meta::
|
||||
:description: How to fine-tune LLMs with ROCm
|
||||
:keywords: ROCm, LLM, fine-tuning, usage, tutorial, GPUs, Llama, accelerators
|
||||
|
||||
*******************************************
|
||||
Use ROCm for fine-tuning LLMs
|
||||
*******************************************
|
||||
|
||||
Fine-tuning is an essential technique in machine learning, where a pre-trained model, typically trained on a large-scale dataset, is further refined to achieve better performance and adapt to a particular task or dataset of interest.
|
||||
|
||||
With AMD GPUs, the fine-tuning process benefits from the parallel processing capabilities and efficient resource management, ultimately leading to improved performance and faster model adaptation to the target domain.
|
||||
|
||||
The ROCm™ software platform helps you optimize this fine-tuning process by supporting various optimization techniques tailored for AMD GPUs. It empowers the fine-tuning of large language models, making them accessible and efficient for specialized tasks. ROCm supports the broader AI ecosystem to ensure seamless integration with open frameworks, models, and tools.
|
||||
|
||||
Throughout the following topics, this guide discusses the goals and :ref:`challenges of fine-tuning a large language
|
||||
model <fine-tuning-llms-concept-challenge>` like Llama 2. In the
|
||||
sections that follow, you'll find practical guides on libraries and tools to accelerate your fine-tuning.
|
||||
|
||||
The AI Developer Hub contains `AMD ROCm tutorials <https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/>`_ for
|
||||
training, fine-tuning, and inference. It leverages popular machine learning frameworks on AMD GPUs.
|
||||
|
||||
- :doc:`Conceptual overview of fine-tuning LLMs <overview>`
|
||||
|
||||
- :doc:`Fine-tuning and inference <fine-tuning-and-inference>` using a
|
||||
:doc:`single-accelerator <single-gpu-fine-tuning-and-inference>` or
|
||||
:doc:`multi-accelerator <multi-gpu-fine-tuning-and-inference>` system.
|
||||
@@ -1,236 +0,0 @@
|
||||
.. meta::
|
||||
:description: Model fine-tuning and inference on a multi-GPU system
|
||||
:keywords: ROCm, LLM, fine-tuning, usage, tutorial, multi-GPU, distributed, inference, accelerators, PyTorch, HuggingFace, torchtune
|
||||
|
||||
*****************************************************
|
||||
Fine-tuning and inference using multiple accelerators
|
||||
*****************************************************
|
||||
|
||||
This section explains how to fine-tune a model on a multi-accelerator system. See
|
||||
:doc:`Single-accelerator fine-tuning <single-gpu-fine-tuning-and-inference>` for a single accelerator or GPU setup.
|
||||
|
||||
.. _fine-tuning-llms-multi-gpu-env:
|
||||
|
||||
Environment setup
|
||||
=================
|
||||
|
||||
This section was tested using the following hardware and software environment.
|
||||
|
||||
.. list-table::
|
||||
:stub-columns: 1
|
||||
|
||||
* - Hardware
|
||||
- 4 AMD Instinct MI300X accelerators
|
||||
|
||||
* - Software
|
||||
- ROCm 6.1, Ubuntu 22.04, PyTorch 2.1.2, Python 3.10
|
||||
|
||||
* - Libraries
|
||||
- ``transformers`` ``datasets`` ``accelerate`` ``huggingface-hub`` ``peft`` ``trl`` ``scipy``
|
||||
|
||||
* - Base model
|
||||
- ``meta-llama/Llama-2-7b-chat-hf``
|
||||
|
||||
.. _fine-tuning-llms-multi-gpu-env-setup:
|
||||
|
||||
Setting up the base implementation environment
|
||||
----------------------------------------------
|
||||
|
||||
#. Install PyTorch for ROCm. Refer to the
|
||||
:doc:`PyTorch installation guide <rocm-install-on-linux:install/3rd-party/pytorch-install>`. For consistent
|
||||
installation, it’s recommended to use official ROCm prebuilt Docker images with the framework pre-installed.
|
||||
|
||||
#. In the Docker container, check the availability of ROCm-capable accelerators using the following command.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
rocm-smi --showproductname
|
||||
|
||||
#. Check that your accelerators are available to PyTorch.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import torch
|
||||
print("Is a ROCm-GPU detected? ", torch.cuda.is_available())
|
||||
print("How many ROCm-GPUs are detected? ", torch.cuda.device_count())
|
||||
|
||||
If successful, your output should look like this:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
>>> print("Is a ROCm-GPU detected? ", torch.cuda.is_available())
|
||||
Is a ROCm-GPU detected? True
|
||||
>>> print("How many ROCm-GPUs are detected? ", torch.cuda.device_count())
|
||||
How many ROCm-GPUs are detected? 4
|
||||
|
||||
.. tip::
|
||||
|
||||
During training and inference, you can check the memory usage by running the ``rocm-smi`` command in your terminal.
|
||||
This tool shows which accelerators or GPUs are involved.
|
||||
|
||||
|
||||
.. _fine-tuning-llms-multi-gpu-hugging-face-accelerate:
|
||||
|
||||
Hugging Face Accelerate for fine-tuning and inference
|
||||
===========================================================
|
||||
|
||||
`Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_ is a library that simplifies turning raw
|
||||
PyTorch code for a single accelerator into code for multiple accelerators for LLM fine-tuning and inference. It is
|
||||
integrated with `Transformers <https://huggingface.co/docs/transformers/en/index>`_, allowing you to scale your PyTorch
|
||||
code while maintaining performance and flexibility.
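
At its core, Accelerate follows a simple pattern: construct an ``Accelerator``, pass your training objects through
``prepare()``, and replace ``loss.backward()`` with ``accelerator.backward(loss)``. The following minimal sketch
illustrates that pattern with a toy linear model and random data; these objects are placeholders for illustration only
and are not part of the walkthrough that follows.

.. code-block:: python

   import torch
   from torch.utils.data import DataLoader, TensorDataset
   from accelerate import Accelerator

   # Toy data and model standing in for your own fine-tuning objects.
   dataset = TensorDataset(torch.randn(64, 16), torch.randn(64, 1))
   dataloader = DataLoader(dataset, batch_size=8)
   model = torch.nn.Linear(16, 1)
   optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

   accelerator = Accelerator()

   # prepare() places the objects on the available accelerators and wraps them
   # for distributed execution when the script is started with `accelerate launch`.
   model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)

   for inputs, targets in dataloader:
       optimizer.zero_grad()
       loss = torch.nn.functional.mse_loss(model(inputs), targets)
       # accelerator.backward() replaces loss.backward() so gradients are synchronized.
       accelerator.backward(loss)
       optimizer.step()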
|
||||
|
||||
As a brief example of model fine-tuning and inference using multiple GPUs, let's use Transformers and load in the Llama
|
||||
2 7B model.
|
||||
|
||||
Here, let's reuse the code in :ref:`Single-accelerator fine-tuning <fine-tuning-llms-single-gpu-download-model-dataset>`
|
||||
to load the base model and tokenizer.
|
||||
|
||||
Now, it's important to adjust how you load the model. Add the ``device_map`` parameter to your base model configuration.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
...
|
||||
base_model_name = "meta-llama/Llama-2-7b-chat-hf"
|
||||
|
||||
# Load base model to GPU memory
|
||||
base_model = AutoModelForCausalLM.from_pretrained(
|
||||
base_model_name,
|
||||
device_map = "auto",
|
||||
trust_remote_code = True)
|
||||
...
|
||||
# Run training
|
||||
sft_trainer.train()
|
||||
|
||||
.. note::
|
||||
|
||||
You can let Accelerate handle the device map computation by setting ``device_map`` to one of the supported options
|
||||
(``"auto"``, ``"balanced"``, ``"balanced_low_0"``, ``"sequential"``).
|
||||
|
||||
It's recommended to set the ``device_map`` parameter to ``"auto"`` to allow Accelerate to automatically and
|
||||
efficiently allocate the model given the available resources (4 accelerators in this case).
|
||||
|
||||
When you have more GPU memory available than the model size, here is the difference between each ``device_map``
|
||||
option:
|
||||
|
||||
* ``"auto"`` and ``"balanced"`` evenly split the model on all available GPUs, making it possible for you to use a
|
||||
batch size greater than 1.
|
||||
|
||||
* ``"balanced_low_0"`` evenly splits the model on all GPUs except the first
|
||||
one, and only puts on GPU 0 what does not fit on the others. This
|
||||
option is great when you need to use GPU 0 for some processing of the
|
||||
outputs, like when using the generate function for Transformers
|
||||
models.
|
||||
|
||||
* ``"sequential"`` will fit what it can on GPU 0, then move on GPU 1 and so forth. Not all GPUs might be used.
|
||||
|
||||
After loading the model in this way, the model is fully ready to use the resources available to it.
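
To verify how Accelerate distributed the layers, you can inspect the ``hf_device_map`` attribute that Transformers
attaches to models loaded with a ``device_map``. The printed mapping below is illustrative only.

.. code-block:: python

   # Inspect how the model layers were assigned across the available accelerators.
   # Assumes base_model was loaded with device_map="auto" as shown above.
   print(base_model.hf_device_map)
   # Illustrative (truncated) output:
   # {'model.embed_tokens': 0, 'model.layers.0': 0, ..., 'lm_head': 3}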
|
||||
|
||||
.. _fine-tuning-llms-multi-gpu-torchtune:
|
||||
|
||||
torchtune for fine-tuning and inference
|
||||
=============================================
|
||||
|
||||
`torchtune <https://pytorch.org/torchtune/main/>`_ is a PyTorch-native library for easy single and multi-accelerator or
|
||||
GPU model fine-tuning and inference with LLMs.
|
||||
|
||||
#. Install torchtune using pip.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Install torchtune with PyTorch release 2.2.2+
|
||||
pip install torchtune
|
||||
|
||||
# To confirm that the package is installed correctly
|
||||
tune --help
|
||||
|
||||
The output should look like this:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
usage: tune [-h] {download,ls,cp,run,validate} ...
|
||||
|
||||
Welcome to the TorchTune CLI!
|
||||
|
||||
options:
|
||||
-h, --help show this help message and exit
|
||||
|
||||
subcommands:
|
||||
{download,ls,cp,run,validate}
|
||||
|
||||
#. torchtune recipes are designed around easily composable components and workable training loops, with minimal abstraction
|
||||
getting in the way of fine-tuning. Run ``tune ls`` to show built-in torchtune configuration recipes.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
RECIPE CONFIG
|
||||
full_finetune_single_device llama2/7B_full_low_memory
|
||||
llama3/8B_full_single_device
|
||||
mistral/7B_full_low_memory
|
||||
full_finetune_distributed llama2/7B_full
|
||||
llama2/13B_full
|
||||
llama3/8B_full
|
||||
mistral/7B_full
|
||||
gemma/2B_full
|
||||
lora_finetune_single_device llama2/7B_lora_single_device
|
||||
llama2/7B_qlora_single_device
|
||||
llama3/8B_lora_single_device
|
||||
llama3/8B_qlora_single_device
|
||||
llama2/13B_qlora_single_device
|
||||
mistral/7B_lora_single_device
|
||||
|
||||
The ``RECIPE`` column shows the easy-to-use and workable fine-tuning and inference recipes for popular fine-tuning
|
||||
techniques (such as LoRA). The ``CONFIG`` column lists the YAML configurations for easily configuring training,
|
||||
evaluation, quantization, or inference recipes.
|
||||
|
||||
The snippet shows the architecture of a model's YAML configuration file:
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
# Model arguments
|
||||
model:
|
||||
_component_: torchtune.models.llama2.lora_llama2_7b
|
||||
lora_attn_modules: ['q_proj', 'v_proj']
|
||||
apply_lora_to_mlp: False
|
||||
apply_lora_to_output: False
|
||||
lora_rank: 8
|
||||
lora_alpha: 16
|
||||
|
||||
tokenizer:
|
||||
_component_: torchtune.models.llama2.llama2_tokenizer
|
||||
path: /tmp/Llama-2-7b-hf/tokenizer.model
|
||||
|
||||
# Dataset and sampler
|
||||
dataset:
|
||||
_component_: torchtune.datasets.alpaca_cleaned_dataset
|
||||
train_on_input: True
|
||||
|
||||
#. This configuration file defines the fine-tuning base model path, data set, hyper-parameters for optimizer and scheduler,
|
||||
and training data type. To download the base model for fine-tuning, run the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
tune download meta-llama/Llama-2-7b-hf --output-dir /tmp/Llama-2-7b-hf --hf-token
|
||||
|
||||
The output directory argument for ``--output-dir`` should match the model path specified in the YAML config file.
|
||||
|
||||
#. To launch ``lora_finetune_distributed`` on four devices, run the following
|
||||
command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
tune run --nnodes 1 --nproc_per_node 4 lora_finetune_distributed --config llama2/7B_lora
|
||||
|
||||
If successful, you should see something like the following output:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
INFO:torchtune.utils.logging:FSDP is enabled. Instantiating Model on CPU for Rank 0 ...
|
||||
INFO:torchtune.utils.logging:Model instantiation took 7.32 secs
|
||||
INFO:torchtune.utils.logging:Memory Stats after model init:
|
||||
{'peak_memory_active': 9.478172672, 'peak_memory_alloc': 8.953868288, 'peak_memory_reserved': 11.112808448}
|
||||
INFO:torchtune.utils.logging:Optimizer and loss are initialized.
|
||||
INFO:torchtune.utils.logging:Dataset and Sampler are initialized.
|
||||
INFO:torchtune.utils.logging:Learning rate scheduler is initialized.
|
||||
1|111|Loss: 1.5790324211120605: 7%|█ | 114/1618
|
||||
|
||||
Read more about inference frameworks in :doc:`LLM inference frameworks <../inference/llm-inference-frameworks>`.
|
||||
@@ -1,104 +0,0 @@
|
||||
.. meta::
|
||||
:description: Conceptual overview of fine-tuning LLMs
|
||||
:keywords: ROCm, LLM, Llama, fine-tuning, usage, tutorial, optimization, LoRA, walkthrough, PEFT, Reinforcement
|
||||
|
||||
***************************************
|
||||
Conceptual overview of fine-tuning LLMs
|
||||
***************************************
|
||||
|
||||
Large language models (LLMs) are trained on massive amounts of text data to generate coherent and fluent text. The
|
||||
underlying *transformer* architecture is the fundamental building block of all LLMs. Transformers
|
||||
enable LLMs to understand and generate text by capturing contextual relationships and long-range dependencies. To better
|
||||
understand the philosophy of the transformer architecture, review the foundational
|
||||
`Attention is all you need <https://arxiv.org/pdf/1706.03762.pdf>`_ paper.
|
||||
|
||||
By further training pre-trained LLMs, the fine-tuned model can gain knowledge related to specific fields or tasks,
|
||||
thereby significantly improving its performance in that field or task. The core idea of fine-tuning is to use the
|
||||
parameters of the pre-trained model as the starting point for new tasks and shape it through a small amount of
|
||||
specific domain or task data, expanding the original model's capability to new tasks or datasets.
|
||||
|
||||
Fine-tuning can effectively improve the performance of existing pre-trained models in specific application scenarios.
|
||||
Continuous training and adjustment of the parameters of the base model in the target domain or task can better capture
|
||||
the semantic characteristics and patterns in specific scenarios, thereby significantly improving the key indicators of
|
||||
the model in that domain or task. For example, by fine-tuning the Llama 2 model, its performance in certain applications
|
||||
can be improved over the base model.
|
||||
|
||||
.. _fine-tuning-llms-concept-challenge:
|
||||
|
||||
The challenge of fine-tuning models
|
||||
===================================
|
||||
|
||||
However, the computational cost of fine-tuning is still high, especially for complex models and large datasets, which
|
||||
poses distinct challenges related to substantial computational and memory requirements. This might be a barrier for
|
||||
accelerators or GPUs with low computing power or limited device memory resources.
|
||||
|
||||
For example, suppose we have a language model with 7 billion (7B) parameters, represented by a weight matrix :math:`W`.
|
||||
During backpropagation, the model needs to learn a :math:`ΔW` matrix, which updates the original weights to minimize the
|
||||
value of the loss function.
|
||||
|
||||
The weight update is as follows: :math:`W_{updated} = W + ΔW`.
|
||||
|
||||
If the weight matrix :math:`W` contains 7B parameters, then the weight update matrix :math:`ΔW` should also
|
||||
contain 7B parameters. Therefore, the :math:`ΔW` calculation is computationally and memory intensive.
|
||||
|
||||
.. figure:: ../../../data/how-to/llm-fine-tuning-optimization/weight-update.png
|
||||
:alt: Weight update diagram
|
||||
|
||||
(a) Weight update in regular fine-tuning. (b) Weight update in LoRA where the product of matrix A (:math:`M\times K`)
|
||||
and matrix B (:math:`K\times N`) is :math:`ΔW(M\times N)`; dimension K is a hyperparameter. By representing
|
||||
:math:`ΔW` as the product of two smaller matrices (A and B) with a lower rank K, the number of trainable parameters
|
||||
is significantly reduced.
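
To make the savings concrete, the following short calculation compares the trainable parameter counts of a full
:math:`ΔW` update and a rank-8 LoRA update for a single :math:`4096\times 4096` layer. The dimensions and rank are
illustrative values only, not tied to any particular model.

.. code-block:: python

   # Compare trainable parameters for a full weight update versus a LoRA update
   # on one 4096 x 4096 linear layer with rank K = 8 (illustrative values).
   M = N = 4096
   K = 8

   full_update = M * N              # every entry of delta-W is trained
   lora_update = (M * K) + (K * N)  # only A (M x K) and B (K x N) are trained

   print(f"Full update: {full_update:,} parameters per layer")   # 16,777,216
   print(f"LoRA rank 8: {lora_update:,} parameters per layer")   # 65,536
   print(f"Reduction:   {full_update // lora_update}x")          # 256x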
|
||||
|
||||
.. _fine-tuning-llms-concept-optimizations:
|
||||
|
||||
Optimizations for model fine-tuning
|
||||
===================================
|
||||
|
||||
Low-Rank Adaptation (LoRA) is a technique allowing fast and cost-effective fine-tuning of state-of-the-art LLMs that can
|
||||
overcome this issue of high memory consumption.
|
||||
|
||||
LoRA accelerates the adjustment process and reduces related memory costs. To be precise, LoRA decomposes the portion of
|
||||
weight changes :math:`ΔW` into high-precision low-rank representations, which do not require the calculations of all
|
||||
:math:`ΔW`. It learns the decomposition representation of :math:`ΔW` during training, as shown in
|
||||
the :ref:`weight update diagram <fine-tuning-llms-concept-challenge>`. This is how LoRA saves on
|
||||
computing resources.
|
||||
|
||||
LoRA is integrated into the `Hugging Face Parameter-Efficient Fine-Tuning (PEFT)
|
||||
<https://huggingface.co/docs/peft/en/index>`_ library, as well as other computation and memory efficiency optimization
|
||||
variants for model fine-tuning such as `AdaLoRA <https://huggingface.co/docs/peft/en/package_reference/adalora>`_. This
|
||||
library efficiently adapts large pre-trained models to various downstream applications without fine-tuning all model
|
||||
parameters. PEFT methods only fine-tune a few model parameters, significantly decreasing computational and storage
|
||||
costs while yielding performance comparable to a fully fine-tuned model. PEFT is integrated with the `Hugging Face
|
||||
Transformers <https://huggingface.co/docs/transformers/en/index>`_ library, providing a faster and easier way to load,
|
||||
train, and use large models for inference.
|
||||
|
||||
To simplify running a fine-tuning implementation, the `Transformer Reinforcement Learning (TRL)
|
||||
<https://huggingface.co/docs/trl/en/index>`_ library provides a set of tools to train transformer language models with
|
||||
reinforcement learning, from the Supervised Fine-Tuning step (SFT), Reward Modeling step (RM), to the Proximal Policy
|
||||
Optimization (PPO) step. The ``SFTTrainer`` API in TRL encapsulates these PEFT optimizations so you can easily import
|
||||
your custom training configuration and run the training process.
|
||||
|
||||
.. _fine-tuning-llms-walkthrough-desc:
|
||||
|
||||
Walkthrough
|
||||
===========
|
||||
|
||||
To demonstrate the benefits of LoRA and the ideal compute compatibility of using PEFT and TRL libraries on AMD
|
||||
ROCm-compatible accelerators and GPUs, let's step through a comprehensive implementation of the fine-tuning process
|
||||
using the Llama 2 7B model with LoRA tailored specifically for question-and-answer tasks on AMD MI300X accelerators.
|
||||
|
||||
Before starting, review and understand the key components of this walkthrough:
|
||||
|
||||
- `Llama 2 <https://huggingface.co/meta-llama>`_: a family of large language models developed and publicly released by
|
||||
Meta. Its variants range in scale from 7 billion to 70 billion parameters.
|
||||
|
||||
- Fine-tuning: a critical process that refines LLMs for specialized tasks and optimizes performance.
|
||||
|
||||
- LoRA: a memory-efficient implementation of LLM fine-tuning that significantly reduces the number of trainable
|
||||
parameters.
|
||||
|
||||
- `SFTTrainer <https://huggingface.co/docs/trl/v0.8.6/en/sft_trainer#supervised-fine-tuning-trainer>`_: an optimized
|
||||
trainer with a simple interface to easily fine-tune pre-trained models with PEFT adapters, for example, LoRA, for
|
||||
memory efficiency purposes on a custom dataset.
|
||||
|
||||
Continue the walkthrough in :doc:`Fine-tuning and inference <fine-tuning-and-inference>`.
|
||||
@@ -1,510 +0,0 @@
|
||||
.. meta::
|
||||
:description: Model fine-tuning and inference on a single-GPU system
|
||||
:keywords: ROCm, LLM, fine-tuning, usage, tutorial, single-GPU, LoRA, PEFT, inference, SFTTrainer
|
||||
|
||||
****************************************************
|
||||
Fine-tuning and inference using a single accelerator
|
||||
****************************************************
|
||||
|
||||
This section explains model fine-tuning and inference techniques on a single-accelerator system. See
|
||||
:doc:`Multi-accelerator fine-tuning <multi-gpu-fine-tuning-and-inference>` for a setup with multiple accelerators or
|
||||
GPUs.
|
||||
|
||||
.. _fine-tuning-llms-single-gpu-env:
|
||||
|
||||
Environment setup
|
||||
=================
|
||||
|
||||
This section was tested using the following hardware and software environment.
|
||||
|
||||
.. list-table::
|
||||
:stub-columns: 1
|
||||
|
||||
* - Hardware
|
||||
- AMD Instinct MI300X accelerator
|
||||
|
||||
* - Software
|
||||
- ROCm 6.1, Ubuntu 22.04, PyTorch 2.1.2, Python 3.10
|
||||
|
||||
* - Libraries
|
||||
- ``transformers`` ``datasets`` ``huggingface-hub`` ``peft`` ``trl`` ``scipy``
|
||||
|
||||
* - Base model
|
||||
- ``meta-llama/Llama-2-7b-chat-hf``
|
||||
|
||||
.. _fine-tuning-llms-single-gpu-env-setup:
|
||||
|
||||
Setting up the base implementation environment
|
||||
----------------------------------------------
|
||||
|
||||
#. Install PyTorch for ROCm. Refer to the
|
||||
:doc:`PyTorch installation guide <rocm-install-on-linux:install/3rd-party/pytorch-install>`. For a consistent
|
||||
installation, it’s recommended to use official ROCm prebuilt Docker images with the framework pre-installed.
|
||||
|
||||
#. In the Docker container, check the availability of ROCm-capable accelerators using the following command.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
rocm-smi --showproductname
|
||||
|
||||
Your output should look like this:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
============================ ROCm System Management Interface ============================
|
||||
====================================== Product Info ======================================
|
||||
GPU[0] : Card series: AMD Instinct MI300X OAM
|
||||
GPU[0] : Card model: 0x74a1
|
||||
GPU[0] : Card vendor: Advanced Micro Devices, Inc. [AMD/ATI]
|
||||
GPU[0] : Card SKU: MI3SRIOV
|
||||
==========================================================================================
|
||||
================================== End of ROCm SMI Log ===================================
|
||||
|
||||
#. Check that your accelerators are available to PyTorch.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import torch
|
||||
print("Is a ROCm-GPU detected? ", torch.cuda.is_available())
|
||||
print("How many ROCm-GPUs are detected? ", torch.cuda.device_count())
|
||||
|
||||
If successful, your output should look like this:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
>>> print("Is a ROCm-GPU detected? ", torch.cuda.is_available())
|
||||
Is a ROCm-GPU detected? True
|
||||
>>> print("How many ROCm-GPUs are detected? ", torch.cuda.device_count())
|
||||
How many ROCm-GPUs are detected? 4
|
||||
|
||||
#. Install the required dependencies.
|
||||
|
||||
bitsandbytes is a library that facilitates quantization to improve the efficiency of deep learning models. Learn more
|
||||
about its use in :doc:`../inference-optimization/model-quantization`.
|
||||
|
||||
See the :ref:`Optimizations for model fine-tuning <fine-tuning-llms-concept-optimizations>` for a brief discussion on
|
||||
PEFT and TRL.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Install `bitsandbytes` for ROCm 6.0+.
|
||||
# Use -DBNB_ROCM_ARCH to target a specific GPU architecture.
|
||||
git clone --recurse https://github.com/ROCm/bitsandbytes.git
|
||||
cd bitsandbytes
|
||||
git checkout rocm_enabled_multi_backend
|
||||
pip install -r requirements-dev.txt
|
||||
cmake -DBNB_ROCM_ARCH="gfx942" -DCOMPUTE_BACKEND=hip -S .
|
||||
python setup.py install
|
||||
|
||||
# To leverage the SFTTrainer in TRL for model fine-tuning.
|
||||
pip install trl
|
||||
|
||||
# To leverage PEFT for efficiently adapting pre-trained language models.
|
||||
pip install peft
|
||||
|
||||
# Install the other dependencies.
|
||||
pip install transformers datasets huggingface-hub scipy
|
||||
|
||||
#. Check that the required packages can be imported.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import torch
|
||||
from datasets import load_dataset
|
||||
from transformers import (
|
||||
AutoModelForCausalLM,
|
||||
AutoTokenizer,
|
||||
TrainingArguments
|
||||
)
|
||||
from peft import LoraConfig
|
||||
from trl import SFTTrainer
|
||||
|
||||
.. _fine-tuning-llms-single-gpu-download-model-dataset:
|
||||
|
||||
Download the base model and fine-tuning dataset
|
||||
-----------------------------------------------
|
||||
|
||||
#. Request access to download `Meta's official Llama model <https://huggingface.co/meta-llama>`_ from Hugging
|
||||
Face. After permission is granted, log in with the following command using your personal access tokens:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
huggingface-cli login
|
||||
|
||||
.. note::
|
||||
|
||||
You can also use the `NousResearch Llama-2-7b-chat-hf <https://huggingface.co/NousResearch/Llama-2-7b-chat-hf>`_
|
||||
as a substitute. It has the same model weights as the original.
|
||||
|
||||
#. Run the following code to load the base model and tokenizer.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
# Base model and tokenizer names.
|
||||
base_model_name = "meta-llama/Llama-2-7b-chat-hf"
|
||||
|
||||
# Load base model to GPU memory.
|
||||
device = "cuda:0"
|
||||
base_model = AutoModelForCausalLM.from_pretrained(base_model_name, trust_remote_code = True).to(device)
|
||||
|
||||
# Load tokenizer.
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
base_model_name,
|
||||
trust_remote_code = True)
|
||||
tokenizer.pad_token = tokenizer.eos_token
|
||||
tokenizer.padding_side = "right"
|
||||
|
||||
#. Now, let's fine-tune the base model for a question-and-answer task using a small dataset called
|
||||
`mlabonne/guanaco-llama2-1k <https://huggingface.co/datasets/mlabonne/guanaco-llama2-1k>`_, which is a 1000 sample
|
||||
subset of the `timdettmers/openassistant-guanaco <https://huggingface.co/datasets/OpenAssistant/oasst1>`_ dataset.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
# Dataset for fine-tuning.
|
||||
training_dataset_name = "mlabonne/guanaco-llama2-1k"
|
||||
training_dataset = load_dataset(training_dataset_name, split = "train")
|
||||
|
||||
# Check the data.
|
||||
print(training_dataset)
|
||||
|
||||
# Dataset 11 is a QA sample in English.
|
||||
print(training_dataset[11])
|
||||
|
||||
#. With the base model and the dataset, let's start fine-tuning!
|
||||
|
||||
.. _fine-tuning-llms-single-gpu-configure-params:
|
||||
|
||||
Configure fine-tuning parameters
|
||||
--------------------------------
|
||||
|
||||
To set up ``SFTTrainer`` parameters, you can use the following code as reference.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
# Training parameters for SFTTrainer.
|
||||
training_arguments = TrainingArguments(
|
||||
output_dir = "./results",
|
||||
num_train_epochs = 1,
|
||||
per_device_train_batch_size = 4,
|
||||
gradient_accumulation_steps = 1,
|
||||
optim = "paged_adamw_32bit",
|
||||
save_steps = 50,
|
||||
logging_steps = 50,
|
||||
learning_rate = 4e-5,
|
||||
weight_decay = 0.001,
|
||||
fp16=False,
|
||||
bf16=False,
|
||||
max_grad_norm = 0.3,
|
||||
max_steps = -1,
|
||||
warmup_ratio = 0.03,
|
||||
group_by_length = True,
|
||||
lr_scheduler_type = "constant",
|
||||
report_to = "tensorboard"
|
||||
)
|
||||
|
||||
.. _fine-tuning-llms-single-gpu-start:
|
||||
|
||||
Fine-tuning
|
||||
===========
|
||||
|
||||
In this section, you'll see two ways of training: with the LoRA technique and without. See :ref:`Optimizations for model
|
||||
fine-tuning <fine-tuning-llms-concept-optimizations>` for an introduction to LoRA. Training with LoRA uses the
|
||||
``SFTTrainer`` API with its PEFT integration. Training without LoRA forgoes these benefits.
|
||||
|
||||
Compare the number of trainable parameters and training time under the two different methodologies.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Fine-tuning with LoRA and PEFT
|
||||
:sync: with
|
||||
|
||||
1. Configure LoRA using the following code snippet.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
peft_config = LoraConfig(
|
||||
lora_alpha = 16,
|
||||
lora_dropout = 0.1,
|
||||
r = 64,
|
||||
bias = "none",
|
||||
task_type = "CAUSAL_LM"
|
||||
)
|
||||
# View the number of trainable parameters.
|
||||
from peft import get_peft_model
|
||||
peft_model = get_peft_model(base_model, peft_config)
|
||||
peft_model.print_trainable_parameters()
|
||||
|
||||
The output should look like this. Compare the number of trainable parameters to that when fine-tuning without
|
||||
LoRA and PEFT.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
trainable params: 33,554,432 || all params: 6,771,970,048 || trainable%: 0.49548996469513035
|
||||
|
||||
2. Initialize ``SFTTrainer`` with a PEFT LoRA configuration and run the trainer.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
# Initialize an SFT trainer.
|
||||
sft_trainer = SFTTrainer(
|
||||
model = base_model,
|
||||
train_dataset = training_dataset,
|
||||
peft_config = peft_config,
|
||||
dataset_text_field = "text",
|
||||
tokenizer = tokenizer,
|
||||
args = training_arguments
|
||||
)
|
||||
|
||||
# Run the trainer.
|
||||
sft_trainer.train()
|
||||
|
||||
The output should look like this:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
{'loss': 1.5973, 'grad_norm': 0.25271978974342346, 'learning_rate': 4e-05, 'epoch': 0.16}
|
||||
{'loss': 2.0519, 'grad_norm': 0.21817368268966675, 'learning_rate': 4e-05, 'epoch': 0.32}
|
||||
{'loss': 1.6147, 'grad_norm': 0.3046981394290924, 'learning_rate': 4e-05, 'epoch': 0.48}
|
||||
{'loss': 1.4124, 'grad_norm': 0.11534837633371353, 'learning_rate': 4e-05, 'epoch': 0.64}
|
||||
{'loss': 1.5627, 'grad_norm': 0.09108350425958633, 'learning_rate': 4e-05, 'epoch': 0.8}
|
||||
{'loss': 1.417, 'grad_norm': 0.2536439299583435, 'learning_rate': 4e-05, 'epoch': 0.96}
|
||||
{'train_runtime': 197.4947, 'train_samples_per_second': 5.063, 'train_steps_per_second': 0.633, 'train_loss': 1.6194254455566406, 'epoch': 1.0}
|
||||
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 125/125 [03:17<00:00, 1.58s/it]
|
||||
|
||||
.. tab-item:: Fine-tuning without LoRA and PEFT
|
||||
:sync: without
|
||||
|
||||
1. Use the following code to get started.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
def print_trainable_parameters(model):
|
||||
# Prints the number of trainable parameters in the model.
|
||||
trainable_params = 0
|
||||
all_param = 0
|
||||
for _, param in model.named_parameters():
|
||||
all_param += param.numel()
|
||||
if param.requires_grad:
|
||||
trainable_params += param.numel()
|
||||
print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}")
|
||||
|
||||
sft_trainer.peft_config = None
|
||||
print_trainable_parameters(sft_trainer.model)
|
||||
|
||||
The output should look like this. Compare the number of trainable parameters to that when fine-tuning with LoRA
|
||||
and PEFT.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
trainable params: 6,738,415,616 || all params: 6,738,415,616 || trainable%: 100.00
|
||||
|
||||
|
||||
2. Run the trainer.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
# Trainer without LoRA config.
|
||||
trainer_full = SFTTrainer(
|
||||
model = base_model,
|
||||
train_dataset = training_dataset,
|
||||
dataset_text_field = "text",
|
||||
tokenizer = tokenizer,
|
||||
args = training_arguments
|
||||
)
|
||||
|
||||
# Training.
|
||||
trainer_full.train()
|
||||
|
||||
The output should look like this:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
{'loss': 1.5975, 'grad_norm': 0.25113457441329956, 'learning_rate': 4e-05, 'epoch': 0.16}
|
||||
{'loss': 2.0524, 'grad_norm': 0.2180655151605606, 'learning_rate': 4e-05, 'epoch': 0.32}
|
||||
{'loss': 1.6145, 'grad_norm': 0.2949850261211395, 'learning_rate': 4e-05, 'epoch': 0.48}
|
||||
{'loss': 1.4118, 'grad_norm': 0.11036080121994019, 'learning_rate': 4e-05, 'epoch': 0.64}
|
||||
{'loss': 1.5595, 'grad_norm': 0.08962831646203995, 'learning_rate': 4e-05, 'epoch': 0.8}
|
||||
{'loss': 1.4119, 'grad_norm': 0.25422757863998413, 'learning_rate': 4e-05, 'epoch': 0.96}
|
||||
{'train_runtime': 419.5154, 'train_samples_per_second': 2.384, 'train_steps_per_second': 0.298, 'train_loss': 1.6171623611450194, 'epoch': 1.0}
|
||||
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 125/125 [06:59<00:00, 3.36s/it]
|
||||
|
||||
.. _fine-tuning-llms-single-gpu-saving:
|
||||
|
||||
Saving adapters or fully fine-tuned models
|
||||
------------------------------------------
|
||||
|
||||
PEFT methods freeze the pre-trained model parameters during fine-tuning and add a smaller number of trainable
|
||||
parameters, namely the adapters, on top of it. The adapters are trained to learn specific task information. The adapters
|
||||
trained with PEFT are usually an order of magnitude smaller than the full base model, making them convenient to share,
|
||||
store, and load.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Saving a PEFT adapter
|
||||
:sync: with
|
||||
|
||||
If you're using LoRA and PEFT, use the following code to save a PEFT adapter to your system once the fine-tuning
|
||||
is completed.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
# PEFT adapter name.
|
||||
adapter_name = "llama-2-7b-enhanced-adapter"
|
||||
|
||||
# Save PEFT adapter.
|
||||
sft_trainer.model.save_pretrained(adapter_name)
|
||||
|
||||
The saved PEFT adapter should look like this on your system:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Access adapter directory.
|
||||
cd llama-2-7b-enhanced-adapter
|
||||
|
||||
# List all adapter files.
|
||||
README.md adapter_config.json adapter_model.safetensors
|
||||
|
||||
.. tab-item:: Saving a fully fine-tuned model
|
||||
:sync: without
|
||||
|
||||
If you're not using LoRA and PEFT, meaning no PEFT LoRA configuration was used for training, use the following code
|
||||
to save your fine-tuned model to your system.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
# Fully fine-tuned model name.
|
||||
new_model_name = "llama-2-7b-enhanced"
|
||||
|
||||
# Save the fully fine-tuned model.
|
||||
trainer_full.model.save_pretrained(new_model_name)
|
||||
|
||||
The saved new full model should look like this on your system:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Access new model directory.
|
||||
cd llama-2-7b-enhanced
|
||||
|
||||
# List all model files.
|
||||
config.json model-00002-of-00006.safetensors model-00005-of-00006.safetensors
|
||||
generation_config.json model-00003-of-00006.safetensors model-00006-of-00006.safetensors
|
||||
model-00001-of-00006.safetensors model-00004-of-00006.safetensors model.safetensors.index.json
|
||||
|
||||
.. note::
|
||||
|
||||
PEFT adapters can’t be loaded by ``AutoModelForCausalLM`` from the Transformers library as they do not contain
|
||||
full model parameters and model configurations, for example, ``config.json``. To use it as a normal transformer
|
||||
model, you need to merge them into the base model.
|
||||
|
||||
Basic model inference
|
||||
=====================
|
||||
|
||||
A trained model can be classified into one of three types:
|
||||
|
||||
* A PEFT adapter
|
||||
|
||||
* A pre-trained language model in Hugging Face
|
||||
|
||||
* A fully fine-tuned model not using PEFT
|
||||
|
||||
Let's look at achieving model inference using these types of models.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Inference using PEFT adapters
|
||||
|
||||
To use PEFT adapters like a normal transformer model, you can run the generation by loading a base model along with PEFT
|
||||
adapters as follows.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from peft import PeftModel
|
||||
from transformers import AutoModelForCausalLM
|
||||
|
||||
# Set the path of the model or the name on Hugging face hub
|
||||
base_model_name = "meta-llama/Llama-2-7b-chat-hf"
|
||||
|
||||
# Set the path of the adapter
|
||||
adapter_name = "Llama-2-7b-enhanced-adpater"
|
||||
|
||||
# Load base model
|
||||
base_model = AutoModelForCausalLM.from_pretrained(base_model_name)
|
||||
|
||||
# Adapt the base model with the adapter
|
||||
new_model = PeftModel.from_pretrained(base_model, adapter_name)
|
||||
|
||||
# Then, run generation the same way as with a normal model.
|
||||
|
||||
The PEFT library provides a ``merge_and_unload`` method, which merges the adapter layers into the base model. This is
|
||||
needed if someone wants to save the adapted model into local storage and use it as a normal standalone model.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
# Load base model
|
||||
base_model = AutoModelForCausalLM.from_pretrained(base_model_name)
|
||||
|
||||
# Adapt the base model with the adapter
|
||||
new_model = PeftModel.from_pretrained(base_model, adapter_name)
|
||||
|
||||
# Merge adapter
|
||||
model = new_model.merge_and_unload()
|
||||
|
||||
# Save the merged model to local storage
|
||||
model.save_pretrained("merged_adpaters")
|
||||
|
||||
.. tab-item:: Inference using pre-trained or fully fine-tuned models
|
||||
|
||||
If you have a fully fine-tuned model not using PEFT, you can load it like any other pre-trained language model in
|
||||
`Hugging Face Hub <https://huggingface.co/docs/hub/en/index>`_ using the `Transformers
|
||||
<https://huggingface.co/docs/transformers/en/index>`_ library.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
# Import relevant class for loading model and tokenizer
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM
|
||||
|
||||
# Set the pre-trained model name on Hugging face hub
|
||||
model_name = "meta-llama/Llama-2-7b-chat-hf"
|
||||
|
||||
# Set device type
|
||||
device = "cuda:0"
|
||||
|
||||
# Load model and tokenizer
|
||||
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
|
||||
# Input prompt encoding
|
||||
query = "What is a large language model?"
|
||||
inputs = tokenizer.encode(query, return_tensors="pt").to(device)
|
||||
|
||||
# Token generation
|
||||
outputs = model.generate(inputs)
|
||||
|
||||
# Outputs decoding
|
||||
print(tokenizer.decode(outputs[0]))
|
||||
|
||||
In addition, pipelines from Transformers offer simple APIs to use pre-trained models for different tasks, including
|
||||
sentiment analysis, feature extraction, question answering and so on. You can use the pipeline abstraction to achieve
|
||||
model inference easily.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
# Import relevant class for loading model and tokenizer
|
||||
from transformers import pipeline
|
||||
|
||||
# Set the path of your model or the name on Hugging face hub
|
||||
model_name_or_path = "meta-llama/Llama-2-7b-chat-hf"
|
||||
|
||||
# Set pipeline
|
||||
# A positive device value will run the model on associated CUDA device id
|
||||
pipe = pipeline("text-generation", model=model_name_or_path, device=0)
|
||||
|
||||
# Token generation
|
||||
print(pipe("What is a large language model?")[0]["generated_text"])
|
||||
|
||||
If using multiple accelerators, see
|
||||
:ref:`Multi-accelerator fine-tuning and inference <fine-tuning-llms-multi-gpu-hugging-face-accelerate>` to explore
|
||||
popular libraries that simplify fine-tuning and inference in a multi-accelerator system.
|
||||
|
||||
Read more about inference frameworks like vLLM and Hugging Face TGI in
|
||||
:doc:`LLM inference frameworks <../inference/llm-inference-frameworks>`.
|
||||
@@ -1,30 +0,0 @@
|
||||
.. meta::
|
||||
:description: Learn how to use ROCm for AI.
|
||||
:keywords: ROCm, AI, machine learning, LLM, usage, tutorial
|
||||
|
||||
**************************
|
||||
Use ROCm for AI
|
||||
**************************
|
||||
|
||||
ROCm is an open-source software platform that enables high-performance computing and machine learning applications. It features the ability to accelerate training, fine-tuning, and inference for AI application development. With ROCm, you can access the full power of AMD GPUs, which can significantly improve the performance and efficiency of AI workloads.
|
||||
|
||||
You can use ROCm to perform distributed training, which enables you to train models across multiple GPUs or nodes simultaneously. Additionally, ROCm supports mixed-precision training, which can help reduce the memory and compute requirements of training workloads. For fine-tuning, ROCm provides access to various algorithms and optimization techniques. In terms of inference, ROCm provides several techniques that can help you optimize your models for deployment, such as quantization, GEMM tuning, and optimization with composable kernel.
|
||||
|
||||
Overall, ROCm can be used to improve the performance and efficiency of your AI applications. With its training, fine-tuning, and inference support, ROCm provides a complete solution for optimizing AI workflows and achieving the optimum results possible on AMD GPUs.
|
||||
|
||||
The AI Developer Hub contains `AMD ROCm tutorials <https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/>`_ for
|
||||
training, fine-tuning, and inference. It leverages popular machine learning frameworks on AMD GPUs.
|
||||
|
||||
In this guide, you'll learn how to use ROCm for AI:
|
||||
|
||||
- :doc:`Training <training/index>`
|
||||
|
||||
- :doc:`Fine-tuning LLMs <fine-tuning/index>`
|
||||
|
||||
- :doc:`Inference <inference/index>`
|
||||
|
||||
- :doc:`Inference optimization <inference-optimization/index>`
|
||||
|
||||
|
||||
To learn about ROCm for HPC applications and scientific computing, see
|
||||
:doc:`../rocm-for-hpc/index`.
|
||||
@@ -1,36 +0,0 @@
|
||||
.. meta::
|
||||
:description: How to Use ROCm for AI inference optimization
|
||||
:keywords: ROCm, LLM, AI inference, Optimization, GPUs, usage, tutorial
|
||||
|
||||
*******************************************
|
||||
Use ROCm for AI inference optimization
|
||||
*******************************************
|
||||
|
||||
AI inference optimization is the process of improving the performance of machine learning models and speeding up the inference process. It includes:
|
||||
|
||||
- **Quantization**: This involves reducing the precision of model weights and activations while maintaining acceptable accuracy levels. Reduced precision improves inference efficiency because lower precision data requires less storage and better utilizes the hardware's computation power.
|
||||
|
||||
- **Kernel optimization**: This technique involves optimizing computation kernels to exploit the underlying hardware capabilities. For example, the kernels can be optimized to use multiple GPU cores or utilize specialized hardware like tensor cores to accelerate the computations.
|
||||
|
||||
- **Libraries**: Libraries such as Flash Attention, xFormers, and PyTorch TunableOp are used to accelerate deep learning models and improve the performance of inference workloads.
|
||||
|
||||
- **Hardware acceleration**: Hardware acceleration techniques, like GPUs for AI inference, can significantly improve performance due to their parallel processing capabilities.
|
||||
|
||||
- **Pruning**: This involves removing unnecessary connections, layers, or weights from a pre-trained model while maintaining acceptable accuracy levels, resulting in a smaller model that requires fewer computational resources to run inference.
|
||||
|
||||
Utilizing these optimization techniques with the ROCm™ software platform can significantly reduce inference time, improve performance, and reduce the cost of your AI applications.
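
As a brief example of the first technique, the following sketch loads a model with 4-bit weight quantization through
the Transformers integration with bitsandbytes. The model name is only an example; see
:doc:`Model quantization <model-quantization>` for the full discussion.

.. code-block:: python

   import torch
   from transformers import AutoModelForCausalLM, BitsAndBytesConfig

   # Store the weights in 4-bit precision and compute in half precision.
   quant_config = BitsAndBytesConfig(
       load_in_4bit=True,
       bnb_4bit_compute_dtype=torch.float16,
   )

   model = AutoModelForCausalLM.from_pretrained(
       "meta-llama/Llama-2-7b-chat-hf",   # example model only
       quantization_config=quant_config,
       device_map="auto",
   )

   # Memory use is roughly a quarter of the FP16 footprint.
   print(model.get_memory_footprint())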
|
||||
|
||||
Throughout the following topics, this guide discusses optimization techniques for inference workloads.
|
||||
|
||||
- :doc:`Model quantization <model-quantization>`
|
||||
|
||||
- :doc:`Model acceleration libraries <model-acceleration-libraries>`
|
||||
|
||||
- :doc:`Optimizing with Composable Kernel <optimizing-with-composable-kernel>`
|
||||
|
||||
- :doc:`Optimizing Triton kernels <optimizing-triton-kernel>`
|
||||
|
||||
- :doc:`Profiling and debugging <profiling-and-debugging>`
|
||||
|
||||
- :doc:`Workload tuning <workload>`
|
||||
|
||||
@@ -1,537 +0,0 @@
|
||||
.. meta::
|
||||
:description: How to use model acceleration techniques and libraries to improve memory efficiency and performance.
|
||||
:keywords: ROCm, LLM, fine-tuning, usage, tutorial, Flash Attention, Hugging Face, xFormers, vLLM, PyTorch
|
||||
|
||||
****************************
|
||||
Model acceleration libraries
|
||||
****************************
|
||||
|
||||
This section discusses model acceleration techniques and libraries to improve memory efficiency and performance.
|
||||
|
||||
.. _acceleration-flash-attention:
|
||||
|
||||
Flash Attention 2
|
||||
=================
|
||||
|
||||
Flash Attention is a technique designed to reduce memory movements between GPU SRAM and high-bandwidth memory (HBM). By
|
||||
using a tiling approach, Flash Attention 2 improves memory locality in the nested loops of query, key, and value
|
||||
computations within the Attention modules of LLMs. These modules include Multi-Head Attention (MHA), Group-Query
|
||||
Attention (GQA), and Multi-Query Attention (MQA). This reduction in memory movements significantly decreases the
|
||||
time-to-first-token (TTFT) latency for large batch sizes and long prompt sequences, thereby enhancing overall
|
||||
performance.
|
||||
|
||||
.. image:: ../../../data/how-to/llm-fine-tuning-optimization/attention-module.png
|
||||
:alt: Attention module of a large language module utilizing tiling
|
||||
:align: center
|
||||
|
||||
Installing Flash Attention 2
|
||||
----------------------------
|
||||
|
||||
ROCm provides two different implementations of Flash Attention 2 modules. They can be deployed interchangeably:
|
||||
|
||||
* ROCm `Composable Kernel <https://github.com/ROCm/composable_kernel/tree/develop/example/01_gemm>`_
|
||||
(CK) Flash Attention 2
|
||||
|
||||
* `OpenAI Triton <https://triton-lang.org/main/index.html>`_ Flash Attention 2
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: CK Flash Attention 2
|
||||
|
||||
To install CK Flash Attention 2, use the following commands.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Install from source
|
||||
git clone https://github.com/ROCm/flash-attention.git
|
||||
cd flash-attention/
|
||||
GPU_ARCHS=gfx942 python setup.py install #MI300 series
|
||||
|
||||
Hugging Face Transformers can easily deploy the CK Flash Attention 2 module by passing an argument
|
||||
``attn_implementation="flash_attention_2"`` to the ``from_pretrained`` method.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
||||
model_name = "NousResearch/Meta-Llama-3-8B"
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name, torch_dtype=torch.float16, use_fast=False)
|
||||
inputs = tokenizer('Today is', return_tensors='pt').to(device)
|
||||
|
||||
model_eager = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, attn_implementation="eager").cuda(device)
|
||||
model_ckFAv2 = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, attn_implementation="flash_attention_2").cuda(device)
|
||||
|
||||
print("eager GQA: ", tokenizer.decode(model_eager.generate(**inputs, max_new_tokens=10)[0], skip_special_tokens=True))
|
||||
print("ckFAv2 GQA: ", tokenizer.decode(model_ckFAv2.generate(**inputs, max_new_tokens=10)[0], skip_special_tokens=True))
|
||||
|
||||
# eager GQA: Today is the day of the Lord, and we are the
|
||||
# ckFAv2 GQA: Today is the day of the Lord, and we are the
|
||||
|
||||
.. tab-item:: Triton Flash Attention 2
|
||||
|
||||
The Triton Flash Attention 2 module is implemented in Python and uses OpenAI’s JIT compiler. This module has been
|
||||
upstreamed into the vLLM serving toolkit, discussed in :doc:`llm-inference-frameworks`.
|
||||
|
||||
1. To install Triton Flash Attention 2 and run the benchmark, use the following commands.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Install from the source
|
||||
pip uninstall pytorch-triton-rocm triton -y
|
||||
git clone https://github.com/ROCm/triton.git
|
||||
cd triton/python
|
||||
GPU_ARCHS=gfx942 python setup.py install #MI300 series
|
||||
pip install matplotlib pandas
|
||||
|
||||
2. To test, run the Triton Flash Attention 2 performance benchmark.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Test the triton FA v2 kernel
|
||||
python perf-kernels/flash-attention.py  # from the cloned repo: https://github.com/ROCm/triton/blob/triton-mlir/python/perf-kernels/flash-attention.py
|
||||
# Example results
|
||||
fused-attention-fwd-d128:
|
||||
BATCH HQ HK N_CTX_Q N_CTX_K TFLOPS
|
||||
0 16.0 16.0 16.0 1024.0 1024.0 287.528411
|
||||
1 8.0 16.0 16.0 2048.0 2048.0 287.490806
|
||||
2 4.0 16.0 16.0 4096.0 4096.0 345.966031
|
||||
3 2.0 16.0 16.0 8192.0 8192.0 361.369510
|
||||
4 1.0 16.0 16.0 16384.0 16384.0 356.873720
|
||||
5 2.0 48.0 48.0 1024.0 1024.0 216.916235
|
||||
6 2.0 48.0 48.0 2048.0 1024.0 271.027578
|
||||
7 2.0 48.0 48.0 4096.0 8192.0 337.367372
|
||||
8 2.0 48.0 48.0 8192.0 4096.0 363.481649
|
||||
9 2.0 48.0 48.0 16384.0 8192.0 375.013622
|
||||
10 8.0 16.0 16.0 1989.0 15344.0 321.791333
|
||||
11 4.0 16.0 16.0 4097.0 163.0 122.104888
|
||||
12 2.0 16.0 16.0 8122.0 2159.0 337.060283
|
||||
13 1.0 16.0 16.0 16281.0 7.0 5.234012
|
||||
14 2.0 48.0 48.0 1021.0 1020.0 214.657425
|
||||
15 2.0 48.0 48.0 2001.0 2048.0 314.429118
|
||||
16 2.0 48.0 48.0 3996.0 9639.0 330.411368
|
||||
17 2.0 48.0 48.0 8181.0 1021.0 324.614980
|
||||
|
||||
xFormers
|
||||
========
|
||||
|
||||
xFormers also improves the performance of attention modules. Although xFormers attention performs very
|
||||
similarly to Flash Attention 2 due to its tiling behavior of query, key, and value, it’s widely used for LLMs and
|
||||
Stable Diffusion models with the Hugging Face Diffusers library.
|
||||
|
||||
Installing CK xFormers
|
||||
----------------------
|
||||
|
||||
Use the following commands to install CK xFormers.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Install from source
|
||||
git clone https://github.com/ROCm/xformers.git
|
||||
cd xformers/
|
||||
git submodule update --init --recursive
|
||||
PYTORCH_ROCM_ARCH=gfx942 python setup.py install #Instinct MI300-series
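
Once installed, libraries such as Hugging Face Diffusers can route attention through xFormers with a single call. The
following minimal sketch assumes the ``diffusers`` package is installed and uses an example Stable Diffusion
checkpoint.

.. code-block:: python

   import torch
   from diffusers import StableDiffusionPipeline

   # Example checkpoint only; substitute your own model.
   pipe = StableDiffusionPipeline.from_pretrained(
       "stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16
   ).to("cuda")

   # Use the xFormers memory-efficient attention implementation.
   pipe.enable_xformers_memory_efficient_attention()

   image = pipe("a photo of an astronaut riding a horse").images[0]
   image.save("astronaut.png")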
|
||||
|
||||
PyTorch built-in acceleration
|
||||
=============================
|
||||
|
||||
`PyTorch compilation
|
||||
mode <https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html>`__
|
||||
synthesizes the model into a graph and then lowers it to primitive
|
||||
operators. These operators are compiled using TorchInductor, which uses
|
||||
OpenAI Triton as a building block for GPU acceleration. One advantage of
|
||||
PyTorch compilation mode is that its GPU kernels are written in Python,
|
||||
making modifying and extending them easier. PyTorch compilation mode
|
||||
often delivers higher performance, as model operations are fused before
|
||||
runtime, which allows for easy deployment of high-performance kernels.
|
||||
|
||||
PyTorch compilation
|
||||
-------------------
|
||||
|
||||
To utilize the PyTorch compilation mode, specific layers of the model
|
||||
must be explicitly assigned as compilation targets. In the case of LLM,
|
||||
where autoregressive token decoding generates dynamically changing
|
||||
key/value sizes, limiting the key/value size to a static dimension,
|
||||
``max_cache_length``, is necessary to utilize the performance benefits
|
||||
of the PyTorch compilation.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
# Sample script to run LLM with the static key-value cache and PyTorch compilation
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, StaticCache
|
||||
import torch
|
||||
from typing import Optional
|
||||
import os
|
||||
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||
model_name = "NousResearch/Meta-Llama-3-8B"
|
||||
prompts = []
|
||||
|
||||
for b in range(1):
|
||||
prompts.append("New york city is where "
|
||||
)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to(device).eval()
|
||||
inputs = tokenizer(prompts, return_tensors="pt").to(model.device)
|
||||
|
||||
def decode_one_tokens(model, cur_token, input_pos, cache_position):
|
||||
logits = model(cur_token, position_ids=input_pos, cache_position=cache_position, return_dict=False, use_cache=True)[0]
|
||||
new_token = torch.argmax(logits[:, -1], dim=-1)[:, None]
|
||||
return new_token
|
||||
|
||||
batch_size, seq_length = inputs["input_ids"].shape
|
||||
|
||||
# Static key-value cache
|
||||
max_cache_length = 1024
|
||||
max_new_tokens = 10
|
||||
model._setup_cache(StaticCache, batch_size, max_cache_len=max_cache_length)
|
||||
cache_position = torch.arange(seq_length, device=device)
|
||||
generated_ids = torch.zeros(batch_size, seq_length + max_new_tokens + 1, dtype=torch.int, device=device)
|
||||
generated_ids[:, cache_position] = inputs["input_ids"].to(device).to(torch.int)
|
||||
|
||||
logits = model(**inputs, cache_position=cache_position, return_dict=False, use_cache=True)[0]
|
||||
next_token = torch.argmax(logits[:, -1], dim=-1)[:, None]
|
||||
|
||||
# torch compilation
|
||||
decode_one_tokens = torch.compile(decode_one_tokens, mode="max-autotune-no-cudagraphs", fullgraph=True)
|
||||
|
||||
generated_ids[:, seq_length] = next_token[:, 0]
|
||||
cache_position = torch.tensor([seq_length + 1], device=device)
|
||||
|
||||
with torch.no_grad():
|
||||
for _ in range(1, max_new_tokens):
|
||||
with torch.backends.cuda.sdp_kernel(enable_flash=False, enable_mem_efficient=False, enable_math=True):
|
||||
next_token = decode_one_tokens(model, next_token.clone(), None, cache_position)
|
||||
generated_ids[:, cache_position] = next_token.int()
|
||||
cache_position += 1
|
||||
|
||||
.. _fine-tuning-llms-pytorch-tunableop:
|
||||
|
||||
PyTorch TunableOp
|
||||
------------------
|
||||
|
||||
ROCm PyTorch (2.2.0 and later) allows users to use high-performance ROCm
|
||||
GEMM kernel libraries through PyTorch's built-in TunableOp options.
|
||||
This enables users to automatically pick up the best-performing GEMM
|
||||
kernels from :doc:`rocBLAS <rocblas:index>` and :doc:`hipBLASLt <hipblaslt:index>` libraries during runtime.
|
||||
|
||||
During warm-up runs or offline profiling steps, users can create a GEMM table that enumerates the best-performing kernel for each GEMM shape. During the model's run, TunableOp replaces ``torch.nn.functional.linear(input, weight, bias=None)`` calls with the kernel specified in the GEMM table. The `TunableOp README <https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/cuda/tunable/README.md>`_ describes the available options.
||||
|
||||
.. code-block:: shell

    # To turn on TunableOp, simply set this environment variable
    export PYTORCH_TUNABLEOP_ENABLED=1

.. code-block:: python

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    A = torch.rand(100, 20, device="cuda")
    W = torch.rand(200, 20, device="cuda")
    Out = F.linear(A, W)
    print(Out.size())

The resulting ``tunableop_results0.csv`` file looks like this:

.. code-block:: text

    Validator,PT_VERSION,2.4.0
    Validator,ROCM_VERSION,6.1.0.0-82-5fabb4c
    Validator,HIPBLASLT_VERSION,0.7.0-1549b021
    Validator,GCN_ARCH_NAME,gfx942:sramecc+:xnack-
    Validator,ROCBLAS_VERSION,4.1.0-cefa4a9b-dirty
    GemmTunableOp_float_TN,tn_200_100_20,Gemm_Rocblas_32323,0.00669595
|
||||
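Depending on your PyTorch version, TunableOp can also be controlled programmatically through the ``torch.cuda.tunable`` module rather than environment variables. The following sketch assumes a build that ships this module, so verify its availability first.

.. code-block:: python

    # Sketch: enable TunableOp from Python (API availability depends on the PyTorch build)
    import torch
    import torch.nn.functional as F

    if hasattr(torch.cuda, "tunable"):
        torch.cuda.tunable.enable(True)          # equivalent to PYTORCH_TUNABLEOP_ENABLED=1
        torch.cuda.tunable.tuning_enable(True)   # allow new solutions to be tuned and recorded

    A = torch.rand(100, 20, device="cuda")
    W = torch.rand(200, 20, device="cuda")
    Out = F.linear(A, W)                         # GEMMs issued after enabling are tuned

    if hasattr(torch.cuda, "tunable"):
        torch.cuda.tunable.write_file()          # persist the tuning results CSV for later runs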
|
||||
.. image:: ../../../data/how-to/llm-fine-tuning-optimization/tunableop.png
|
||||
:alt: GEMM and TunableOp
|
||||
:align: center
|
||||
|
||||
Learn more about optimizing kernels with TunableOp in
|
||||
:ref:`Optimizing Triton kernels <mi300x-tunableop>`.
|
||||
|
||||
|
||||
FBGEMM and FBGEMM_GPU
|
||||
=====================
|
||||
|
||||
FBGEMM (Facebook General Matrix Multiplication) is a low-precision, high-performance CPU kernel library
|
||||
for matrix-matrix multiplications and convolutions. It is used for server-side inference
|
||||
and as a back end for PyTorch quantized operators. FBGEMM offers optimized on-CPU performance for reduced precision calculations,
|
||||
strong performance on native tensor formats, and the ability to generate
|
||||
high-performance shape- and size-specific kernels at runtime.
|
||||
|
||||
FBGEMM_GPU collects several high-performance PyTorch GPU operator libraries
|
||||
for use in training and inference. It provides efficient table-batched embedding functionality,
|
||||
data layout transformation, and quantization support.
|
||||
|
||||
For more information about FBGEMM and FBGEMM_GPU, see the `PyTorch FBGEMM GitHub <https://github.com/pytorch/FBGEMM>`_
|
||||
and the `PyTorch FBGEMM documentation <https://pytorch.org/FBGEMM/>`_.
|
||||
The `Meta blog post about FBGEMM <https://engineering.fb.com/2018/11/07/ml-applications/fbgemm/>`_ provides
|
||||
additional background about the library.
|
||||
|
||||
Installing FBGEMM_GPU
|
||||
----------------------
|
||||
|
||||
Installing FBGEMM_GPU consists of the following steps:
|
||||
|
||||
* Set up an isolated Miniconda environment
|
||||
* Install ROCm using Docker or the :doc:`package manager <rocm-install-on-linux:install/install-methods/package-manager-index>`
|
||||
* Install the nightly `PyTorch <https://pytorch.org/>`_ build
|
||||
* Complete the pre-build and build tasks
|
||||
|
||||
.. note::
|
||||
|
||||
FBGEMM_GPU doesn't require the installation of FBGEMM. To optionally install
|
||||
FBGEMM, see the `FBGEMM install instructions <https://pytorch.org/FBGEMM/fbgemm/development/BuildInstructions.html>`_.
|
||||
|
||||
Set up the Miniconda environment
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
To install Miniconda, use the following commands.
|
||||
|
||||
#. Install a `Miniconda environment <https://docs.anaconda.com/miniconda/>`_ for reproducible builds.
|
||||
All subsequent commands run inside this environment.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export PLATFORM_NAME="$(uname -s)-$(uname -m)"
|
||||
|
||||
# Set the Miniconda prefix directory
|
||||
miniconda_prefix=$HOME/miniconda
|
||||
|
||||
# Download the Miniconda installer
|
||||
wget -q "https://repo.anaconda.com/miniconda/Miniconda3-latest-${PLATFORM_NAME}.sh" -O miniconda.sh
|
||||
|
||||
# Run the installer
|
||||
bash miniconda.sh -b -p "$miniconda_prefix" -u
|
||||
|
||||
# Load the shortcuts
|
||||
. ~/.bashrc
|
||||
|
||||
# Run updates
|
||||
conda update -n base -c defaults -y conda
|
||||
|
||||
#. Create a Miniconda environment with Python 3.12:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
env_name=<ENV NAME>
|
||||
python_version=3.12
|
||||
|
||||
# Create the environment
|
||||
conda create -y --name ${env_name} python="${python_version}"
|
||||
|
||||
# Upgrade PIP and pyOpenSSL package
|
||||
conda run -n ${env_name} pip install --upgrade pip
|
||||
conda run -n ${env_name} python -m pip install "pyOpenSSL>22.1.0"
|
||||
|
||||
#. Install additional build tools:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
conda install -n ${env_name} -y \
|
||||
click \
|
||||
cmake \
|
||||
hypothesis \
|
||||
jinja2 \
|
||||
make \
|
||||
ncurses \
|
||||
ninja \
|
||||
numpy \
|
||||
scikit-build \
|
||||
wheel
|
||||
|
||||
Install the ROCm components
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
FBGEMM_GPU can run in a ROCm Docker container or in conjunction with the full ROCm installation.
|
||||
The Docker method is recommended because it requires fewer steps and provides a stable environment.
|
||||
|
||||
To run FBGEMM_GPU in the Docker container, pull the `Minimal Docker image for ROCm <https://hub.docker.com/r/rocm/rocm-terminal>`_.
|
||||
This image includes all preinstalled ROCm packages required to integrate FBGEMM. To pull
|
||||
and run the ROCm Docker image, use this command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Run for ROCm 6.2.0
|
||||
docker run -it --network=host --shm-size 16G --device=/dev/kfd --device=/dev/dri --group-add video \
|
||||
--cap-add=SYS_PTRACE --security-opt seccomp=unconfined --ipc=host rocm/rocm-terminal:6.2 /bin/bash
|
||||
|
||||
.. note::
|
||||
|
||||
The `Full Docker image for ROCm <https://hub.docker.com/r/rocm/dev-ubuntu-20.04>`_, which includes all
|
||||
ROCm packages, can also be used. However, it results in a very large container, so the minimal
|
||||
Docker image is recommended.
|
||||
|
||||
You can also install ROCm using the package manager. FBGEMM_GPU requires the installation of the full ROCm package.
|
||||
For more information, see :doc:`the ROCm installation guide <rocm-install-on-linux:install/detailed-install>`.
|
||||
The ROCm package also requires the :doc:`MIOpen <miopen:index>` component as a dependency.
|
||||
To install MIOpen, use the ``apt install`` command.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
apt install hipify-clang miopen-hip miopen-hip-dev
|
||||
|
||||
Install PyTorch
|
||||
^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Install `PyTorch <https://pytorch.org/>`_ using ``pip`` for the most reliable and consistent results.
|
||||
|
||||
#. Install the nightly PyTorch build using ``pip``.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Install the latest nightly, ROCm variant
|
||||
conda run -n ${env_name} pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.2/
|
||||
|
||||
#. Ensure PyTorch loads correctly. Verify the version and variant of the installation using an ``import`` test.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Ensure that the package loads properly
|
||||
conda run -n ${env_name} python -c "import torch.distributed"
|
||||
|
||||
# Verify the version and variant of the installation
|
||||
conda run -n ${env_name} python -c "import torch; print(torch.__version__)"
|
||||
|
||||
Perform the prebuild and build
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
#. Clone the FBGEMM repository and the relevant submodules. Use ``pip`` to install the
|
||||
components in ``requirements.txt``. Run the following commands inside the Miniconda environment.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Select a version tag
|
||||
FBGEMM_VERSION=v0.8.0
|
||||
|
||||
# Clone the repo along with its submodules
|
||||
git clone https://github.com/pytorch/FBGEMM.git --branch=${FBGEMM_VERSION} --recursive fbgemm_${FBGEMM_VERSION}
|
||||
|
||||
# Install additional required packages for building and testing
|
||||
cd fbgemm_${FBGEMM_VERSION}/fbgemm_gpu
|
||||
pip install -r requirements.txt
|
||||
|
||||
#. Clear the build cache to remove stale build information.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# !! Run in fbgemm_gpu/ directory inside the Conda environment !!
|
||||
|
||||
python setup.py clean
|
||||
|
||||
#. Set the wheel build variables, including the package name, Python version tag, and Python platform name.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Set the package name depending on the build variant
|
||||
export package_name=fbgemm_gpu_rocm
|
||||
|
||||
# Set the Python version tag. It should follow the convention `py<major><minor>`,
|
||||
# for example, Python 3.12 --> py312
|
||||
export python_tag=py312
|
||||
|
||||
# Determine the processor architecture
|
||||
export ARCH=$(uname -m)
|
||||
|
||||
# Set the Python platform name for the Linux case
|
||||
export python_plat_name="manylinux2014_${ARCH}"
|
||||
|
||||
#. Build FBGEMM_GPU for the ROCm platform. Set ``ROCM_PATH`` to the path to your ROCm installation.
|
||||
Run these commands from the ``fbgemm_gpu/`` directory inside the Miniconda environment.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# !! Run in the fbgemm_gpu/ directory inside the Conda environment !!
|
||||
|
||||
export ROCM_PATH=</path/to/rocm>
|
||||
|
||||
# Build for the target architecture of the ROCm device installed on the machine (for example, 'gfx942;gfx90a')
|
||||
# See the Linux system requirements page (reference/system-requirements) for a list of supported GPUs.
|
||||
export PYTORCH_ROCM_ARCH=$(${ROCM_PATH}/bin/rocminfo | grep -o -m 1 'gfx.*')
|
||||
|
||||
# Build the wheel artifact only
|
||||
python setup.py bdist_wheel \
|
||||
--package_variant=rocm \
|
||||
--python-tag="${python_tag}" \
|
||||
--plat-name="${python_plat_name}" \
|
||||
-DHIP_ROOT_DIR="${ROCM_PATH}" \
|
||||
-DCMAKE_C_FLAGS="-DTORCH_USE_HIP_DSA" \
|
||||
-DCMAKE_CXX_FLAGS="-DTORCH_USE_HIP_DSA"
|
||||
|
||||
# Build and install the library into the Conda environment
|
||||
python setup.py install \
|
||||
--package_variant=rocm \
|
||||
-DHIP_ROOT_DIR="${ROCM_PATH}" \
|
||||
-DCMAKE_C_FLAGS="-DTORCH_USE_HIP_DSA" \
|
||||
-DCMAKE_CXX_FLAGS="-DTORCH_USE_HIP_DSA"
|
||||
|
||||
Post-build validation
|
||||
----------------------
|
||||
|
||||
After building FBGEMM_GPU, run some verification checks to ensure the build is correct. Continue
|
||||
to run all commands inside the ``fbgemm_gpu/`` directory inside the Miniconda environment.
|
||||
|
||||
#. The build process generates many build artifacts and C++ templates, so
|
||||
it is important to confirm no undefined symbols remain.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# !! Run in fbgemm_gpu/ directory inside the Conda environment !!
|
||||
|
||||
# Locate the built .SO file
|
||||
fbgemm_gpu_lib_path=$(find . -name fbgemm_gpu_py.so)
|
||||
|
||||
# Check that the undefined symbols don't include fbgemm_gpu-defined functions
|
||||
nm -gDCu "${fbgemm_gpu_lib_path}" | sort
|
||||
|
||||
#. Verify the referenced version number of ``GLIBCXX`` and the presence of certain function symbols:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# !! Run in fbgemm_gpu/ directory inside the Conda environment !!
|
||||
|
||||
# Locate the built .SO file
|
||||
fbgemm_gpu_lib_path=$(find . -name fbgemm_gpu_py.so)
|
||||
|
||||
# Note the versions of GLIBCXX referenced by the .SO
|
||||
# The libstdc++.so.6 available on the install target must support these versions
|
||||
objdump -TC "${fbgemm_gpu_lib_path}" | grep GLIBCXX | sed 's/.*GLIBCXX_\([.0-9]*\).*/GLIBCXX_\1/g' | sort -Vu | cat
|
||||
|
||||
# Test for the existence of a given function symbol in the .SO
|
||||
nm -gDC "${fbgemm_gpu_lib_path}" | grep " fbgemm_gpu::merge_pooled_embeddings("
|
||||
nm -gDC "${fbgemm_gpu_lib_path}" | grep " fbgemm_gpu::jagged_2d_to_dense("
|
||||
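As a final smoke test, you can confirm from Python that the extension imports and registers its operators with PyTorch. This is a minimal sketch; the operator checked below is one example chosen for illustration and assumed to exist in your FBGEMM_GPU build.

.. code-block:: python

    # Minimal sketch: confirm the installed fbgemm_gpu extension loads and registers torch.ops.fbgemm.*
    import torch
    import fbgemm_gpu  # importing the package registers the FBGEMM operators with PyTorch

    print(torch.__version__)
    # Spot-check one representative operator (name assumed for illustration)
    print(hasattr(torch.ops.fbgemm, "asynchronous_complete_cumsum"))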
|
||||
Testing FBGEMM
|
||||
----------------------
|
||||
|
||||
FBGEMM includes tests and benchmarks to validate performance. To run these tests,
|
||||
you must use ROCm 5.7 or a more recent version on the host and container. To run FBGEMM tests,
|
||||
follow these instructions:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# !! Run inside the Conda environment !!
|
||||
|
||||
# From the /fbgemm_gpu/ directory
|
||||
cd test
|
||||
|
||||
export FBGEMM_TEST_WITH_ROCM=1
|
||||
# Enable for debugging failed kernel executions
|
||||
export HIP_LAUNCH_BLOCKING=1
|
||||
|
||||
# Run the test
|
||||
python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning split_table_batched_embeddings_test.py
|
||||
|
||||
To run the FBGEMM_GPU ``uvm`` test, use these commands. These tests only support the AMD MI210 and
|
||||
more recent accelerators.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Run this inside the Conda environment from the /fbgemm_gpu/ directory
|
||||
export HSA_XNACK=1
|
||||
cd test
|
||||
|
||||
python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning ./uvm/uvm_test.py
|
||||
|
||||
.. meta::
|
||||
:description: How to use model quantization techniques to speed up inference.
|
||||
:keywords: ROCm, LLM, fine-tuning, usage, tutorial, quantization, Quark, GPTQ, transformers, bitsandbytes
|
||||
|
||||
*****************************
|
||||
Model quantization techniques
|
||||
*****************************
|
||||
|
||||
Quantization reduces the model size compared to its native full-precision version, making it easier to fit large models onto accelerators or GPUs with limited memory. This section explains how to perform LLM quantization using AMD Quark, GPTQ, and bitsandbytes on AMD Instinct hardware.
|
||||
|
||||
.. _quantize-llms-quark:
|
||||
|
||||
AMD Quark
|
||||
=========
|
||||
|
||||
`AMD Quark <https://quark.docs.amd.com/latest/>`_ offers a leading, efficient, and scalable quantization solution tailored to AMD Instinct GPUs. It supports ``FP8`` and ``INT8`` quantization for activations, weights, and the KV cache, including ``FP8`` attention. For very large models, it employs a two-level ``INT4-FP8`` scheme (storing weights in ``INT4`` while computing with ``FP8``) for nearly 4× compression without sacrificing accuracy. Quark scales across multiple GPUs and handles ultra-large models like Llama-3.1-405B efficiently. Quantized ``FP8`` models like Llama, Mixtral, and Grok-1 are available under the `AMD organization on Hugging Face <https://huggingface.co/collections/amd/quark-quantized-ocp-fp8-models-66db7936d18fcbaf95d4405c>`_ and can be deployed directly via `vLLM <https://github.com/vllm-project/vllm/tree/main/vllm>`_.
|
||||
|
||||
Installing Quark
|
||||
-------------------
|
||||
|
||||
The latest release of Quark can be installed with pip:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
pip install amd-quark
|
||||
|
||||
For detailed installation instructions, refer to the `Quark documentation <https://quark.docs.amd.com/latest/install.html>`_.
|
||||
|
||||
|
||||
Using Quark for quantization
|
||||
-----------------------------
|
||||
|
||||
#. First, load the pre-trained model and its corresponding tokenizer using the Hugging Face ``transformers`` library.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM
|
||||
|
||||
MODEL_ID = "meta-llama/Llama-2-70b-chat-hf"
|
||||
MAX_SEQ_LEN = 512
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
MODEL_ID, device_map="auto", torch_dtype="auto",
|
||||
)
|
||||
model.eval()
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, model_max_length=MAX_SEQ_LEN)
|
||||
tokenizer.pad_token = tokenizer.eos_token
|
||||
|
||||
#. Prepare the calibration DataLoader (static quantization requires calibration data).
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from datasets import load_dataset
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
BATCH_SIZE = 1
|
||||
NUM_CALIBRATION_DATA = 512
|
||||
|
||||
dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation")
|
||||
text_data = dataset["text"][:NUM_CALIBRATION_DATA]
|
||||
|
||||
tokenized_outputs = tokenizer(
|
||||
text_data, return_tensors="pt", padding=True, truncation=True, max_length=MAX_SEQ_LEN
|
||||
)
|
||||
calib_dataloader = DataLoader(
|
||||
tokenized_outputs['input_ids'], batch_size=BATCH_SIZE, drop_last=True
|
||||
)
|
||||
|
||||
#. Define the quantization configuration. See the comments in the following code snippet for descriptions of each configuration option.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from quark.torch.quantization import (Config, QuantizationConfig,
|
||||
FP8E4M3PerTensorSpec)
|
||||
|
||||
# Define fp8/per-tensor/static spec.
|
||||
FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(observer_method="min_max",
|
||||
is_dynamic=False).to_quantization_spec()
|
||||
|
||||
# Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC.
|
||||
global_quant_config = QuantizationConfig(input_tensors=FP8_PER_TENSOR_SPEC,
|
||||
weight=FP8_PER_TENSOR_SPEC)
|
||||
|
||||
# Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC.
|
||||
KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC
|
||||
kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"]
|
||||
kv_cache_quant_config = {name :
|
||||
QuantizationConfig(input_tensors=global_quant_config.input_tensors,
|
||||
weight=global_quant_config.weight,
|
||||
output_tensors=KV_CACHE_SPEC)
|
||||
for name in kv_cache_layer_names_for_llama}
|
||||
layer_quant_config = kv_cache_quant_config.copy()
|
||||
|
||||
EXCLUDE_LAYERS = ["lm_head"]
|
||||
quant_config = Config(
|
||||
global_quant_config=global_quant_config,
|
||||
layer_quant_config=layer_quant_config,
|
||||
kv_cache_quant_config=kv_cache_quant_config,
|
||||
exclude=EXCLUDE_LAYERS)
|
||||
|
||||
#. Quantize the model and export it.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import torch
|
||||
from quark.torch import ModelQuantizer, ModelExporter
|
||||
from quark.torch.export import ExporterConfig, JsonExporterConfig
|
||||
|
||||
# Apply quantization.
|
||||
quantizer = ModelQuantizer(quant_config)
|
||||
quant_model = quantizer.quantize_model(model, calib_dataloader)
|
||||
|
||||
# Freeze quantized model to export.
|
||||
freezed_model = quantizer.freeze(model)
|
||||
|
||||
# Define export config.
|
||||
LLAMA_KV_CACHE_GROUP = ["*k_proj", "*v_proj"]
|
||||
export_config = ExporterConfig(json_export_config=JsonExporterConfig())
|
||||
export_config.json_export_config.kv_cache_group = LLAMA_KV_CACHE_GROUP
|
||||
|
||||
EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor"
|
||||
exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR)
|
||||
with torch.no_grad():
|
||||
exporter.export_safetensors_model(freezed_model,
|
||||
quant_config=quant_config, tokenizer=tokenizer)
|
||||
|
||||
Evaluating the quantized model with vLLM
|
||||
----------------------------------------
|
||||
|
||||
The exported Quark-quantized model can be loaded directly by vLLM for inference. You need to specify the model path and inform vLLM about the quantization method (``quantization='quark'``) and the KV cache data type (``kv_cache_dtype='fp8'``).
|
||||
Use the ``LLM`` interface to load the model:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
# Sample prompts.
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
# Create a sampling params object.
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
|
||||
# Create an LLM.
|
||||
llm = LLM(model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor",
|
||||
kv_cache_dtype='fp8',quantization='quark')
|
||||
# Generate texts from the prompts. The output is a list of RequestOutput objects
|
||||
# that contain the prompt, generated text, and other information.
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
# Print the outputs.
|
||||
print("\nGenerated Outputs:\n" + "-" * 60)
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}")
|
||||
print(f"Output: {generated_text!r}")
|
||||
print("-" * 60)
|
||||
|
||||
You can also evaluate the quantized model's accuracy on standard benchmarks using the `lm-evaluation-harness <https://github.com/EleutherAI/lm-evaluation-harness>`_. Pass the necessary vLLM arguments to ``lm_eval`` via ``--model_args``.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
lm_eval --model vllm \
|
||||
--model_args pretrained=Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor,kv_cache_dtype='fp8',quantization='quark' \
|
||||
--tasks gsm8k
|
||||
|
||||
This provides a standardized way to measure the performance impact of quantization.

.. _fine-tune-llms-gptq:
|
||||
|
||||
GPTQ
|
||||
====
|
||||
|
||||
GPTQ is a post-training quantization technique where each row of the weight matrix is quantized independently to find a version of the weights that minimizes quantization error. The weights are quantized to ``int4`` but are restored to ``fp16`` on the fly during inference. This can reduce memory usage by roughly a factor of four. A speedup in inference is also expected because GPTQ models use a lower bit width for the weights, which takes less time to transfer from memory.
|
||||
|
||||
Before setting up the GPTQ configuration in Transformers, ensure the `AutoGPTQ <https://github.com/AutoGPTQ/AutoGPTQ>`_ library
|
||||
is installed.
|
||||
|
||||
Installing AutoGPTQ
|
||||
-------------------
|
||||
|
||||
The AutoGPTQ library implements the GPTQ algorithm.
|
||||
|
||||
#. Use the following command to install the latest stable release of AutoGPTQ from pip.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# This installs a pre-built wheel for a specific ROCm version.
|
||||
|
||||
pip install auto-gptq --no-build-isolation --extra-index-url https://huggingface.github.io/autogptq-index/whl/rocm573/
|
||||
|
||||
Or, install AutoGPTQ from source for the appropriate ROCm version (for example, ROCm 6.1).
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Clone the source code.
|
||||
git clone https://github.com/AutoGPTQ/AutoGPTQ.git
|
||||
cd AutoGPTQ
|
||||
|
||||
# Speed up the compilation by specifying PYTORCH_ROCM_ARCH to target device.
|
||||
PYTORCH_ROCM_ARCH=gfx942 ROCM_VERSION=6.1 pip install .
|
||||
|
||||
# Show the package after the installation
|
||||
|
||||
#. Run ``pip show auto-gptq`` to print information for the installed ``auto-gptq`` package. Its output should look like
|
||||
this:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
Name: auto-gptq
|
||||
Version: 0.8.0.dev0+rocm6.1
|
||||
...
|
||||
|
||||
Using GPTQ with AutoGPTQ
|
||||
------------------------
|
||||
|
||||
#. Run the following code snippet.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from transformers import AutoTokenizer, TextGenerationPipeline
|
||||
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
|
||||
base_model_name = "NousResearch/Llama-2-7b-hf"
|
||||
quantized_model_name = "llama-2-7b-hf-gptq"
|
||||
tokenizer = AutoTokenizer.from_pretrained(base_model_name, use_fast=True)
|
||||
examples = [
|
||||
tokenizer(
|
||||
"auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
|
||||
)
|
||||
]
|
||||
print(examples)
|
||||
|
||||
The resulting examples should be a list of dictionaries whose keys are ``input_ids`` and ``attention_mask``.
|
||||
|
||||
#. Set up the quantization configuration using the following snippet.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
quantize_config = BaseQuantizeConfig(
|
||||
bits=4, # quantize model to 4-bit
|
||||
group_size=128, # it is recommended to set the value to 128
|
||||
desc_act=False,
|
||||
)
|
||||
|
||||
#. Load the non-quantized model using the AutoGPTQ class and run the quantization.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
# Import auto_gptq class.
|
||||
from auto_gptq import AutoGPTQForCausalLM
|
||||
|
||||
# Load non-quantized model.
|
||||
base_model = AutoGPTQForCausalLM.from_pretrained(base_model_name, quantize_config, device_map="auto")
|
||||
base_model.quantize(examples)
|
||||
|
||||
# Save quantized model.
|
||||
base_model.save_quantized(quantized_model_name)
|
||||
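After saving, you can optionally reload the quantized checkpoint and run a short generation to confirm it works. This is a hedged sketch using ``from_quantized``, the AutoGPTQ loader for already-quantized checkpoints; the model names match the ones defined above.

.. code-block:: python

    # Sketch: reload the GPTQ checkpoint saved above and generate a few tokens
    from transformers import AutoTokenizer
    from auto_gptq import AutoGPTQForCausalLM

    base_model_name = "NousResearch/Llama-2-7b-hf"
    quantized_model_name = "llama-2-7b-hf-gptq"

    tokenizer = AutoTokenizer.from_pretrained(base_model_name, use_fast=True)
    reloaded = AutoGPTQForCausalLM.from_quantized(quantized_model_name, device="cuda:0")

    inputs = tokenizer("auto-gptq is", return_tensors="pt").to("cuda:0")
    output_ids = reloaded.generate(**inputs, max_new_tokens=32)
    print(tokenizer.decode(output_ids[0], skip_special_tokens=True))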
|
||||
Using GPTQ with Hugging Face Transformers
|
||||
------------------------------------------
|
||||
|
||||
#. To perform a GPTQ quantization using Hugging Face Transformers, you need to create a ``GPTQConfig`` instance and set the
|
||||
number of bits to quantize to, and a dataset to calibrate the weights.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig
|
||||
|
||||
base_model_name = " NousResearch/Llama-2-7b-hf"
|
||||
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
|
||||
gptq_config = GPTQConfig(bits=4, dataset="c4", tokenizer=tokenizer)
|
||||
|
||||
#. Load a model to quantize using ``AutoModelForCausalLM`` and pass the ``gptq_config`` to its ``from_pretrained`` method. Set ``device_map="auto"`` to automatically offload the model to available GPU resources.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
quantized_model = AutoModelForCausalLM.from_pretrained(
|
||||
base_model_name,
|
||||
device_map="auto",
|
||||
quantization_config=gptq_config)
|
||||
|
||||
#. Once the model is quantized, you can push the model and tokenizer to the Hugging Face Hub for easy sharing and access.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
quantized_model.push_to_hub("llama-2-7b-hf-gptq")
|
||||
tokenizer.push_to_hub("llama-2-7b-hf-gptq")
|
||||
|
||||
Or, you can save the model locally using the following snippet.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
quantized_model.save_pretrained("llama-2-7b-gptq")
|
||||
tokenizer.save_pretrained("llama-2-7b-gptq")
|
||||
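A saved GPTQ checkpoint can later be reloaded for inference with a standard ``from_pretrained`` call, since Transformers detects the quantization configuration stored alongside the model. A minimal sketch, assuming the local path used above:

.. code-block:: python

    # Sketch: reload the locally saved GPTQ model and run a short generation
    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("llama-2-7b-gptq")
    reloaded_model = AutoModelForCausalLM.from_pretrained("llama-2-7b-gptq", device_map="auto")

    inputs = tokenizer("What is a large language model?", return_tensors="pt").to(reloaded_model.device)
    output_ids = reloaded_model.generate(**inputs, max_new_tokens=32)
    print(tokenizer.decode(output_ids[0], skip_special_tokens=True))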
|
||||
ExLlama-v2 support
|
||||
------------------
|
||||
|
||||
ExLlama is a Python/C++/CUDA implementation of the Llama model that is
|
||||
designed for faster inference with 4-bit GPTQ weights. The ExLlama
|
||||
kernel is activated by default when users create a ``GPTQConfig`` object. To
|
||||
boost inference speed even further on Instinct accelerators, use the ExLlama-v2
|
||||
kernels by configuring the ``exllama_config`` parameter as follows.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from transformers import AutoModelForCausalLM, GPTQConfig
|
||||
#pretrained_model_dir = "meta-llama/Llama-2-7b"
|
||||
base_model_name = "NousResearch/Llama-2-7b-hf"
|
||||
gptq_config = GPTQConfig(bits=4, dataset="c4", exllama_config={"version":2})
|
||||
quantized_model = AutoModelForCausalLM.from_pretrained(
|
||||
base_model_name,
|
||||
device_map="auto",
|
||||
quantization_config=gptq_config)
|
||||
|
||||
bitsandbytes
|
||||
============
|
||||
|
||||
The `ROCm-aware bitsandbytes <https://github.com/ROCm/bitsandbytes>`_ library is
|
||||
a lightweight Python wrapper around CUDA custom functions, in particular 8-bit optimizers, matrix multiplication, and
|
||||
8-bit and 4-bit quantization functions. The library includes quantization primitives for 8-bit and 4-bit operations
|
||||
through ``bitsandbytes.nn.Linear8bitLt`` and ``bitsandbytes.nn.Linear4bit`` and 8-bit optimizers through the
|
||||
``bitsandbytes.optim`` module. These modules are supported on AMD Instinct accelerators.
|
||||
|
||||
Installing bitsandbytes
|
||||
-----------------------
|
||||
|
||||
#. To install bitsandbytes for ROCm 6.0 (and later), use the following commands.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Clone the github repo
|
||||
git clone --recurse https://github.com/ROCm/bitsandbytes.git
|
||||
cd bitsandbytes
|
||||
git checkout rocm_enabled_multi_backend
|
||||
|
||||
# Install dependencies
|
||||
pip install -r requirements-dev.txt
|
||||
|
||||
# Use -DBNB_ROCM_ARCH to specify target GPU arch
|
||||
cmake -DBNB_ROCM_ARCH="gfx942" -DCOMPUTE_BACKEND=hip -S .
|
||||
|
||||
# Compile the project
|
||||
make
|
||||
|
||||
# Install
|
||||
python setup.py install
|
||||
|
||||
#. Run ``pip show bitsandbytes`` to show the information about the installed bitsandbytes package. Its output should
|
||||
look like the following.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
Name: bitsandbytes
|
||||
Version: 0.44.0.dev0
|
||||
...
|
||||
|
||||
Using bitsandbytes primitives
|
||||
-----------------------------
|
||||
|
||||
To get started with bitsandbytes primitives, use the following code as reference.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import bitsandbytes as bnb
|
||||
|
||||
# Use Int8 Matrix Multiplication
|
||||
bnb.matmul(..., threshold=6.0)
|
||||
|
||||
# Use bitsandbytes 8-bit Optimizers
|
||||
adam = bnb.optim.Adam8bit(model.parameters(), lr=0.001, betas=(0.9, 0.995))
|
||||
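The following sketch places the 8-bit optimizer in a complete, if toy, training step; the model and data are placeholders chosen only to make the example runnable.

.. code-block:: python

    # Sketch: one training step with the bitsandbytes 8-bit Adam optimizer
    import torch
    import bitsandbytes as bnb

    model = torch.nn.Linear(1024, 1024).cuda()
    optimizer = bnb.optim.Adam8bit(model.parameters(), lr=1e-3, betas=(0.9, 0.995))

    x = torch.randn(16, 1024, device="cuda")
    target = torch.randn(16, 1024, device="cuda")

    loss = torch.nn.functional.mse_loss(model(x), target)
    loss.backward()
    optimizer.step()       # optimizer states are stored in 8-bit, reducing memory use
    optimizer.zero_grad()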
|
||||
Using bitsandbytes with Hugging Face Transformers
|
||||
-------------------------------------------------
|
||||
|
||||
To load a Transformers model in 4-bit, set ``load_in_4bit=True`` in ``BitsAndBytesConfig``.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
|
||||
|
||||
base_model_name = "NousResearch/Llama-2-7b-hf"
|
||||
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
|
||||
bnb_model_4bit = AutoModelForCausalLM.from_pretrained(
|
||||
base_model_name,
|
||||
device_map="auto",
|
||||
quantization_config=quantization_config)
|
||||
|
||||
# Check the memory footprint with get_memory_footprint method
|
||||
print(bnb_model_4bit.get_memory_footprint())
|
||||
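``BitsAndBytesConfig`` also exposes finer-grained 4-bit options. The following sketch shows an NF4 configuration with double quantization and a bfloat16 compute dtype; treat the chosen values as illustrative defaults rather than tuned recommendations.

.. code-block:: python

    # Sketch: a more detailed 4-bit configuration (NF4, double quantization, bfloat16 compute)
    import torch
    from transformers import AutoModelForCausalLM, BitsAndBytesConfig

    nf4_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",              # NormalFloat4 quantization data type
        bnb_4bit_use_double_quant=True,         # also quantize the quantization constants
        bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for the matmul compute
    )

    bnb_model_nf4 = AutoModelForCausalLM.from_pretrained(
        "NousResearch/Llama-2-7b-hf",
        device_map="auto",
        quantization_config=nf4_config)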
|
||||
To load a model in 8-bit for inference, use the ``load_in_8bit`` option.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
|
||||
|
||||
base_model_name = "NousResearch/Llama-2-7b-hf"
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
bnb_model_8bit = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    quantization_config=quantization_config)

prompt = "What is a large language model?"
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
generated_ids = bnb_model_8bit.generate(**inputs)
|
||||
outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
|
||||
|
||||
|
||||
.. meta::
|
||||
:description: How to optimize Triton kernels for ROCm.
|
||||
:keywords: ROCm, LLM, fine-tuning, usage, MI300X, tutorial, Triton, kernel, performance, optimization
|
||||
|
||||
*************************
|
||||
Optimizing Triton kernels
|
||||
*************************
|
||||
|
||||
This section introduces the general steps for
|
||||
`Triton <https://openai.com/index/triton/>`_ kernel optimization. Broadly,
|
||||
Triton kernel optimization is similar to :doc:`HIP <hip:how-to/performance_guidelines>`
|
||||
and CUDA kernel optimization.
|
||||
|
||||
Refer to the
|
||||
:ref:`Triton kernel performance optimization <mi300x-triton-kernel-performance-optimization>`
|
||||
section of the :doc:`workload` guide
|
||||
for detailed information.
|
||||
|
||||
Triton kernel performance optimization includes the following topics.
|
||||
|
||||
* :ref:`mi300x-autotunable-kernel-config`
|
||||
|
||||
* :ref:`mi300x-mlir-analysis`
|
||||
|
||||
* :ref:`mi300x-assembly-analysis`
|
||||
|
||||
* :ref:`mi300x-torchinductor-tuning`
|
||||
|
||||
* :ref:`mi300x-compute-kernel-occ`
|
||||
|
||||
---
|
||||
myst:
|
||||
html_meta:
|
||||
"description": "How to optimize machine learning workloads with Composable Kernel (CK)."
|
||||
"keywords": "mixed, precision, kernel, inference, linear, algebra, ck, GEMM"
|
||||
---
|
||||
|
||||
# Optimizing with Composable Kernel
|
||||
|
||||
The AMD ROCm Composable Kernel (CK) library provides a programming model for writing performance-critical kernels for machine learning workloads. It generates a general-purpose kernel during the compilation phase through a C++ template, enabling developers to achieve operation fusions on different data precisions.
|
||||
|
||||
This article gives a high-level overview of the CK General Matrix Multiplication (GEMM) kernel, based on the `03_gemm_bias_relu` design example. It also outlines the steps to construct the kernel and run it. Moreover, the article provides a detailed implementation of running SmoothQuant quantized INT8 models on AMD Instinct MI300X accelerators using CK.
|
||||
|
||||
## High-level overview: a CK GEMM instance
|
||||
|
||||
GEMM is a fundamental block in linear algebra, machine learning, and deep neural networks. It is defined as the operation:
|
||||
{math}`E = α \times (A \times B) + β \times (D)`, with A and B as matrix inputs, α and β as scalar inputs, and D as a pre-existing matrix.
|
||||
Take the commonly used linear transformation in a fully connected layer as an example. These terms correspond to input activation (A), weight (B), bias (D), and output (E), respectively. The example employs a `DeviceGemmMultipleD_Xdl_CShuffle` struct from CK library as the fundamental instance to explore the compute capability of AMD Instinct accelerators for the computation of GEMM. The implementation of the instance contains two phases:
|
||||
|
||||
- [Template parameter definition](#template-parameter-definition)
|
||||
- [Instantiating and running the templated kernel](#instantiating-and-running-the-templated-kernel)
|
||||
|
||||
### Template parameter definition
|
||||
|
||||
The template parameters of the instance are grouped into four parameter types:
|
||||
|
||||
- [Parameters for determining matrix data precision](matrix-data-precision)
|
||||
- [Parameters for determining matrix data layout](matrix-data-layout)
|
||||
- [Parameters for determining extra operations on matrix elements](matrix-element-operation)
|
||||
- [Performance-oriented tunable parameters](tunable-parameters)
|
||||
|
||||
<!--
|
||||
================
|
||||
### Figure 2
|
||||
================ -->
|
||||
```{figure} ../../../data/how-to/llm-fine-tuning-optimization/ck-template_parameters.jpg
|
||||
The template parameters of the selected GEMM kernel are classified into four groups. These template parameter groups should be defined properly before running the instance.
|
||||
```
|
||||
|
||||
(matrix-data-precision)=
|
||||
|
||||
#### Matrix data precision
|
||||
|
||||
A, B, D, and E are defined as half-precision floating-point datatypes. The multiply-add results of matrix A and B are added with a pre-existing matrix D (half-precision), and the final GEMM results are also half-precision floating-points.
|
||||
|
||||
```c++
|
||||
using ADataType = F16;
|
||||
using BDataType = F16;
|
||||
using AccDataType = F32;
|
||||
using CShuffleDataType = F16;
|
||||
using DDataType = F16;
|
||||
using EDataType = F16;
|
||||
```
|
||||
|
||||
`ADataType` and `BDataType` denote the data precision of the A and B input matrices. `AccDataType` determines the data precision used for representing the multiply-add results of A and B elements. These results are stored in a `CShuffle` module in the local data share (LDS), a low-latency, high-bandwidth, explicitly addressed memory used for data exchange within a workgroup, for later use.
|
||||
|
||||
`CShuffleDataType` denotes the data precision of `CShuffle` in LDS.
|
||||
|
||||
`DDataType` denotes the data precision of the pre-existing D matrix stored in GPU global memory, while `EDatatype` denotes the data precision of the final output. The CK kernel supports a fusion strategy so that `CShuffle` can be added with a single pre-existing matrix in the same GPU kernel for better performance.
|
||||
|
||||
(matrix-data-layout)=
|
||||
|
||||
#### Matrix data layout
|
||||
|
||||
```c++
|
||||
using ALayout = Row;
|
||||
using BLayout = Col;
|
||||
using DLayout = Row;
|
||||
using ELayout = Row;
|
||||
```
|
||||
|
||||
Following the convention of various linear algebra libraries, CK assumes that the input matrix A is an M x K matrix, meaning the matrix has M rows and K columns. Similarly, matrix B is assumed to be K x N, meaning it has K rows and N columns. In computing, row-major order and column-major order are commonly used ways to store matrices in linear storage. Once the storage pattern of these matrices is known, an optimized memory access scheme can be applied to achieve better performance.
|
||||
|
||||
(matrix-element-operation)=
|
||||
|
||||
#### Matrix element operation
|
||||
|
||||
```c++
|
||||
using AElementOp = PassThrough;
|
||||
using BElementOp = PassThrough;
|
||||
using CDEElementOp = AddRelu;
|
||||
```
|
||||
|
||||
CK supports pre-processing of the matrices before calculating GEMM, that is, `C = AElementOp(A) * BElementOp(B)`. It also supports post-processing of the GEMM results, that is, `E = CDEElementOp(C, D)`.

`AElementOp` and `BElementOp` determine the operations applied to matrices A and B separately before GEMM, which is achieved by binding the operation with a C++ struct function.

The above `PassThrough` denotes that no operation is performed on the target matrix. `CDEElementOp` determines the operations applied to the `CShuffle` output and matrix D. The following binding struct `AddRelu` shows an example that adds the `CShuffle` output and matrix D, applies a ReLU (Rectified Linear Unit) operation to the addition result, and then passes the result to matrix E.
|
||||
|
||||
```c++
|
||||
struct AddRelu
|
||||
{
|
||||
__host__ __device__ void operator()(ck::half_t& e, const ck::half_t& c, const ck::half_t& d) const
|
||||
{
|
||||
const ck::half_t x = c + d;
|
||||
e = x > 0 ? x : 0;
|
||||
}
|
||||
};
|
||||
```
|
||||
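For reference, the epilogue that `AddRelu` fuses into the kernel corresponds to the following plain PyTorch expression. This sketch only restates the math with illustrative shapes; it is not CK code.

```python
# Reference sketch (not CK code): AddRelu computes E = ReLU(A x B + D) in a single fused kernel
import torch

M, K, N = 128, 64, 256
A = torch.randn(M, K)
B = torch.randn(K, N)
D = torch.randn(M, N)

E = torch.relu(A @ B + D)  # the CK instance produces this result without materializing A @ B + D separately
print(E.shape)
```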
|
||||
(tunable-parameters)=
|
||||
|
||||
#### Tunable parameters
|
||||
|
||||
The CK instance includes a series of tunable template parameters to control the parallel granularity of the workload to achieve load balancing on different hardware platforms.
|
||||
|
||||
These parameters include Block Size, M/N/K Per Block, M/N per XDL, AK1, BK1, etc.
|
||||
|
||||
- Block Size determines the number of threads in the thread block.
|
||||
- M/N/K Per Block determines the size of tile that each thread block is responsible for calculating.
|
||||
- M/N Per XDL refers to M/N size for Instinct accelerator Matrix Fused Multiply Add (MFMA) instructions operating on a per-wavefront basis.
|
||||
- A/B K1 is related to the data type. It can be any value ranging from 1 to K Per Block. To achieve optimal load/store performance, 128-bit per load is suggested. In addition, the A/B loading parameters must be changed accordingly to match the A/B K1 value; otherwise, compilation errors will result.
|
||||
|
||||
Conditions for achieving computational load balancing on different hardware platforms can vary.
|
||||
|
||||
### Instantiating and running the templated kernel
|
||||
|
||||
After determining the template parameters, we instantiate the kernel with actual arguments. Do one of the following:
|
||||
|
||||
- Use `GetDeviceBuffer` from CK’s custom struct `DeviceMem` to pass the element values of the matrices that need to be calculated.
|
||||
- Allocate device buffer via `hipMalloc`. Ensure the device buffer size can fit the matrix size.
|
||||
- Pass matrix elements through the `data_ptr` method in the `Tensor` object if the matrix to be calculated is of `Tensor` type.
|
||||
|
||||
The row, column, and stride information of the input matrices is also passed to the instance. For batched GEMM, you must pass in additional batch count and batch stride values. The extra operations for pre- and post-processing are also passed as actual arguments; for example, α and β for GEMM scaling operations. Afterward, the instantiated kernel is launched by the invoker, as illustrated in Figure 3.
|
||||
|
||||
<!--
|
||||
================
|
||||
### Figure 3
|
||||
================ -->
|
||||
```{figure} ../../../data/how-to/llm-fine-tuning-optimization/ck-kernel_launch.jpg
|
||||
Templated kernel launching consists of kernel instantiation, making arguments by passing in actual application parameters, creating an invoker, and running the instance through the invoker.
|
||||
```
|
||||
|
||||
## Developing fused INT8 kernels for SmoothQuant models
|
||||
|
||||
[SmoothQuant](https://github.com/mit-han-lab/smoothquant) (SQ) is a quantization algorithm that enables an INT8 quantization of both weights and activations for all the matrix multiplications in LLM. The required GPU kernel functionalities used to accelerate the inference of SQ models on Instinct accelerators are shown in the following table.
|
||||
|
||||
:::{table} Functionalities used to implement SmoothQuant model inference.
|
||||
|
||||
| Functionality descriptions | Corresponding wrappers |
|
||||
|:-------------------------------------|-----------------------------------------|
|
||||
| {math}`E = α \times (A \times B) + β \times (D)`, where A, B, D, E are INT8 2-D tensors; | E = Linear_ABDE_I8(A, B, D, {math}`\alpha`, {math}`\beta`) |
|
||||
| {math}`E = RELU (α \times (A \times B) + β \times (D))`, where A, B, D, E are INT8 2-D tensors; | E = Linear_ReLU_ABDE_I8(A, B, D, {math}`\alpha`, {math}`\beta`) |
|
||||
| {math}`E = α \times (A \times B) + β \times (D)`, where A, B are INT8 2-D tensors, D and E are FP32 2-D tensors; | E = Linear_AB_I8_DE_F32(A, B, D, {math}`\alpha`, {math}`\beta`) |
|
||||
| {math}`E = α \times (A \times B)`, where A, B, E are INT8 3-D tensors; | E = BMM_ABE_I8(A, B, {math}`\alpha`) |
|
||||
| {math}`E = α \times (A \times B)`, where A, B are INT8 3-D tensors, E is FP32 3-D tensor; | E = BMM_AB_I8_E_F32(A, B, {math}`\alpha`) |
|
||||
:::
|
||||
|
||||
### Operation flow analysis
|
||||
|
||||
The following section discusses the analysis of the operation flow of `Linear_ReLU_ABDE_I8`. The rest of the wrappers in Table 1 can be analyzed similarly.
|
||||
|
||||
The first operation in the process is to perform the multiplication of input matrices A and B. The resulting matrix C is then scaled with α to obtain T1. At the same time, the process performs a scaling operation on D elements to obtain T2. Afterward, the process performs matrix addition between T1 and T2, element activation calculation using ReLU, and element rounding sequentially. The operations to generate E1, E2, and E are encapsulated and completed by a user-defined template function in CK (given in the next sub-section). This template function is integrated into the fundamental instance directly during the compilation phase so that all these steps can be fused in a single GPU kernel.
|
||||
|
||||
<!--
|
||||
================
|
||||
### Figure 4
|
||||
================ -->
|
||||
```{figure} ../../../data/how-to/llm-fine-tuning-optimization/ck-operation_flow.jpg
|
||||
Operation flow.
|
||||
```
|
||||
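Before mapping this flow onto a CK instance, it can help to see the same semantics written out with plain PyTorch tensors. The following reference sketch (CPU-only, not CK code) restates the math of `Linear_ReLU_ABDE_I8`; the shapes and scales are illustrative.

```python
# Reference sketch (not CK code): the math implemented by Linear_ReLU_ABDE_I8
import torch

def linear_relu_abde_i8_reference(A, B, D, alpha, beta):
    # A: (M, K) int8, B: (K, N) int8, D: (M, N) int8, alpha/beta: float scales
    T1 = alpha * (A.to(torch.int32) @ B.to(torch.int32)).to(torch.float32)  # scale the int32 accumulator
    T2 = beta * D.to(torch.float32)                                         # scale the pre-existing matrix D
    E = torch.relu(T1 + T2)                                                 # fused add + ReLU
    return E.round().clamp(max=127).to(torch.int8)                          # round and convert back to int8

M, K, N = 4, 8, 4
A = torch.randint(-128, 127, (M, K), dtype=torch.int8)
B = torch.randint(-128, 127, (K, N), dtype=torch.int8)
D = torch.randint(-128, 127, (M, N), dtype=torch.int8)
print(linear_relu_abde_i8_reference(A, B, D, alpha=0.05, beta=1.0))
```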
|
||||
The CK library contains many fundamental instances that implement different functions. First, familiarize yourself with the names of the various CK instances and determine whether they meet the target functional requirements.
|
||||
|
||||
Second, consider whether the format of input data meets your actual calculation needs. For SQ models, the 8-bit integer data format (INT8) is applied for matrix calculations.
|
||||
|
||||
Third, consider the platform for implementing CK instances. The instances suffixed with `xdl` only run on AMD Instinct accelerators after being compiled and cannot run on Radeon-series GPUs. This is due to the underlying device-specific instruction sets for implementing these basic instances.
|
||||
|
||||
Here, we use [DeviceBatchedGemmMultiD_Xdl](https://github.com/ROCm/composable_kernel/tree/develop/example/24_batched_gemm) as the fundamental instance to implement the functionalities in the previous table.
|
||||
|
||||
<!--
|
||||
================
|
||||
### Figure 5
|
||||
================ -->
|
||||
```{figure} ../../../data/how-to/llm-fine-tuning-optimization/ck-root_instance.jpg
|
||||
Use the ‘DeviceBatchedGemmMultiD_Xdl’ instance as a root.
|
||||
```
|
||||
|
||||
The `DeviceBatchedGemmMultiD_Xdl` instance realizes the batched GEMM `BMM_ABE_I8` and `BMM_AB_I8_E_F32` kernels directly by using the proper input and output data precision types.
|
||||
|
||||
Based on the two batched GEMM kernels, the GEMM kernels `Linear_ABDE_I8` and `Linear_AB_I8_DE_F32` can be implemented by expanding their input 2-D tensors to 3-D tensors. The 3-D output tensors produced by the root instance are then squeezed back to 2-D output tensors before being returned.

For example, unsqueeze A (M, K) to A (1, M, K) before assigning it to the root instance, and squeeze E (1, M, N) to (M, N) after the root instance's calculations return. `Linear_ReLU_ABDE_I8` is implemented by adding a ReLU operation to the output of `Linear_ABDE_I8`.
|
||||
|
||||
### Developing the complete function
|
||||
|
||||
The inference of SQ quantized models relies on the PyTorch and Transformers libraries, which represent matrices and vectors as `torch` tensors. Therefore, the C++ data types in CK need to be replaced with the `torch::Tensor` type, and the input and output matrices should be of `Tensor` type.

In GEMM, the A and B inputs are two-dimensional matrices, while the selected fundamental CK instance requires three-dimensional input matrices. Therefore, we must convert the input 2-D tensors to 3-D tensors by using the tensor's `unsqueeze()` method before passing these matrices to the instance. For the batched GEMM wrappers in the preceding table, this step is not needed.
|
||||
|
||||
```c++
|
||||
// Function input and output
|
||||
torch::Tensor linear_relu_abde_i8(
|
||||
torch::Tensor A_,
|
||||
torch::Tensor B_,
|
||||
torch::Tensor D_,
|
||||
float alpha,
|
||||
float beta)
|
||||
{
|
||||
// Convert torch::Tensor A_ (M, K) to torch::Tensor A (1, M, K)
|
||||
auto A = A_.unsqueeze(0);
|
||||
|
||||
// Convert torch::Tensor B_ (K, N) to torch::Tensor B (1, K, N)
|
||||
auto B = B_.unsqueeze(0);
|
||||
...
|
||||
```
|
||||
|
||||
As shown in the following code block, we obtain the M, N, and K values from the input tensor sizes. These values are used to reshape the input vector D and allocate the storage for tensor E. The stride values reflect the number of contiguous elements in memory and are passed as important parameters to the fundamental instance for GPU kernel use.
|
||||
|
||||
```c++
|
||||
// Return the batch count from the size of dimension 0
|
||||
int batch_count = A.size(0);
|
||||
|
||||
// Return the M, N, K from the size of dimension 1 & 2
|
||||
int M = A.size(1);
|
||||
int N = B.size(1);
|
||||
int K = A.size(2);
|
||||
|
||||
// Initialize the stride size for A, B, D and E
|
||||
int stride_A = K;
|
||||
int stride_B = K;
|
||||
int stride_D0 = N;
|
||||
int stride_E = N;
|
||||
|
||||
// Initialize the stride size for batched A, B, D and E
|
||||
long long int batch_stride_A = M * K;
|
||||
long long int batch_stride_B = K * N;
|
||||
long long int batch_stride_D0 = M * N;
|
||||
long long int batch_stride_E = M * N;
|
||||
|
||||
// Convert the tensor of 2-D to 3-D
|
||||
auto D = D_.view({1,-1}).repeat({M, 1});
|
||||
|
||||
// Allocate memory for E
|
||||
auto E = torch::empty({batch_count, M, N},
|
||||
torch::dtype(torch::kInt8).device(A.device()));
|
||||
```
|
||||
|
||||
In the following code block, `ADataType`, `BDataType` and `D0DataType` are used to denote the data precision of the input tensors A, B and D, respectively. `EDataType` is used to denote the data precision of output tensor E. These parameters are specified to `I8` data format (8-bit integer data format) to meet the kernel's design requirements.
|
||||
|
||||
`AccDataType` determines the data precision used to represent the multiply-add results of A and B elements. Generally, a larger range data type is applied to store the multiply-add results of A and B to avoid result overflow; `I32` is applied in this case. The `CShuffleDataType I32` data type indicates that the multiply-add results continue to be stored in LDS as an `I32` data format. All of this is implemented through the following code block.
|
||||
|
||||
```c++
|
||||
// Data precision
|
||||
using ADataType = I8;
|
||||
using BDataType = I8;
|
||||
using AccDataType = I32;
|
||||
using CShuffleDataType = I32;
|
||||
using D0DataType = I8;
|
||||
using DsDataType = ck::Tuple<D0DataType>;
|
||||
using EDataType = I8;
|
||||
```
|
||||
|
||||
Following the convention of various linear algebra libraries, row-major and column-major orders are used to denote the ways of storing matrices in linear storage. The advantage of specifying matrix B as column major is that all the relevant matrix elements are stored contiguously in GPU global memory when a row in A is multiplied by a column in B, which helps the GPU access the data contiguously and improves memory access performance.
|
||||
|
||||
```c++
|
||||
// Specify tensor order
|
||||
using ALayout = RowMajor;
|
||||
using BLayout = ColumnMajor;
|
||||
using D0Layout = RowMajor;
|
||||
using DsLayout = ck::Tuple<D0Layout>;
|
||||
using ELayout = RowMajor;
|
||||
```
|
||||
|
||||
In CK, `PassThrough` is a struct denoting that no operation is applied to the tensor it binds to. To fuse the operations between E1, E2, and E introduced in the [Operation flow analysis](#operation-flow-analysis) section, we define a custom C++ struct, `ScaleScaleAddRelu`, and bind it to `CDEElementOp`. It determines the operations that will be applied to `CShuffle` (the A×B results), tensor D, α, and β.
|
||||
|
||||
```c++
|
||||
// No operations bound to the elements of A and B
|
||||
using AElementOp = PassThrough;
|
||||
using BElementOp = PassThrough;
|
||||
|
||||
// Operations bound to the elements of C, D and E
|
||||
using CDEElementOp = ScaleScaleAddRelu;
|
||||
```
|
||||
|
||||
In the binding struct, `operator()` performs an addition operation between `CShuffle` and matrix D, a ReLU operation on the addition results, and a rounding operation on the output elements. It then returns the results to E.
|
||||
|
||||
```c++
|
||||
struct ScaleScaleAddRelu {
|
||||
|
||||
template <>
|
||||
__host__ __device__ constexpr void
|
||||
operator()<I8, I32, I8>(I8& e, const I32& c, const I8& d) const
|
||||
{
|
||||
// Scale AxB result with alpha
|
||||
const F32 c_scale = ck::type_convert<F32>(c) * alpha;
|
||||
|
||||
// Scale D with beta
|
||||
const F32 d_scale = ck::type_convert<F32>(d) * beta;
|
||||
|
||||
// Perform addition operation
|
||||
F32 temp = c_scale + d_scale;
|
||||
|
||||
// Perform RELU operation
|
||||
temp = temp > 0 ? temp : 0;
|
||||
|
||||
// Perform rounding operation
|
||||
temp = temp > 127 ? 127 : temp;
|
||||
|
||||
// Return to E
|
||||
e = ck::type_convert<I8>(temp);
|
||||
}
|
||||
|
||||
F32 alpha;
|
||||
F32 beta;
|
||||
};
|
||||
```
|
||||
|
||||
The original input tensors need to be padded to meet the requirements of GPU tile-based parallelism.
|
||||
|
||||
```c++
|
||||
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
|
||||
```
|
||||
|
||||
The template parameters of the target fundamental instance are initialized with the above parameters and include default tunable parameters. For specific tuning methods, see [Tunable parameters](#tunable-parameters).
|
||||
|
||||
```c++
|
||||
using DeviceOpInstance = ck::tensor_operation::device::DeviceBatchedGemmMultiD_Xdl<
|
||||
// Tensor layout
|
||||
ALayout, BLayout, DsLayout, ELayout,
|
||||
// Tensor data type
|
||||
ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,
|
||||
// Tensor operation
|
||||
AElementOp, BElementOp, CDEElementOp,
|
||||
// Padding strategy
|
||||
GemmDefault,
|
||||
// Tunable parameters
|
||||
tunable parameters>;
|
||||
```
|
||||
|
||||
Return the address of the first element of each tensor:
|
||||
|
||||
```c++
|
||||
auto A_ref = A.data_ptr<ADataType>();
|
||||
auto B_ref = B.data_ptr<BDataType>();
|
||||
auto D0_ref = D.data_ptr<D0DataType>();
|
||||
auto E_ref = E.data_ptr<EDataType>();
|
||||
```
|
||||
|
||||
The fundamental instance is then initialized and run with actual arguments:
|
||||
|
||||
```c++
|
||||
auto device_op = DeviceOpInstance{};
|
||||
auto invoker = device_op.MakeInvoker();
|
||||
auto argument = device_op.MakeArgument(
|
||||
A_ref, B_ref, {D0_ref}, E_ref,
|
||||
M, N, K,
|
||||
batch_count,
|
||||
stride_A, stride_B, {stride_D0}, stride_E,
|
||||
batch_stride_A, batch_stride_B, {batch_stride_D0}, batch_stride_E,
|
||||
AElementOp{}, BElementOp{}, CDEElementOp{alpha, beta});
|
||||
|
||||
invoker.Run(argument, StreamConfig{nullptr, 0});
|
||||
```
|
||||
|
||||
The output of the fundamental instance is a calculated batched matrix E (batch, M, N). Before the return, it needs to be converted to a 2-D matrix if a normal GEMM result is required.
|
||||
|
||||
```c++
|
||||
// Convert (1, M, N) to (M, N)
|
||||
return E.squeeze(0);
|
||||
```
|
||||
|
||||
### Binding to Python
|
||||
|
||||
Since these functions are written in C++ using `torch::Tensor`, you can use `pybind11` to bind the functions and import them as Python modules. For this example, the binding code needed to expose the functions in the table spans only a few lines.
|
||||
|
||||
```c++
|
||||
#include <torch/extension.h>
|
||||
|
||||
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m){
|
||||
m.def("linear_ab_i8_de_f32", &linear_ab_i8_de_f32);
|
||||
m.def("linear_relu_abde_i8", &linear_relu_abde_i8);
|
||||
m.def("linear_abde_i8", &linear_abde_i8);
|
||||
m.def("bmm_abe_i8", &bmm_abe_i8);
|
||||
m.def("bmm_ab_i8_e_f32", &bmm_ab_i8_e_f32);
|
||||
}
|
||||
```
|
||||
|
||||
Build the C++ extension by writing a `setup.py` script that uses `setuptools` to compile the C++ code. A reference implementation of the `setup.py` script is as follows.
|
||||
|
||||
```python
|
||||
import os
|
||||
from setuptools import setup, find_packages
|
||||
from torch.utils import cpp_extension
|
||||
from torch.utils.cpp_extension import BuildExtension
|
||||
|
||||
os.environ["CC"] = "hipcc"
|
||||
os.environ["CXX"] = "hipcc"
|
||||
|
||||
sources = [
|
||||
'torch_int/kernels/linear.cpp',
|
||||
'torch_int/kernels/bmm.cpp',
|
||||
'torch_int/kernels/pybind.cpp',
|
||||
]
|
||||
|
||||
include_dirs = ['torch_int/kernels/include']
|
||||
extra_link_args = ['libutility.a']
|
||||
extra_compile_args = ['-O3','-DNDEBUG', '-std=c++17', '--offload-arch=gfx942', '-DCK_ENABLE_INT8', '-D__HIP_PLATFORM_AMD__=1']
|
||||
|
||||
setup(
|
||||
name='torch_int',
|
||||
ext_modules=[
|
||||
cpp_extension.CUDAExtension(
|
||||
name='torch_int.rocm',
|
||||
sources=sources,
|
||||
include_dirs=include_dirs,
|
||||
extra_link_args=extra_link_args,
|
||||
extra_compile_args=extra_compile_args
|
||||
),
|
||||
],
|
||||
cmdclass={
|
||||
'build_ext': BuildExtension.with_options(use_ninja=False)
|
||||
},
|
||||
packages=find_packages(
|
||||
exclude=['notebook', 'scripts', 'tests']),
|
||||
)
|
||||
```
|
||||
|
||||
Run `python setup.py install` to build and install the extension. The build output should look something like Figure 6:
|
||||
|
||||
<!--
|
||||
================
|
||||
### Figure 6
|
||||
================ -->
|
||||
```{figure} ../../../data/how-to/llm-fine-tuning-optimization/ck-compilation.jpg
|
||||
Compilation and installation of the INT8 kernels.
|
||||
```
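Once the extension is installed, the bound kernels can be called from Python like any other module. The following snippet is a minimal usage sketch rather than part of the original example: the tensor shapes, scale value, and argument order of `bmm_abe_i8` are illustrative assumptions, while the module path `torch_int.rocm` and the function names come from the binding code above.

```python
import torch
import torch_int.rocm as ck_int8  # module built by the setup.py script above

# Illustrative INT8 inputs on the GPU; shapes and values are arbitrary examples
batch, M, N, K = 4, 128, 256, 512
A = torch.randint(-128, 128, (batch, M, K), dtype=torch.int8, device="cuda")
B = torch.randint(-128, 128, (batch, K, N), dtype=torch.int8, device="cuda")
alpha = 0.02  # assumed requantization scale argument

# Batched INT8 GEMM returning an INT8 tensor of shape (batch, M, N)
E = ck_int8.bmm_abe_i8(A, B, alpha)
print(E.shape, E.dtype)
```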
|
||||
|
||||
### INT8 model inference and performance
|
||||
|
||||
The implementation architecture of running SmoothQuant models on MI300X GPUs is illustrated in Figure 7, where (a) shows the components that make up the target model's decoder layers, (b) shows the main implementation classes for those components, and (c) shows the underlying GPU kernels implemented by CK instances.
|
||||
|
||||
<!--
|
||||
================
|
||||
### Figure 7
|
||||
================ -->
|
||||
```{figure} ../../../data/how-to/llm-fine-tuning-optimization/ck-inference_flow.jpg
|
||||
The implementation architecture of running SmoothQuant models on AMD MI300X accelerators.
|
||||
```
|
||||
|
||||
For the target [SQ quantized model](https://huggingface.co/mit-han-lab/opt-13b-smoothquant), each decoder layer contains three major components: attention calculation, layer normalization, and linear transformation in fully connected layers. The corresponding implementation classes for these components are:
|
||||
|
||||
- `Int8OPTAttention`
|
||||
- `W8A8B8O8LinearReLU`
|
||||
- `W8A8BF32OF32Linear`
|
||||
|
||||
The underlying implementation logic of these classes uses the functions in the previous table. Note that in this example, the `LayerNormQ` module is implemented with the native PyTorch module.
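To illustrate how one of these classes harnesses the bound kernels, the following is a minimal sketch of what a `W8A8B8O8LinearReLU` forward pass could look like. It is an assumption-based outline rather than the actual implementation; in particular, the argument order of `linear_relu_abde_i8` and the way the `alpha` and `beta` scales are stored are assumptions.

```python
import torch
from torch_int.rocm import linear_relu_abde_i8  # CK kernel bound via pybind11 above


class W8A8B8O8LinearReLU(torch.nn.Module):
    """INT8-in / INT8-out linear layer with fused bias add and ReLU (sketch only)."""

    def __init__(self, in_features: int, out_features: int):
        super().__init__()
        # INT8 weight and bias buffers, filled in during quantization
        self.register_buffer("weight", torch.empty(out_features, in_features, dtype=torch.int8))
        self.register_buffer("bias", torch.empty(out_features, dtype=torch.int8))
        self.alpha = 1.0  # requantization scale applied to A x B
        self.beta = 1.0   # requantization scale applied to the bias

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x is an INT8 activation tensor; the CK kernel fuses GEMM, bias add,
        # ReLU, and the final conversion back to INT8
        return linear_relu_abde_i8(x, self.weight, self.bias, self.alpha, self.beta)
```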
|
||||
|
||||
Testing environment:
|
||||
The hardware platform used for testing is equipped with AMD EPYC 9534 64-core processors (256 CPUs), 8 AMD Instinct MI300X accelerators, and 1.5 TB of memory. The testing was done in a publicly available Docker image from Docker Hub:
|
||||
[`rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2`](https://hub.docker.com/layers/rocm/pytorch/rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2/images/sha256-f6ea7cee8aae299c7f6368187df7beed29928850c3929c81e6f24b34271d652b)
|
||||
|
||||
The tested models are the OPT-1.3B, 2.7B, 6.7B, and 13B FP16 models and their corresponding SmoothQuant INT8 versions, all obtained from Hugging Face.
|
||||
|
||||
Note that since the default values were used for the tunable parameters of the fundamental instance, the performance of the INT8 kernel is suboptimal.
|
||||
|
||||
Figure 8 shows the performance comparisons between the original FP16 and the SmoothQuant-quantized INT8 models on a single MI300X accelerator. The GPU memory footprints of the SmoothQuant-quantized models are significantly reduced, and the per-sample inference latency is significantly reduced for all SmoothQuant-quantized OPT models (illustrated in (b)). Notably, the performance of the CK instance-based INT8 kernel steadily improves as the model size increases.
|
||||
|
||||
<!--
|
||||
================
|
||||
### Figure 8
|
||||
================ -->
|
||||
```{figure} ../../../data/how-to/llm-fine-tuning-optimization/ck-comparisons.jpg
|
||||
Performance comparisons between the original FP16 and the SmoothQuant-quantized INT8 models on a single MI300X accelerator.
|
||||
```
|
||||
|
||||
For accuracy comparisons between the original FP16 and INT8 models, the evaluation uses the first 1,000 samples from the LAMBADA dataset's validation set and the same Last Token Prediction Accuracy metric introduced in [SmoothQuant Real-INT8 Inference for PyTorch](https://github.com/mit-han-lab/smoothquant/blob/main/examples/smoothquant_opt_real_int8_demo.ipynb). The comparison results are shown in Table 2; a minimal sketch of the metric follows the table.
|
||||
|
||||
:::{table} Inference accuracy comparison of SmoothQuant-quantized models on the Instinct MI300X accelerator.
|
||||
|
||||
| Models | Hugging Face FP16 model accuracy | SmoothQuant quantized INT8 model accuracy |
|
||||
|:-----------------|----------------------------------------|---------------------------------------------|
|
||||
| opt-1.3B | 0.72 | 0.70 |
|
||||
| opt-2.7B | 0.76 | 0.75 |
|
||||
| opt-6.7B | 0.80 | 0.79 |
|
||||
| opt-13B | 0.79 | 0.77 |
|
||||
:::
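For reference, last token prediction accuracy simply measures how often the model's most likely next token matches the actual final token of each sample. A minimal sketch of the metric, assuming a Hugging Face causal language model and tokenizer, looks like this:

```python
import torch


@torch.no_grad()
def last_token_prediction_accuracy(model, tokenizer, texts, device="cuda"):
    """Fraction of samples whose final token the model predicts correctly (sketch)."""
    correct = 0
    for text in texts:
        ids = tokenizer(text, return_tensors="pt").input_ids.to(device)
        # Predict the last token from all preceding tokens
        logits = model(ids[:, :-1]).logits
        predicted = logits[0, -1].argmax()
        correct += int(predicted == ids[0, -1])
    return correct / len(texts)
```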
|
||||
|
||||
## Conclusion
|
||||
|
||||
CK provides a rich set of template parameters for generating flexible accelerated computing kernels for different application scenarios.
|
||||
|
||||
CK supports multiple AMD Instinct GPU instruction sets, operator fusion, and different data precisions. Its composability helps users quickly construct and verify the performance of new operators.
|
||||
|
||||
With CK, you can build more effective AI applications with higher flexibility and better performance on different AMD accelerator platforms.
|
||||
@@ -1,29 +0,0 @@
|
||||
.. meta::
|
||||
:description: How to use ROCm profiling and debugging tools.
|
||||
:keywords: ROCm, LLM, fine-tuning, usage, MI300X, tutorial, profiling, debugging, performance, Triton
|
||||
|
||||
***********************
|
||||
Profiling and debugging
|
||||
***********************
|
||||
|
||||
This section provides an index for further documentation on profiling and
|
||||
debugging tools and their common usage patterns.
|
||||
|
||||
See :ref:`AMD Instinct MI300X™ workload optimization <mi300x-profiling-start>`
|
||||
for a conceptual summary of the workload profiling workflow for ROCm applications
|
||||
on AMD hardware -- including fine-tuning LLMs.
|
||||
|
||||
There, you'll find information on higher-level and kernel-level profiling tools
|
||||
as well as other profiling and debugging suggestions.
|
||||
|
||||
* :ref:`PyTorch Profiler <mi300x-pytorch-profiler>`
|
||||
|
||||
* :ref:`ROCm profiling tools <mi300x-profiling-tools>`
|
||||
|
||||
* :ref:`ROCProfiler <mi300x-rocprof>`
|
||||
|
||||
* :ref:`ROCm Compute Profiler <mi300x-rocprof-compute>`
|
||||
|
||||
* :ref:`ROCm Systems Profiler <mi300x-rocprof-systems>`
|
||||
|
||||
* :ref:`ROCr Debug Agent <mi300x-rocr-debug-agent>`
|
||||
@@ -1,25 +0,0 @@
|
||||
:orphan:
|
||||
|
||||
****************************************************
|
||||
SGLang inference performance testing version history
|
||||
****************************************************
|
||||
|
||||
This table lists previous versions of the ROCm SGLang inference performance
|
||||
testing environment. For detailed information about available models for
|
||||
benchmarking, see the version-specific documentation.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Docker image tag
|
||||
- Components
|
||||
- Resources
|
||||
|
||||
* - ``lmsysorg/sglang:v0.4.5-rocm630``
|
||||
-
|
||||
* ROCm 6.3.0
|
||||
* SGLang 0.4.5
|
||||
* PyTorch 2.6.0
|
||||
-
|
||||
* :doc:`Documentation <../sglang>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/lmsysorg/sglang/v0.4.5-rocm630/images/sha256-63d2cb760a237125daf6612464cfe2f395c0784e21e8b0ea37d551cd10d3c951>`__
|
||||
@@ -1,445 +0,0 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
|
||||
ROCm vLLM Docker image.
|
||||
:keywords: model, MAD, automation, dashboarding, validate
|
||||
|
||||
**********************************
|
||||
vLLM inference performance testing
|
||||
**********************************
|
||||
|
||||
.. caution::
|
||||
|
||||
This documentation does not reflect the latest version of ROCm vLLM
|
||||
inference performance documentation. See :doc:`../vllm` for the latest version.
|
||||
|
||||
.. _vllm-benchmark-unified-docker-812:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.0_20250812-benchmark-models.yaml
|
||||
|
||||
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
|
||||
{% set model_groups = data.vllm_benchmark.model_groups %}
|
||||
|
||||
The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
|
||||
a prebuilt, optimized environment for validating large language model (LLM)
|
||||
inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
|
||||
Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
|
||||
accelerators and includes the following components:
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
* - `ROCm <https://github.com/ROCm/ROCm>`__
|
||||
- {{ unified_docker.rocm_version }}
|
||||
|
||||
* - `vLLM <https://docs.vllm.ai/en/latest>`__
|
||||
- {{ unified_docker.vllm_version }}
|
||||
|
||||
* - `PyTorch <https://github.com/ROCm/pytorch>`__
|
||||
- {{ unified_docker.pytorch_version }}
|
||||
|
||||
* - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
|
||||
- {{ unified_docker.hipblaslt_version }}
|
||||
|
||||
With this Docker image, you can quickly test the :ref:`expected
|
||||
inference performance numbers <vllm-benchmark-performance-measurements-812>` for
|
||||
MI300X series accelerators.
|
||||
|
||||
What's new
|
||||
==========
|
||||
|
||||
The following is a summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <vllm-history>`.
|
||||
|
||||
* Upgraded to vLLM v0.10.
|
||||
|
||||
* FP8 KV cache support via AITER.
|
||||
|
||||
* Full graph capture support via AITER.
|
||||
|
||||
Supported models
|
||||
================
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.0_20250812-benchmark-models.yaml
|
||||
|
||||
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
|
||||
{% set model_groups = data.vllm_benchmark.model_groups %}
|
||||
|
||||
.. _vllm-benchmark-available-models-812:
|
||||
|
||||
The following models are supported for inference performance benchmarking
|
||||
with vLLM and ROCm. Some instructions, commands, and recommendations in this
|
||||
documentation might vary by model -- select one to get started.
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row">
|
||||
<div class="col-2 me-2 model-param-head">Model group</div>
|
||||
<div class="row col-10">
|
||||
{% for model_group in model_groups %}
|
||||
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row mt-1">
|
||||
<div class="col-2 me-2 model-param-head">Model</div>
|
||||
<div class="row col-10">
|
||||
{% for model_group in model_groups %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if models|length % 3 == 0 %}
|
||||
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% else %}
|
||||
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
.. _vllm-benchmark-vllm-812:
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{model.mad_tag}}
|
||||
|
||||
.. note::
|
||||
|
||||
See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
|
||||
Some models require access authorization prior to use via an external license agreement through a third party.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. note::
|
||||
|
||||
vLLM is a toolkit and library for LLM inference and serving. AMD implements
|
||||
high-performance custom kernels and modules in vLLM to enhance performance.
|
||||
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
|
||||
more information.
|
||||
|
||||
.. _vllm-benchmark-performance-measurements-812:
|
||||
|
||||
Performance measurements
|
||||
========================
|
||||
|
||||
To evaluate performance, the
|
||||
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
|
||||
page provides reference throughput and serving measurements for inferencing popular AI models.
|
||||
|
||||
.. important::
|
||||
|
||||
The performance data presented in
|
||||
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
|
||||
only reflects the latest version of this inference benchmarking environment.
|
||||
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
|
||||
|
||||
System validation
|
||||
=================
|
||||
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
|
||||
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||
before starting training.
|
||||
|
||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.0_20250812-benchmark-models.yaml
|
||||
|
||||
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
|
||||
{% set model_groups = data.vllm_benchmark.model_groups %}
|
||||
|
||||
Pull the Docker image
|
||||
=====================
|
||||
|
||||
Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
|
||||
Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ unified_docker.pull_tag }}
|
||||
|
||||
Benchmarking
|
||||
============
|
||||
|
||||
Once the setup is complete, choose between two options to reproduce the
|
||||
benchmark results:
|
||||
|
||||
.. _vllm-benchmark-mad-812:
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{model.mad_tag}}
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MAD-integrated benchmarking
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD
|
||||
pip install -r requirements.txt
|
||||
|
||||
2. Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
|
||||
using one GPU with the :literal:`{{model.precision}}` data type on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||
madengine run \
|
||||
--tags {{model.mad_tag}} \
|
||||
--keep-model-dir \
|
||||
--live-output \
|
||||
--timeout 28800
|
||||
|
||||
MAD launches a Docker container with the name
|
||||
``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
|
||||
model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
|
||||
and ``{{ model.mad_tag }}_serving.csv``.
|
||||
|
||||
Although the :ref:`available models
|
||||
<vllm-benchmark-available-models-812>` are preconfigured to collect
|
||||
offline throughput and online serving performance data, you can
|
||||
also change the benchmarking parameters. See the standalone
|
||||
benchmarking tab for more information.
|
||||
|
||||
{% if model.tunableop %}
|
||||
|
||||
.. note::
|
||||
|
||||
For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
|
||||
TunableOp automatically explores different implementations and configurations of certain PyTorch
|
||||
operators to find the fastest one for your hardware.
|
||||
|
||||
By default, ``{{model.mad_tag}}`` runs with TunableOp disabled (see
|
||||
`<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To enable it, include
|
||||
the ``--tunableop on`` argument in your run.
|
||||
|
||||
Enabling TunableOp triggers a two-pass run -- a warm-up followed by the
|
||||
performance-collection run.
|
||||
|
||||
{% endif %}
|
||||
|
||||
.. tab-item:: Standalone benchmarking
|
||||
|
||||
.. rubric:: Download the Docker image and required scripts
|
||||
|
||||
1. Run the vLLM benchmark tool independently by starting the
|
||||
`Docker container <{{ unified_docker.docker_hub_url }}>`_
|
||||
as shown in the following snippet.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ unified_docker.pull_tag }}
|
||||
docker run -it \
|
||||
--device=/dev/kfd \
|
||||
--device=/dev/dri \
|
||||
--group-add video \
|
||||
--shm-size 16G \
|
||||
--security-opt seccomp=unconfined \
|
||||
--security-opt apparmor=unconfined \
|
||||
--cap-add=SYS_PTRACE \
|
||||
-v $(pwd):/workspace \
|
||||
--env HUGGINGFACE_HUB_CACHE=/workspace \
|
||||
--name test \
|
||||
{{ unified_docker.pull_tag }}
|
||||
|
||||
2. In the Docker container, clone the ROCm MAD repository and navigate to the
|
||||
benchmark scripts directory at ``~/MAD/scripts/vllm``.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD/scripts/vllm
|
||||
|
||||
3. To start the benchmark, use the following command with the appropriate options.
|
||||
|
||||
.. code-block::
|
||||
|
||||
./run.sh \
|
||||
--config $CONFIG_CSV \
|
||||
--model_repo {{ model.model_repo }} \
|
||||
<overrides>
|
||||
|
||||
.. dropdown:: Benchmark options
|
||||
:open:
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:align: center
|
||||
|
||||
* - Name
|
||||
- Options
|
||||
- Description
|
||||
|
||||
* - ``--config``
|
||||
- ``configs/default.csv``
|
||||
- Run configs from the CSV for the chosen model repo and benchmark.
|
||||
|
||||
* -
|
||||
- ``configs/extended.csv``
|
||||
-
|
||||
|
||||
* -
|
||||
- ``configs/performance.csv``
|
||||
-
|
||||
|
||||
* - ``--benchmark``
|
||||
- ``throughput``
|
||||
- Measure offline end-to-end throughput.
|
||||
|
||||
* -
|
||||
- ``serving``
|
||||
- Measure online serving performance.
|
||||
|
||||
* -
|
||||
- ``all``
|
||||
- Measure both throughput and serving.
|
||||
|
||||
* - ``<overrides>``
|
||||
- See `run.sh <https://github.com/ROCm/MAD/blob/develop/scripts/vllm/run.sh>`__ for more info.
|
||||
- Additional overrides to the config CSV.
|
||||
|
||||
The input sequence length, output sequence length, and tensor parallel (TP) are
|
||||
already configured. You don't need to specify them with this script.
|
||||
|
||||
.. note::
|
||||
|
||||
For best performance, it's recommended to run with ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``.
|
||||
|
||||
If you encounter the following error, pass your access-authorized Hugging
|
||||
Face token to the gated models.
|
||||
|
||||
.. code-block::
|
||||
|
||||
OSError: You are trying to access a gated repo.
|
||||
|
||||
# pass your HF_TOKEN
|
||||
export HF_TOKEN=$your_personal_hf_token
|
||||
|
||||
.. rubric:: Benchmarking examples
|
||||
|
||||
Here are some examples of running the benchmark with various options:
|
||||
|
||||
* Throughput benchmark
|
||||
|
||||
Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export MAD_MODEL_NAME={{ model.mad_tag }}
|
||||
./run.sh \
|
||||
--config configs/default.csv \
|
||||
--model_repo {{model.model_repo}} \
|
||||
--benchmark throughput
|
||||
|
||||
Find the throughput benchmark report at ``./{{ model.mad_tag }}_throughput.csv``.
|
||||
|
||||
* Serving benchmark
|
||||
|
||||
Use this command to benchmark the serving performance of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
|
||||
|
||||
.. code-block::
|
||||
|
||||
export MAD_MODEL_NAME={{ model.mad_tag }}
|
||||
./run.sh \
|
||||
--config configs/default.csv \
|
||||
--model_repo {{model.model_repo}} \
|
||||
--benchmark serving
|
||||
|
||||
Find the serving benchmark report at ``./{{ model.mad_tag }}_serving.csv``.
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<style>
|
||||
mjx-container[jax="CHTML"][display="true"] {
|
||||
text-align: left;
|
||||
margin: 0;
|
||||
}
|
||||
</style>
|
||||
|
||||
.. note::
|
||||
|
||||
Throughput is calculated as:
|
||||
|
||||
- .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
|
||||
|
||||
- .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Advanced usage
|
||||
==============
|
||||
|
||||
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
|
||||
see the developer's guide at `<https://github.com/ROCm/vllm/tree/f94ec9beeca1071cc34f9d1e206d8c7f3ac76129/docs/dev-docker>`__.
|
||||
|
||||
Reproducing the Docker image
|
||||
----------------------------
|
||||
|
||||
To reproduce this ROCm/vLLM Docker image release, follow these steps:
|
||||
|
||||
1. Clone the `vLLM repository <https://github.com/ROCm/vllm>`__.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/vllm.git
|
||||
|
||||
2. Checkout the specific release commit.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
cd vllm
|
||||
git checkout 340ea86dfe5955d6f9a9e767d6abab5aacf2c978
|
||||
|
||||
3. Build the Docker image. Replace ``vllm-rocm`` with your desired image tag.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker build -f docker/Dockerfile.rocm -t vllm-rocm .
|
||||
|
||||
Further reading
|
||||
===============
|
||||
|
||||
- To learn more about the options for latency and throughput benchmark scripts,
|
||||
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
|
||||
|
||||
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
|
||||
|
||||
- To learn more about system settings and management practices to configure your system for
|
||||
AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
|
||||
|
||||
- For application performance optimization strategies for HPC and AI workloads,
|
||||
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
|
||||
|
||||
- To learn how to run community models from Hugging Face on AMD GPUs, see
|
||||
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
|
||||
|
||||
- To learn how to fine-tune LLMs and optimize inference, see
|
||||
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
|
||||
|
||||
- For a list of other ready-made Docker images for AI with ROCm, see
|
||||
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
|
||||
|
||||
Previous versions
|
||||
=================
|
||||
|
||||
See :doc:`vllm-history` to find documentation for previous releases
|
||||
of the ``ROCm/vllm`` Docker image.
|
||||
@@ -1,346 +0,0 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the unified
|
||||
ROCm Docker image.
|
||||
:keywords: model, MAD, automation, dashboarding, validate
|
||||
|
||||
**********************************
|
||||
vLLM inference performance testing
|
||||
**********************************
|
||||
|
||||
.. caution::
|
||||
|
||||
This documentation does not reflect the latest version of ROCm vLLM
|
||||
inference performance documentation. See :doc:`../vllm` for the latest version.
|
||||
|
||||
.. _vllm-benchmark-unified-docker:
|
||||
|
||||
The `ROCm vLLM Docker <https://hub.docker.com/r/rocm/vllm/tags>`_ image offers
|
||||
a prebuilt, optimized environment designed for validating large language model
|
||||
(LLM) inference performance on the AMD Instinct™ MI300X accelerator. This
|
||||
ROCm vLLM Docker image integrates vLLM and PyTorch tailored specifically for the
|
||||
MI300X accelerator and includes the following components:
|
||||
|
||||
* `ROCm 6.2.0 <https://github.com/ROCm/ROCm>`_
|
||||
|
||||
* `vLLM 0.4.3 <https://docs.vllm.ai/en/latest>`_
|
||||
|
||||
* `PyTorch 2.4.0 <https://github.com/pytorch/pytorch>`_
|
||||
|
||||
* Tuning files (in CSV format)
|
||||
|
||||
With this Docker image, you can quickly validate the expected inference
|
||||
performance numbers on the MI300X accelerator. This topic also provides tips on
|
||||
optimizing performance with popular AI models.
|
||||
|
||||
.. _vllm-benchmark-vllm:
|
||||
|
||||
.. note::
|
||||
|
||||
vLLM is a toolkit and library for LLM inference and
|
||||
serving. It deploys the PagedAttention algorithm, which reduces memory
|
||||
consumption and increases throughput by leveraging dynamic key and value
|
||||
allocation in GPU memory. vLLM also incorporates many LLM acceleration
|
||||
and quantization algorithms. In addition, AMD implements high-performance
|
||||
custom kernels and modules in vLLM to enhance performance further. See
|
||||
:ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for more
|
||||
information.
|
||||
|
||||
Getting started
|
||||
===============
|
||||
|
||||
Use the following procedures to reproduce the benchmark results on an
|
||||
MI300X accelerator with the prebuilt vLLM Docker image.
|
||||
|
||||
.. _vllm-benchmark-get-started:
|
||||
|
||||
1. Disable NUMA auto-balancing.
|
||||
|
||||
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
|
||||
might hang until the periodic balancing is finalized. For more information,
|
||||
see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# disable automatic NUMA balancing
|
||||
sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
|
||||
# check if NUMA balancing is disabled (returns 0 if disabled)
|
||||
cat /proc/sys/kernel/numa_balancing
|
||||
0
|
||||
|
||||
2. Download the :ref:`ROCm vLLM Docker image <vllm-benchmark-unified-docker>`.
|
||||
|
||||
Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull rocm/vllm:rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50
|
||||
|
||||
Once setup is complete, you can choose between two options to reproduce the
|
||||
benchmark results:
|
||||
|
||||
- :ref:`MAD-integrated benchmarking <vllm-benchmark-mad-v043>`
|
||||
|
||||
- :ref:`Standalone benchmarking <vllm-benchmark-standalone-v043>`
|
||||
|
||||
.. _vllm-benchmark-mad-v043:
|
||||
|
||||
MAD-integrated benchmarking
|
||||
===========================
|
||||
|
||||
Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD
|
||||
pip install -r requirements.txt
|
||||
|
||||
Use this command to run a performance benchmark test of the Llama 3.1 8B model
|
||||
on one GPU with ``float16`` data type in the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||
python3 tools/run_models.py --tags pyt_vllm_llama-3.1-8b --keep-model-dir --live-output --timeout 28800
|
||||
|
||||
ROCm MAD launches a Docker container with the name
|
||||
``container_ci-pyt_vllm_llama-3.1-8b``. The latency and throughput reports of the
|
||||
model are collected in the following path: ``~/MAD/reports_float16/``.
|
||||
|
||||
Although the following eight models are pre-configured to collect latency and
|
||||
throughput performance data, users can also change the benchmarking parameters.
|
||||
Refer to the :ref:`Standalone benchmarking <vllm-benchmark-standalone-v043>` section.
|
||||
|
||||
Available models
|
||||
----------------
|
||||
|
||||
.. hlist::
|
||||
:columns: 3
|
||||
|
||||
* ``pyt_vllm_llama-3.1-8b``
|
||||
|
||||
* ``pyt_vllm_llama-3.1-70b``
|
||||
|
||||
* ``pyt_vllm_llama-3.1-405b``
|
||||
|
||||
* ``pyt_vllm_llama-2-7b``
|
||||
|
||||
* ``pyt_vllm_mistral-7b``
|
||||
|
||||
* ``pyt_vllm_qwen2-7b``
|
||||
|
||||
* ``pyt_vllm_jais-13b``
|
||||
|
||||
* ``pyt_vllm_jais-30b``
|
||||
|
||||
.. _vllm-benchmark-standalone-v043:
|
||||
|
||||
Standalone benchmarking
|
||||
=======================
|
||||
|
||||
You can run the vLLM benchmark tool independently by starting the
|
||||
:ref:`Docker container <vllm-benchmark-get-started>` as shown in the following
|
||||
snippet.
|
||||
|
||||
.. code-block::
|
||||
|
||||
docker pull rocm/vllm:rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50
|
||||
docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 128G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name unified_docker_vllm rocm/vllm:rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50
|
||||
|
||||
In the Docker container, clone the ROCm MAD repository and navigate to the
|
||||
benchmark scripts directory at ``~/MAD/scripts/vllm``.
|
||||
|
||||
.. code-block::
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD/scripts/vllm
|
||||
|
||||
Multiprocessing distributed executor
|
||||
--------------------------------------
|
||||
|
||||
To optimize vLLM performance, add the multiprocessing API server argument ``--distributed-executor-backend mp``.
|
||||
|
||||
Command
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
To start the benchmark, use the following command with the appropriate options.
|
||||
See :ref:`Options <vllm-benchmark-standalone-options-v043>` for the list of
|
||||
options and their descriptions.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./vllm_benchmark_report.sh -s $test_option -m $model_repo -g $num_gpu -d $datatype
|
||||
|
||||
See the :ref:`examples <vllm-benchmark-run-benchmark-v043>` for more information.
|
||||
|
||||
.. note::
|
||||
|
||||
The input sequence length, output sequence length, and tensor parallel (TP) are
|
||||
already configured. You don't need to specify them with this script.
|
||||
|
||||
.. note::
|
||||
|
||||
If you encounter the following error, pass your access-authorized Hugging
|
||||
Face token to the gated models.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
OSError: You are trying to access a gated repo.
|
||||
|
||||
# pass your HF_TOKEN
|
||||
export HF_TOKEN=$your_personal_hf_token
|
||||
|
||||
.. _vllm-benchmark-standalone-options-v043:
|
||||
|
||||
Options
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Name
|
||||
- Options
|
||||
- Description
|
||||
|
||||
* - ``$test_option``
|
||||
- latency
|
||||
- Measure decoding token latency
|
||||
|
||||
* -
|
||||
- throughput
|
||||
- Measure token generation throughput
|
||||
|
||||
* -
|
||||
- all
|
||||
- Measure both throughput and latency
|
||||
|
||||
* - ``$model_repo``
|
||||
- ``meta-llama/Meta-Llama-3.1-8B-Instruct``
|
||||
- Llama 3.1 8B
|
||||
|
||||
* - (``float16``)
|
||||
- ``meta-llama/Meta-Llama-3.1-70B-Instruct``
|
||||
- Llama 3.1 70B
|
||||
|
||||
* -
|
||||
- ``meta-llama/Meta-Llama-3.1-405B-Instruct``
|
||||
- Llama 3.1 405B
|
||||
|
||||
* -
|
||||
- ``meta-llama/Llama-2-7b-chat-hf``
|
||||
- Llama 2 7B
|
||||
|
||||
* -
|
||||
- ``mistralai/Mixtral-8x7B-Instruct-v0.1``
|
||||
- Mixtral 8x7B
|
||||
|
||||
* -
|
||||
- ``mistralai/Mixtral-8x22B-Instruct-v0.1``
|
||||
- Mixtral 8x22B
|
||||
|
||||
* -
|
||||
- ``mistralai/Mistral-7B-Instruct-v0.3``
|
||||
- Mistral 7B
|
||||
|
||||
* -
|
||||
- ``Qwen/Qwen2-7B-Instruct``
|
||||
- Qwen2 7B
|
||||
|
||||
* -
|
||||
- ``core42/jais-13b-chat``
|
||||
- JAIS 13B
|
||||
|
||||
* -
|
||||
- ``core42/jais-30b-chat-v3``
|
||||
- JAIS 30B
|
||||
|
||||
* - ``$num_gpu``
|
||||
- 1 or 8
|
||||
- Number of GPUs
|
||||
|
||||
* - ``$datatype``
|
||||
- ``float16``
|
||||
- Data type
|
||||
|
||||
.. _vllm-benchmark-run-benchmark-v043:
|
||||
|
||||
Running the benchmark on the MI300X accelerator
|
||||
-----------------------------------------------
|
||||
|
||||
Here are some examples of running the benchmark with various options.
|
||||
See :ref:`Options <vllm-benchmark-standalone-options-v043>` for the list of
|
||||
options and their descriptions.
|
||||
|
||||
Latency benchmark example
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Use this command to benchmark the latency of the Llama 3.1 8B model on one GPU with the ``float16`` data type.
|
||||
|
||||
.. code-block::
|
||||
|
||||
./vllm_benchmark_report.sh -s latency -m meta-llama/Meta-Llama-3.1-8B-Instruct -g 1 -d float16
|
||||
|
||||
Find the latency report at:
|
||||
|
||||
- ``./reports_float16/summary/Meta-Llama-3.1-8B-Instruct_latency_report.csv``
|
||||
|
||||
Throughput benchmark example
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Use this command to benchmark the throughput of the Llama 3.1 8B model on one GPU with the ``float16`` data type.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./vllm_benchmark_report.sh -s throughput -m meta-llama/Meta-Llama-3.1-8B-Instruct -g 1 -d float16
|
||||
|
||||
Find the throughput report at:
|
||||
|
||||
- ``./reports_float16/summary/Meta-Llama-3.1-8B-Instruct_throughput_report.csv``
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<style>
|
||||
mjx-container[jax="CHTML"][display="true"] {
|
||||
text-align: left;
|
||||
margin: 0;
|
||||
}
|
||||
|
||||
</style>
|
||||
|
||||
.. note::
|
||||
|
||||
Throughput is calculated as:
|
||||
|
||||
- .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
|
||||
|
||||
- .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
|
||||
|
||||
Further reading
|
||||
===============
|
||||
|
||||
- For application performance optimization strategies for HPC and AI workloads,
|
||||
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
|
||||
|
||||
- To learn more about the options for latency and throughput benchmark scripts,
|
||||
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
|
||||
|
||||
- To learn more about system settings and management practices to configure your system for
|
||||
MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
|
||||
|
||||
- To learn how to run community models from Hugging Face on AMD GPUs, see
|
||||
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
|
||||
|
||||
- To learn how to fine-tune LLMs and optimize inference, see
|
||||
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
|
||||
|
||||
- For a list of other ready-made Docker images for AI with ROCm, see
|
||||
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
|
||||
|
||||
Previous versions
|
||||
=================
|
||||
|
||||
See :doc:`vllm-history` to find documentation for previous releases
|
||||
of the ``ROCm/vllm`` Docker image.
|
||||
@@ -1,416 +0,0 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the unified
|
||||
ROCm Docker image.
|
||||
:keywords: model, MAD, automation, dashboarding, validate
|
||||
|
||||
**********************************
|
||||
vLLM inference performance testing
|
||||
**********************************
|
||||
|
||||
.. caution::
|
||||
|
||||
This documentation does not reflect the latest version of ROCm vLLM
|
||||
inference performance documentation. See :doc:`../vllm` for the latest version.
|
||||
|
||||
.. _vllm-benchmark-unified-docker:
|
||||
|
||||
The `ROCm vLLM Docker <https://hub.docker.com/r/rocm/vllm/tags>`_ image offers
|
||||
a prebuilt, optimized environment designed for validating large language model
|
||||
(LLM) inference performance on the AMD Instinct™ MI300X accelerator. This
|
||||
ROCm vLLM Docker image integrates vLLM and PyTorch tailored specifically for the
|
||||
MI300X accelerator and includes the following components:
|
||||
|
||||
* `ROCm 6.2.1 <https://github.com/ROCm/ROCm>`_
|
||||
|
||||
* `vLLM 0.6.4 <https://docs.vllm.ai/en/latest>`_
|
||||
|
||||
* `PyTorch 2.5.0 <https://github.com/pytorch/pytorch>`_
|
||||
|
||||
* Tuning files (in CSV format)
|
||||
|
||||
With this Docker image, you can quickly validate the expected inference
|
||||
performance numbers on the MI300X accelerator. This topic also provides tips on
|
||||
optimizing performance with popular AI models.
|
||||
|
||||
.. hlist::
|
||||
:columns: 6
|
||||
|
||||
* Llama 3.1 8B
|
||||
|
||||
* Llama 3.1 70B
|
||||
|
||||
* Llama 3.1 405B
|
||||
|
||||
* Llama 2 7B
|
||||
|
||||
* Llama 2 70B
|
||||
|
||||
* Mixtral 8x7B
|
||||
|
||||
* Mixtral 8x22B
|
||||
|
||||
* Mistral 7B
|
||||
|
||||
* Qwen2 7B
|
||||
|
||||
* Qwen2 72B
|
||||
|
||||
* JAIS 13B
|
||||
|
||||
* JAIS 30B
|
||||
|
||||
.. _vllm-benchmark-vllm:
|
||||
|
||||
.. note::
|
||||
|
||||
vLLM is a toolkit and library for LLM inference and serving. AMD implements
|
||||
high-performance custom kernels and modules in vLLM to enhance performance.
|
||||
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
|
||||
more information.
|
||||
|
||||
Getting started
|
||||
===============
|
||||
|
||||
Use the following procedures to reproduce the benchmark results on an
|
||||
MI300X accelerator with the prebuilt vLLM Docker image.
|
||||
|
||||
.. _vllm-benchmark-get-started:
|
||||
|
||||
1. Disable NUMA auto-balancing.
|
||||
|
||||
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
|
||||
might hang until the periodic balancing is finalized. For more information,
|
||||
see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# disable automatic NUMA balancing
|
||||
sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
|
||||
# check if NUMA balancing is disabled (returns 0 if disabled)
|
||||
cat /proc/sys/kernel/numa_balancing
|
||||
0
|
||||
|
||||
2. Download the :ref:`ROCm vLLM Docker image <vllm-benchmark-unified-docker>`.
|
||||
|
||||
Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
|
||||
|
||||
Once setup is complete, you can choose between two options to reproduce the
|
||||
benchmark results:
|
||||
|
||||
- :ref:`MAD-integrated benchmarking <vllm-benchmark-mad-v064>`
|
||||
|
||||
- :ref:`Standalone benchmarking <vllm-benchmark-standalone-v064>`
|
||||
|
||||
.. _vllm-benchmark-mad-v064:
|
||||
|
||||
MAD-integrated benchmarking
|
||||
===========================
|
||||
|
||||
Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD
|
||||
pip install -r requirements.txt
|
||||
|
||||
Use this command to run a performance benchmark test of the Llama 3.1 8B model
|
||||
on one GPU with ``float16`` data type in the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||
python3 tools/run_models.py --tags pyt_vllm_llama-3.1-8b --keep-model-dir --live-output --timeout 28800
|
||||
|
||||
ROCm MAD launches a Docker container with the name
|
||||
``container_ci-pyt_vllm_llama-3.1-8b``. The latency and throughput reports of the
|
||||
model are collected in the following path: ``~/MAD/reports_float16/``.
|
||||
|
||||
Although the following models are preconfigured to collect latency and
|
||||
throughput performance data, you can also change the benchmarking parameters.
|
||||
Refer to the :ref:`Standalone benchmarking <vllm-benchmark-standalone-v064>` section.
|
||||
|
||||
Available models
|
||||
----------------
|
||||
|
||||
.. hlist::
|
||||
:columns: 3
|
||||
|
||||
* ``pyt_vllm_llama-3.1-8b``
|
||||
|
||||
* ``pyt_vllm_llama-3.1-70b``
|
||||
|
||||
* ``pyt_vllm_llama-3.1-405b``
|
||||
|
||||
* ``pyt_vllm_llama-2-7b``
|
||||
|
||||
* ``pyt_vllm_llama-2-70b``
|
||||
|
||||
* ``pyt_vllm_mixtral-8x7b``
|
||||
|
||||
* ``pyt_vllm_mixtral-8x22b``
|
||||
|
||||
* ``pyt_vllm_mistral-7b``
|
||||
|
||||
* ``pyt_vllm_qwen2-7b``
|
||||
|
||||
* ``pyt_vllm_qwen2-72b``
|
||||
|
||||
* ``pyt_vllm_jais-13b``
|
||||
|
||||
* ``pyt_vllm_jais-30b``
|
||||
|
||||
* ``pyt_vllm_llama-3.1-8b_fp8``
|
||||
|
||||
* ``pyt_vllm_llama-3.1-70b_fp8``
|
||||
|
||||
* ``pyt_vllm_llama-3.1-405b_fp8``
|
||||
|
||||
* ``pyt_vllm_mixtral-8x7b_fp8``
|
||||
|
||||
* ``pyt_vllm_mixtral-8x22b_fp8``
|
||||
|
||||
.. _vllm-benchmark-standalone-v064:
|
||||
|
||||
Standalone benchmarking
|
||||
=======================
|
||||
|
||||
You can run the vLLM benchmark tool independently by starting the
|
||||
:ref:`Docker container <vllm-benchmark-get-started>` as shown in the following
|
||||
snippet.
|
||||
|
||||
.. code-block::
|
||||
|
||||
docker pull rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
|
||||
docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 128G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name vllm_v0.6.4 rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
|
||||
|
||||
In the Docker container, clone the ROCm MAD repository and navigate to the
|
||||
benchmark scripts directory at ``~/MAD/scripts/vllm``.
|
||||
|
||||
.. code-block::
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD/scripts/vllm
|
||||
|
||||
Command
|
||||
-------
|
||||
|
||||
To start the benchmark, use the following command with the appropriate options.
|
||||
See :ref:`Options <vllm-benchmark-standalone-v064-options>` for the list of
|
||||
options and their descriptions.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./vllm_benchmark_report.sh -s $test_option -m $model_repo -g $num_gpu -d $datatype
|
||||
|
||||
See the :ref:`examples <vllm-benchmark-run-benchmark-v064>` for more information.
|
||||
|
||||
.. note::
|
||||
|
||||
The input sequence length, output sequence length, and tensor parallel (TP) are
|
||||
already configured. You don't need to specify them with this script.
|
||||
|
||||
.. note::
|
||||
|
||||
If you encounter the following error, pass your access-authorized Hugging
|
||||
Face token to the gated models.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
OSError: You are trying to access a gated repo.
|
||||
|
||||
# pass your HF_TOKEN
|
||||
export HF_TOKEN=$your_personal_hf_token
|
||||
|
||||
.. _vllm-benchmark-standalone-v064-options:
|
||||
|
||||
Options
|
||||
-------
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:align: center
|
||||
|
||||
* - Name
|
||||
- Options
|
||||
- Description
|
||||
|
||||
* - ``$test_option``
|
||||
- latency
|
||||
- Measure decoding token latency
|
||||
|
||||
* -
|
||||
- throughput
|
||||
- Measure token generation throughput
|
||||
|
||||
* -
|
||||
- all
|
||||
- Measure both throughput and latency
|
||||
|
||||
* - ``$model_repo``
|
||||
- ``meta-llama/Meta-Llama-3.1-8B-Instruct``
|
||||
- Llama 3.1 8B
|
||||
|
||||
* - (``float16``)
|
||||
- ``meta-llama/Meta-Llama-3.1-70B-Instruct``
|
||||
- Llama 3.1 70B
|
||||
|
||||
* -
|
||||
- ``meta-llama/Meta-Llama-3.1-405B-Instruct``
|
||||
- Llama 3.1 405B
|
||||
|
||||
* -
|
||||
- ``meta-llama/Llama-2-7b-chat-hf``
|
||||
- Llama 2 7B
|
||||
|
||||
* -
|
||||
- ``meta-llama/Llama-2-70b-chat-hf``
|
||||
- Llama 2 70B
|
||||
|
||||
* -
|
||||
- ``mistralai/Mixtral-8x7B-Instruct-v0.1``
|
||||
- Mixtral 8x7B
|
||||
|
||||
* -
|
||||
- ``mistralai/Mixtral-8x22B-Instruct-v0.1``
|
||||
- Mixtral 8x22B
|
||||
|
||||
* -
|
||||
- ``mistralai/Mistral-7B-Instruct-v0.3``
|
||||
- Mistral 7B
|
||||
|
||||
* -
|
||||
- ``Qwen/Qwen2-7B-Instruct``
|
||||
- Qwen2 7B
|
||||
|
||||
* -
|
||||
- ``Qwen/Qwen2-72B-Instruct``
|
||||
- Qwen2 72B
|
||||
|
||||
* -
|
||||
- ``core42/jais-13b-chat``
|
||||
- JAIS 13B
|
||||
|
||||
* -
|
||||
- ``core42/jais-30b-chat-v3``
|
||||
- JAIS 30B
|
||||
|
||||
* - ``$model_repo``
|
||||
- ``amd/Meta-Llama-3.1-8B-Instruct-FP8-KV``
|
||||
- Llama 3.1 8B
|
||||
|
||||
* - (``float8``)
|
||||
- ``amd/Meta-Llama-3.1-70B-Instruct-FP8-KV``
|
||||
- Llama 3.1 70B
|
||||
|
||||
* -
|
||||
- ``amd/Meta-Llama-3.1-405B-Instruct-FP8-KV``
|
||||
- Llama 3.1 405B
|
||||
|
||||
* -
|
||||
- ``amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV``
|
||||
- Mixtral 8x7B
|
||||
|
||||
* -
|
||||
- ``amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV``
|
||||
- Mixtral 8x22B
|
||||
|
||||
* - ``$num_gpu``
|
||||
- 1 or 8
|
||||
- Number of GPUs
|
||||
|
||||
* - ``$datatype``
|
||||
- ``float16`` or ``float8``
|
||||
- Data type
|
||||
|
||||
.. _vllm-benchmark-run-benchmark-v064:
|
||||
|
||||
Running the benchmark on the MI300X accelerator
|
||||
-----------------------------------------------
|
||||
|
||||
Here are some examples of running the benchmark with various options.
|
||||
See :ref:`Options <vllm-benchmark-standalone-v064-options>` for the list of
|
||||
options and their descriptions.
|
||||
|
||||
Example 1: latency benchmark
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Use this command to benchmark the latency of the Llama 3.1 8B model on one GPU with the ``float16`` and ``float8`` data types.
|
||||
|
||||
.. code-block::
|
||||
|
||||
./vllm_benchmark_report.sh -s latency -m meta-llama/Meta-Llama-3.1-8B-Instruct -g 1 -d float16
|
||||
./vllm_benchmark_report.sh -s latency -m amd/Meta-Llama-3.1-8B-Instruct-FP8-KV -g 1 -d float8
|
||||
|
||||
Find the latency reports at:
|
||||
|
||||
- ``./reports_float16/summary/Meta-Llama-3.1-8B-Instruct_latency_report.csv``
|
||||
|
||||
- ``./reports_float8/summary/Meta-Llama-3.1-8B-Instruct-FP8-KV_latency_report.csv``
|
||||
|
||||
Example 2: throughput benchmark
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Use this command to benchmark the throughput of the Llama 3.1 8B model on one GPU with the ``float16`` and ``float8`` data types.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./vllm_benchmark_report.sh -s throughput -m meta-llama/Meta-Llama-3.1-8B-Instruct -g 1 -d float16
|
||||
./vllm_benchmark_report.sh -s throughput -m amd/Meta-Llama-3.1-8B-Instruct-FP8-KV -g 1 -d float8
|
||||
|
||||
Find the throughput reports at:
|
||||
|
||||
- ``./reports_float16/summary/Meta-Llama-3.1-8B-Instruct_throughput_report.csv``
|
||||
|
||||
- ``./reports_float8/summary/Meta-Llama-3.1-8B-Instruct-FP8-KV_throughput_report.csv``
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<style>
|
||||
mjx-container[jax="CHTML"][display="true"] {
|
||||
text-align: left;
|
||||
margin: 0;
|
||||
}
|
||||
</style>
|
||||
|
||||
.. note::
|
||||
|
||||
Throughput is calculated as:
|
||||
|
||||
- .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
|
||||
|
||||
- .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
|
||||
|
||||
Further reading
|
||||
===============
|
||||
|
||||
- For application performance optimization strategies for HPC and AI workloads,
|
||||
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
|
||||
|
||||
- To learn more about the options for latency and throughput benchmark scripts,
|
||||
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
|
||||
|
||||
- To learn more about system settings and management practices to configure your system for
|
||||
MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
|
||||
|
||||
- To learn how to run community models from Hugging Face on AMD GPUs, see
|
||||
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
|
||||
|
||||
- To learn how to fine-tune LLMs and optimize inference, see
|
||||
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
|
||||
|
||||
- For a list of other ready-made Docker images for AI with ROCm, see
|
||||
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
|
||||
|
||||
Previous versions
|
||||
=================
|
||||
|
||||
See :doc:`vllm-history` to find documentation for previous releases
|
||||
of the ``ROCm/vllm`` Docker image.
|
||||