Compare commits


4 Commits

Author          SHA1        Message                                        Date
Wang, Yanyao    66e8ada46c  Test ROCm 6.1.2 build                          2024-06-05 14:48:12 -07:00
Wang, Yanyao    673f1eed48  Fix Markdown formate for the linter check      2024-06-05 10:41:25 -07:00
Wang, Yanyao    5642d4f7b0  Update the branch of ROCm repo after testing   2024-06-05 09:50:20 -07:00
Wang, Yanyao    7f93d1635d  Build ROCm from source                         2024-06-04 21:20:04 -07:00
212 changed files with 11273 additions and 5577 deletions

View File

@@ -17,7 +17,11 @@ resources:
pipelines:
- pipeline: rocr-runtime_pipeline
source: \ROCR-Runtime
trigger: true
trigger:
branches:
include:
- master
# this job will only be triggered after successful build sequence of llvm-project and ROCR-Runtime
trigger: none
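
For reference, a minimal sketch of the pipeline-resource trigger pattern the hunk above appears to move to: the `ROCR-Runtime` resource trigger is filtered to `master`, while the pipeline's own CI trigger is disabled so it only runs after the upstream build. Names mirror the hunk; the rest of the file is not shown here.

```yaml
# Sketch only: resource trigger filtered to master, own CI trigger disabled.
resources:
  pipelines:
    - pipeline: rocr-runtime_pipeline      # local alias used by this pipeline
      source: \ROCR-Runtime                # upstream pipeline definition name
      trigger:
        branches:
          include:
            - master                       # only master builds of ROCR-Runtime trigger this
trigger: none                              # no CI trigger of its own; runs only via the resource trigger
```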

View File

@@ -84,10 +84,10 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
extraBuildFlags: >-
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-DCMAKE_BUILD_TYPE=Release
-DGPU_TARGETS=gfx942
-DAMDGPU_TARGETS=gfx1030;gfx1100
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm/llvm;$(Agent.BuildDirectory)/rocm
-DHALF_INCLUDE_DIR=$(Agent.BuildDirectory)/rocm/include
-DMIGRAPHX_USE_COMPOSABLEKERNEL=OFF
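
A minimal sketch of how these flags reach CMake: the shared build-cmake step takes a single `extraBuildFlags` string, and the `>-` folded scalar joins the lines into one space-separated argument list. Paths and flag values below are taken from the hunk; other parameters of the template are omitted.

```yaml
# Sketch only: folded CMake flag list handed to the shared build-cmake template.
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
  parameters:
    extraBuildFlags: >-
      -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
      -DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
      -DCMAKE_BUILD_TYPE=Release
      -DAMDGPU_TARGETS=gfx1030;gfx1100
      -GNinja
```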

View File

@@ -16,15 +16,12 @@ parameters:
- libbz2-dev
- nlohmann-json3-dev
- libgtest-dev
- libdrm-dev
- name: rocmDependencies
type: object
default:
- rocMLIR
- rocRAND
- rocBLAS
- hipBLAS
- hipBLASLt
- half
- composable_kernel
- rocm-cmake
@@ -33,14 +30,13 @@ parameters:
- rocprofiler-register
- clr
- rocminfo
- roctracer
jobs:
- job: MIOpen
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ variables.LARGE_DISK_BUILD_POOL }}
pool: ${{ variables.MEDIUM_BUILD_POOL }}
workspace:
clean: all
steps:

View File

@@ -13,7 +13,7 @@ parameters:
- libyaml-cpp-dev
- libpci-dev
- libpci3
- libgtest-dev
- googletest
- git
- name: rocmDependencies
type: object
@@ -35,10 +35,6 @@ jobs:
- template: /.azuredevops/variables-global.yml
- name: HIP_ROCCLR_HOME
value: $(Build.BinariesDirectory)/rocm
- name: ROCM_PATH
value: $(Agent.BuildDirectory)/rocm
- name: HIP_INC_DIR
value: $(Agent.BuildDirectory)/rocm
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
workspace:
@@ -63,17 +59,10 @@ jobs:
parameters:
dependencyList: ${{ parameters.rocmDependencies }}
dependencySource: tag-builds
# Set link to redirect llvm folder
- task: Bash@3
displayName: create symlink
inputs:
targetType: inline
script: ln -s $(Agent.BuildDirectory)/rocm/llvm $(Agent.BuildDirectory)/rocm/lib/llvm
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
extraBuildFlags: >-
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang++
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DCPACK_PACKAGING_INSTALL_PREFIX=$(Build.BinariesDirectory)
-GNinja

View File

@@ -12,7 +12,6 @@ parameters:
- ninja-build
- git
- python3-pip
- libdrm-dev
- name: rocmDependencies
type: object
default:
@@ -25,11 +24,10 @@ parameters:
jobs:
- job: composable_kernel
timeoutInMinutes: 100
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ variables.ULTRA_BUILD_POOL }}
pool: ${{ variables.MEDIUM_BUILD_POOL }}
workspace:
clean: all
steps:
@@ -59,6 +57,6 @@ jobs:
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_BUILD_TYPE=Release
-DGPU_TARGETS=gfx942
-DGPU_TARGETS=gfx1030;gfx1100
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -77,6 +77,7 @@ jobs:
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DAMDGPU_TARGETS=gfx1030;gfx1100
-DHIP_PLATFORM=amd
-DBUILD_CLIENTS_TESTS=ON
-DBUILD_CLIENTS_BENCHMARKS=OFF

View File

@@ -8,13 +8,12 @@ parameters:
- name: aptPackages
type: object
default:
- gfortran
- git
- libdrm-dev
- libmsgpack-dev
- ninja-build
- python3-pip
- python3-venv
- libmsgpack-dev
- git
- python3-pip
- libdrm-dev
- name: pipModules
type: object
default:
@@ -22,16 +21,15 @@ parameters:
- name: rocmDependencies
type: object
default:
- clr
- hipBLAS
- llvm-project
- ROCR-Runtime
- clr
- rocminfo
- rocprofiler-register
- ROCR-Runtime
- hipBLAS
jobs:
- job: hipBLASLt
timeoutInMinutes: 100
variables:
- group: common
- template: /.azuredevops/variables-global.yml
@@ -60,7 +58,7 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
# CI case: download latest default branch build
# CI case: download latest default branch build
- ${{ if eq(parameters.checkoutRef, '') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
@@ -74,42 +72,17 @@ jobs:
dependencySource: tag-builds
- script: sudo ln -s $(Agent.BuildDirectory)/rocm /opt/rocm
displayName: ROCm symbolic link
# Build and install gtest, lapack, hipBLAS-common
# $(Pipeline.Workspace)/deps is a temporary folder for the build process
# $(Pipeline.Workspace)/s/deps is part of the hipBLASLt repo
- script: mkdir $(Pipeline.Workspace)/deps
# hipBLASLt already has a CMake script for external deps, so we can just run that
# https://github.com/ROCm/hipBLASLt/blob/develop/deps/CMakeLists.txt
- script: cmake $(Pipeline.Workspace)/s/deps
displayName: Configure hipBLASLt external dependencies
workingDirectory: $(Pipeline.Workspace)/deps
- script: make
displayName: Build hipBLASLt external dependencies
workingDirectory: $(Pipeline.Workspace)/deps
- script: sudo make install
displayName: Install hipBLASLt external dependencies
workingDirectory: $(Pipeline.Workspace)/deps
# Set link to redirect llvm folder
- task: Bash@3
displayName: Symlink to rocm/lib/llvm
inputs:
targetType: inline
script: ln -s $(Agent.BuildDirectory)/rocm/llvm $(Agent.BuildDirectory)/rocm/lib/llvm
- script: sudo chmod 777 /mnt
displayName: 'Set permissions for /mnt'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
cmakeBuildDir: /mnt/build
cmakeSourceDir: $(Pipeline.Workspace)/s
extraBuildFlags: >-
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-DAMDGPU_TARGETS=gfx942
-DAMDGPU_TARGETS=gfx90a
-DTensile_LOGIC=
-DTensile_CPU_THREADS=
-DTensile_CODE_OBJECT_VERSION=default
-DTensile_LIBRARY_FORMAT=msgpack
-DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm"
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
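
A minimal sketch of the "build on /mnt" pattern used above, assuming the agent exposes a large scratch volume at /mnt: the workspace checkout stays in place, while the out-of-tree build directory is redirected via the template's `cmakeBuildDir` and `cmakeSourceDir` parameters.

```yaml
# Sketch only: out-of-tree build on the /mnt scratch disk, then artifact upload.
- script: sudo chmod 777 /mnt
  displayName: Set permissions for /mnt
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
  parameters:
    cmakeBuildDir: /mnt/build                    # build directory on the large disk
    cmakeSourceDir: $(Pipeline.Workspace)/s      # checked-out sources
    extraBuildFlags: >-
      -DCMAKE_BUILD_TYPE=Release
      -GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
```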

View File

@@ -57,6 +57,6 @@ jobs:
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DBUILD_TEST=ON
-DAMDGPU_TARGETS=gfx942
-DAMDGPU_TARGETS=gfx1030;gfx1100
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -66,7 +66,7 @@ jobs:
-DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm/lib/cmake/hip
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_BUILD_TYPE=Release
-DAMDGPU_TARGETS=gfx942
-DAMDGPU_TARGETS=gfx1030;gfx1100
-DUSE_HIP_CLANG=ON
-DHIP_COMPILER=clang
-DBUILD_CLIENTS_TESTS=ON

View File

@@ -61,6 +61,6 @@ jobs:
-DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm/lib/cmake/hip
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_BUILD_TYPE=Release
-DAMDGPU_TARGETS=gfx942
-DAMDGPU_TARGETS=gfx1030;gfx1100
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -74,6 +74,7 @@ jobs:
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-DAMDGPU_TARGETS=gfx1030;gfx1100
-DBUILD_CLIENTS_TESTS=ON
-DUSE_CUDA=OFF
-GNinja

View File

@@ -65,13 +65,3 @@ jobs:
-DBUILD_CLIENTS_SAMPLES=OFF
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
artifactName: hipSPARSE
publish: false
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-prepare-package.yml
parameters:
sourceDir: $(Build.SourcesDirectory)/build/clients
contentsString: matrices/**
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
artifactName: testMatrices

View File

@@ -75,7 +75,7 @@ jobs:
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-DAMDGPU_TARGETS=gfx942
-DAMDGPU_TARGETS=all
-DTensile_LOGIC=
-DTensile_CPU_THREADS=
-DTensile_CODE_OBJECT_VERSION=default

View File

@@ -55,9 +55,9 @@ jobs:
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/rocm/llvm
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
-DROCM_PATH="$(Agent.BuildDirectory)/rocm"
-DCMAKE_BUILD_TYPE=Release
-DHIPTENSOR_BUILD_TESTS=ON
-DAMDGPU_TARGETS=gfx942
multithreadFlag: -- -j32
-DAMDGPU_TARGETS=gfx1030;gfx1100
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -61,7 +61,6 @@ jobs:
parameters:
dependencyList: ${{ parameters.rocmDependencies }}
dependencySource: tag-builds
- script: chmod +x $(Agent.BuildDirectory)/rocm/bin/hipify-perl
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
extraBuildFlags: >-
@@ -72,6 +71,6 @@ jobs:
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
-DBUILD_TESTS=ON
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/rocm/share/rocm/cmake/
-DAMDGPU_TARGETS=gfx942
-DAMDGPU_TARGETS=gfx1030;gfx1100
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -1,138 +0,0 @@
parameters:
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
- name: aptPackages
type: object
default:
- python3-pip
- python3-protobuf
- cmake
- ninja-build
- libprotobuf-dev
- libprotoc-dev
- protobuf-compiler
- liblmdb-dev
- pkg-config
- ffmpeg
- libavcodec-dev
- libavformat-dev
- libavutil-dev
- libswscale-dev
- libturbojpeg-dev
- libjpeg-turbo-official=3.0.2-20240124
- libopencv-dev
- name: pipModules
type: object
default:
- numpy
- opencv-python
- torch
- pillow
- name: rocmDependencies
type: object
default:
- rocm-cmake
- llvm-project
- ROCR-Runtime
- clr
- rocDecode
- half
- rpp
- MIVisionX
- aomp
jobs:
- job: rocAL
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
workspace:
clean: all
steps:
- task: Bash@3
displayName: 'Register libjpeg-turbo packages'
inputs:
targetType: inline
script: |
sudo mkdir --parents --mode=0755 /etc/apt/keyrings
wget -q -O- https://packagecloud.io/dcommander/libjpeg-turbo/gpgkey | gpg --dearmor | sudo tee /etc/apt/trusted.gpg.d/libjpeg-turbo.gpg > /dev/null
echo "deb [signed-by=/etc/apt/trusted.gpg.d/libjpeg-turbo.gpg] https://packagecloud.io/dcommander/libjpeg-turbo/any/ any main" | sudo tee /etc/apt/sources.list.d/libjpeg-turbo.list
sudo apt update
apt-cache show libjpeg-turbo-official | grep Version
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- task: Bash@3
displayName: 'Clone PyBind11'
inputs:
targetType: inline
script: git clone --depth 1 -b v2.11.1 https://github.com/pybind/pybind11
workingDirectory: '$(Build.SourcesDirectory)'
- task: Bash@3
displayName: 'Clone RapidJSON'
inputs:
targetType: inline
script: git clone --depth 1 https://github.com/Tencent/rapidjson.git
workingDirectory: '$(Build.SourcesDirectory)'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: PyBind11
cmakeBuildDir: '$(Build.SourcesDirectory)/pybind11/build'
customInstallPath: false
installEnabled: false
extraBuildFlags: >-
-DDOWNLOAD_CATCH=ON
-DDOWNLOAD_EIGEN=ON
-GNinja
- task: Bash@3
displayName: 'Install PyBind11'
inputs:
targetType: inline
script: sudo cmake --build . --target install
workingDirectory: '$(Build.SourcesDirectory)/pybind11/build'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: RapidJSON
cmakeBuildDir: '$(Build.SourcesDirectory)/rapidjson/build'
customInstallPath: false
installEnabled: false
extraBuildFlags: >-
-GNinja
- task: Bash@3
displayName: 'Install RapidJSON'
inputs:
targetType: inline
script: sudo cmake --build . --target install
workingDirectory: '$(Build.SourcesDirectory)/rapidjson/build'
# CI case: download latest default branch build
- ${{ if eq(parameters.checkoutRef, '') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
dependencyList: ${{ parameters.rocmDependencies }}
dependencySource: staging
# manual build case: triggered by ROCm/ROCm repo
- ${{ if ne(parameters.checkoutRef, '') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
dependencyList: ${{ parameters.rocmDependencies }}
dependencySource: tag-builds
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
extraBuildFlags: >-
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;/opt/libjpeg-turbo
-DCMAKE_INSTALL_PREFIX_PYTHON=$Python3_STDARCH
-DCMAKE_BUILD_TYPE=Release
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -67,7 +67,7 @@ jobs:
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/rocm/share/rocm/cmake/
-DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/rocm/lib/cmake/hip
-DAMDGPU_TARGETS=gfx942
-DAMDGPU_TARGETS=gfx1030;gfx1100
-DBUILD_CLIENTS_TESTS=ON
-DBUILD_CLIENTS_BENCHMARKS=OFF
-DBUILD_CLIENTS_SAMPLES=OFF

View File

@@ -108,7 +108,7 @@ jobs:
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/bin/hipcc
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/bin/hipcc
-DAMDGPU_TARGETS=gfx942
-DAMDGPU_TARGETS=gfx1030;gfx1100
-DTensile_CODE_OBJECT_VERSION=default
-DTensile_LOGIC=asm_full
-DTensile_SEPARATE_ARCHITECTURES=ON

View File

@@ -64,7 +64,7 @@ jobs:
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_BUILD_TYPE=Release
-DAMDGPU_TARGETS=gfx942
-DAMDGPU_TARGETS=gfx1030;gfx1100
-DUSE_HIP_CLANG=ON
-DHIP_COMPILER=clang
-DBUILD_CLIENTS_TESTS=ON

View File

@@ -10,13 +10,6 @@ parameters:
default:
- cmake
- ninja-build
- git
- python3-pip
- name: rocmDependencies
type: object
default:
- llvm-project
- rocm-cmake
jobs:
- job: rocMLIR
@@ -24,6 +17,8 @@ jobs:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ variables.MEDIUM_BUILD_POOL }}
container:
image: ${{ variables.DOCKER_IMAGE_NAME }}:${{ variables.LATEST_DOCKER_VERSION }}
workspace:
clean: all
steps:
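
A minimal sketch of the container job introduced above: the job's steps run inside a prebuilt Docker image rather than directly on the agent, with the image name and tag taken from pipeline variables. The checkout step is illustrative; the real job uses the shared checkout template.

```yaml
# Sketch only: running a job inside a container image named by pipeline variables.
jobs:
  - job: rocMLIR
    pool: ${{ variables.MEDIUM_BUILD_POOL }}
    container:
      image: ${{ variables.DOCKER_IMAGE_NAME }}:${{ variables.LATEST_DOCKER_VERSION }}
    workspace:
      clean: all
    steps:
      - checkout: self        # illustrative; the component templates use their own checkout step
```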
@@ -34,25 +29,13 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
# CI case: download latest default branch build
- ${{ if eq(parameters.checkoutRef, '') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
dependencyList: ${{ parameters.rocmDependencies }}
dependencySource: staging
# manual build case: triggered by ROCm/ROCm repo
- ${{ if ne(parameters.checkoutRef, '') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
dependencyList: ${{ parameters.rocmDependencies }}
dependencySource: tag-builds
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
extraBuildFlags: >-
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/amdclang++
-DCMAKE_C_COMPILER=/opt/rocm/llvm/bin/amdclang
-DCMAKE_PREFIX_PATH=/opt/rocm
-DBUILD_FAT_LIBROCKCOMPILER=1
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -59,7 +59,7 @@ jobs:
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DBUILD_BENCHMARK=ON
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DAMDGPU_TARGETS=gfx942
-DAMDGPU_TARGETS=gfx1030;gfx1100
-DBUILD_TEST=ON
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -59,6 +59,6 @@ jobs:
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DBUILD_TEST=ON
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DAMDGPU_TARGETS=gfx942
-DAMDGPU_TARGETS=gfx1030;gfx1100
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -82,7 +82,7 @@ jobs:
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Pipeline.Workspace)/deps-install
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-DAMDGPU_TARGETS=gfx942
-DAMDGPU_TARGETS=gfx1030;gfx1100
-DBUILD_CLIENTS_TESTS=ON
-DBUILD_CLIENTS_BENCHMARKS=OFF
-DBUILD_CLIENTS_SAMPLES=OFF

View File

@@ -68,20 +68,10 @@ jobs:
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_BUILD_TYPE=Release
-DAMDGPU_TARGETS=gfx942
-DAMDGPU_TARGETS=gfx1030;gfx1100
-DBUILD_CLIENTS_SAMPLES=OFF
-DBUILD_CLIENTS_TESTS=ON
-DBUILD_CLIENTS_BENCHMARKS=OFF
-DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm/lib/cmake/hip;$(Agent.BuildDirectory)/rocm/hip/cmake
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
artifactName: rocSPARSE
publish: false
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-prepare-package.yml
parameters:
sourceDir: $(Build.SourcesDirectory)/build/clients
contentsString: matrices/**
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
artifactName: testMatrices

View File

@@ -60,7 +60,7 @@ jobs:
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DAMDGPU_TARGETS=gfx942
-DAMDGPU_TARGETS=gfx1030;gfx1100
-DBUILD_TEST=ON
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -65,7 +65,7 @@ jobs:
-DCMAKE_BUILD_TYPE=Release
-DROCWMMA_BUILD_TESTS=ON
-DROCWMMA_BUILD_SAMPLES=OFF
-DAMDGPU_TARGETS=gfx942
-DGPU_TARGETS=gfx1100
-GNinja
# gfx1030 not supported in documentation
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -5,30 +5,6 @@ parameters:
- name: checkoutRef
type: string
default: ''
- name: aptPackages
type: object
default:
- libglfw3-dev
- name: rocmDependencies
type: object
default:
- AMDMIGraphX
- clr
- hipBLAS
- hipCUB
- HIPIFY
- hipRAND
- hipSOLVER
- hipSPARSE
- llvm-project
- rocBLAS
- rocPRIM
- rocprofiler-register
- ROCR-Runtime
- rocRAND
- rocSOLVER
- rocSPARSE
- rocThrust
jobs:
- job: rocm_examples
@@ -44,28 +20,5 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
# CI case: download latest default branch build
- ${{ if eq(parameters.checkoutRef, '') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
dependencyList: ${{ parameters.rocmDependencies }}
dependencySource: staging
# manual build case: triggered by ROCm/ROCm repo
- ${{ if ne(parameters.checkoutRef, '') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
dependencyList: ${{ parameters.rocmDependencies }}
dependencySource: tag-builds
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
# https://github.com/ROCm/HIP/issues/2203
extraBuildFlags: >-
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DROCM_ROOT=$(Agent.BuildDirectory)/rocm
-DCMAKE_HIP_ARCHITECTURES=gfx942
-DCMAKE_EXE_LINKER_FLAGS=-fgpu-rdc
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -21,17 +21,4 @@ jobs:
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: rocprofiler-register
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: rocprofiler-register-tests
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH=$(Build.BinariesDirectory)
cmakeBuildDir: 'tests/build'
installEnabled: false
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: rocprofiler-register
testDir: 'tests/build'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -47,7 +47,7 @@ jobs:
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: HIP_ROCCLR_HOME
- name: HIP_ROCCLR_HOME
value: $(Agent.BuildDirectory)/rocm
- name: ROCM_PATH
value: $(Agent.BuildDirectory)/rocm
@@ -68,7 +68,7 @@ jobs:
displayName: 'Download aqlprofile'
inputs:
targetType: inline
script: wget -nv https://repo.radeon.com/rocm/misc/aqlprofile/ubuntu-22.04/hsa-amd-aqlprofile_1.0.0.60200.60200-crdnnh.14213~22.04_amd64.deb
script: wget -nv https://repo.radeon.com/rocm/apt/6.1/pool/main/h/hsa-amd-aqlprofile/hsa-amd-aqlprofile_1.0.0.60100.60100-82~22.04_amd64.deb
workingDirectory: '$(Pipeline.Workspace)'
- task: Bash@3
displayName: 'Extract aqlprofile'
@@ -76,7 +76,7 @@ jobs:
targetType: inline
script: |
mkdir hsa-amd-aqlprofile
dpkg-deb -R hsa-amd-aqlprofile_1.0.0.60200.60200-crdnnh.14213~22.04_amd64.deb hsa-amd-aqlprofile
dpkg-deb -R hsa-amd-aqlprofile_1.0.0.60100.60100-82~22.04_amd64.deb hsa-amd-aqlprofile
workingDirectory: '$(Pipeline.Workspace)'
- task: Bash@3
displayName: 'Move aqlprofile'
@@ -84,7 +84,7 @@ jobs:
targetType: inline
script: |
mkdir -p $(Agent.BuildDirectory)/rocm
cp -R hsa-amd-aqlprofile/opt/rocm-6.2.0-14213/* $(Agent.BuildDirectory)/rocm
cp -R hsa-amd-aqlprofile/opt/rocm-6.1.0/* $(Agent.BuildDirectory)/rocm
workingDirectory: '$(Pipeline.Workspace)'
# CI case: download latest default branch build
- ${{ if eq(parameters.checkoutRef, '') }}:
@@ -105,5 +105,5 @@ jobs:
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DENABLE_LDCONFIG=OFF
-DUSE_PROF_API=1
-DGPU_TARGETS=gfx942
-DGPU_TARGETS=gfx1030;gfx1100
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -15,13 +15,11 @@ parameters:
- name: rocmDependencies
type: object
default:
- rocm-cmake
- clr
- llvm-project
- ROCdbgapi
- rocminfo
- ROCR-Runtime
- rocprofiler-register
jobs:
- job: rocr_debug_agent
@@ -58,6 +56,5 @@ jobs:
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm/lib/cmake;$(Agent.BuildDirectory)/rocm/lib/cmake/hip
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -65,6 +65,6 @@ jobs:
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm/lib/cmake/hip
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DGPU_TARGETS=gfx942
-DGPU_TARGETS=gfx1030;gfx1100
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -60,6 +60,6 @@ jobs:
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DHALF_INCLUDE_DIRS=$(Agent.BuildDirectory)/rocm/include
-DCMAKE_BUILD_TYPE=Release
-DAMDGPU_TARGETS=gfx942
-DAMDGPU_TARGETS=gfx1030;gfx1100
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -1,115 +0,0 @@
parameters:
# currently excludes clr and rocm-examples
- name: rocmDependencies
type: object
default:
- AMDMIGraphX
- amdsmi
- aomp-extras
- aomp
- composable_kernel
- half
- HIP
- hipBLAS
- hipBLASLt
- hipCUB
- hipFFT
- hipfort
- HIPIFY
- hipRAND
- hipSOLVER
- hipSPARSE
- hipSPARSELt
- hipTensor
- llvm-project
- MIOpen
- MIVisionX
- rccl
- rdc
- rocAL
- rocALUTION
- rocBLAS
- ROCdbgapi
- rocDecode
- rocFFT
- ROCgdb
- rocm-cmake
- rocm-core
- rocminfo
- rocMLIR
- ROCmValidationSuite
- rocm_bandwidth_test
- rocm_smi_lib
- rocPRIM
- rocprofiler-register
- rocprofiler
- ROCR-Runtime
- rocRAND
- rocr_debug_agent
- rocSOLVER
- rocSPARSE
- ROCT-Thunk-Interface
- rocThrust
- roctracer
- rocWMMA
- rpp
trigger: none
pr: none
schedules:
- cron: '30 7 * * *'
displayName: Nightly build
branches:
include:
- develop
always: true
jobs:
- job: rocm_nightly
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ variables.MEDIUM_BUILD_POOL }}
workspace:
clean: all
steps:
- task: DeleteFiles@1
displayName: 'Cleanup checkout space'
inputs:
SourceFolder: '$(Agent.BuildDirectory)/s'
Contents: '**/*'
- task: DeleteFiles@1
displayName: 'Cleanup Staging Area'
inputs:
SourceFolder: '$(Build.ArtifactStagingDirectory)'
Contents: '/**/*'
RemoveDotFiles: true
- script: sudo chmod 777 /mnt
displayName: 'Set permissions for /mnt'
- script: df -h
displayName: System disk space before ROCm
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
dependencyList: ${{ parameters.rocmDependencies }}
dependencySource: staging
extractToMnt: true
skipLibraryLinking: true
- script: df -h
displayName: System disk space after ROCm
- script: du -sh /mnt/rocm
displayName: Uncompressed ROCm size
- task: ArchiveFiles@2
displayName: Compress rocm-nightly
inputs:
rootFolderOrFile: /mnt/rocm
includeRootFolder: false
archiveType: tar
tarCompression: gz
archiveFile: $(Build.ArtifactStagingDirectory)/$(Build.DefinitionName)_$(Build.BuildNumber)_ubuntu2204.tar.gz
- script: du -sh $(Build.ArtifactStagingDirectory)
displayName: Compressed ROCm size
- task: PublishPipelineArtifact@1
displayName: 'Public ROCm Nightly Artifact'
retryCountOnTaskFailure: 3
inputs:
targetPath: '$(Build.ArtifactStagingDirectory)'

View File

@@ -1,29 +0,0 @@
variables:
- group: common
- template: /.azuredevops/variables-global.yml
parameters:
- name: checkoutRef
type: string
default: refs/tags/$(LATEST_RELEASE_TAG)
resources:
repositories:
- repository: pipelines_repo
type: github
endpoint: ROCm
name: ROCm/ROCm
- repository: release_repo
type: github
endpoint: ROCm
name: ROCm/rocAL
ref: ${{ parameters.checkoutRef }}
trigger: none
pr: none
jobs:
- template: ${{ variables.CI_COMPONENT_PATH }}/rocAL.yml
parameters:
checkoutRepo: release_repo
checkoutRef: ${{ parameters.checkoutRef }}

View File

@@ -1,29 +0,0 @@
variables:
- group: common
- template: /.azuredevops/variables-global.yml
parameters:
- name: checkoutRef
type: string
default: refs/tags/$(LATEST_RELEASE_TAG)
resources:
repositories:
- repository: pipelines_repo
type: github
endpoint: ROCm
name: ROCm/ROCm
- repository: release_repo
type: github
endpoint: ROCm
name: ROCm/rocm-examples
ref: ${{ parameters.checkoutRef }}
trigger: none
pr: none
jobs:
- template: ${{ variables.CI_COMPONENT_PATH }}/rocm-examples.yml
parameters:
checkoutRepo: release_repo
checkoutRef: ${{ parameters.checkoutRef }}

View File

@@ -9,63 +9,38 @@ parameters:
- name: useDefaultBranch
type: boolean
default: true
- name: extractToMnt
type: boolean
default: false
- name: defaultBranchList
type: object
default:
AMDMIGraphX: develop
amdsmi: develop
aomp-extras: aomp-dev
aomp: aomp-dev
aomp-extras: aomp-dev
AMDMIGraphX: develop
clr: develop
composable_kernel: develop
half: master
HIP: develop
hipBLAS: develop
hipBLASLt: develop
hipCUB: develop
hipFFT: develop
hipfort: develop
HIPIFY: amd-staging
hipRAND: develop
hipSOLVER: develop
hipSPARSE: develop
hipSPARSELt: develop
hipTensor: develop
llvm-project: amd-staging
MIOpen: develop
MIVisionX: develop
rccl: develop
rdc: develop
rocAL: develop
rocALUTION: develop
rocBLAS: develop
ROCdbgapi : amd-master
rocDecode: develop
rocFFT: develop
rocgdb: amd-staging
rocm-cmake: develop
rocm-core: master
rocm-examples: develop
rocminfo: amd-staging
rocMLIR: develop
ROCmValidationSuite: master
rocm_bandwidth_test: master
rocm_smi_lib: develop
rocminfo: master
rocMLIR: develop
rocPRIM: develop
rocprofiler-register: amd-mainline
rocprofiler: amd-master
ROCR-Runtime: master
rocRAND: develop
rocr_debug_agent: amd-staging
rocSOLVER: develop
rocSPARSE: develop
ROCT-Thunk-Interface: master
rocThrust: develop
roctracer: amd-master
rocWMMA: develop
rpp: master
- name: componentsFailureOkay
type: object
@@ -95,10 +70,7 @@ steps:
displayName: Extract ${{ parameters.componentName }}
inputs:
archiveFilePatterns: '$(Pipeline.Workspace)/d/**/*.tar.gz'
${{ if parameters.extractToMnt }}:
destinationFolder: '/mnt/rocm'
${{ else }}:
destinationFolder: '$(Agent.BuildDirectory)/rocm'
destinationFolder: '$(Agent.BuildDirectory)/rocm'
cleanDestinationFolder: false
overwriteExistingFiles: true
- task: DeleteFiles@1
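
A minimal sketch of the conditional extract step shown in the hunk: `${{ if }}` / `${{ else }}` template expressions are resolved at compile time, so the task receives exactly one `destinationFolder` depending on the `extractToMnt` parameter.

```yaml
# Sketch only: compile-time conditional selecting the extract destination.
- task: ExtractFiles@1
  displayName: Extract ${{ parameters.componentName }}
  inputs:
    archiveFilePatterns: '$(Pipeline.Workspace)/d/**/*.tar.gz'
    ${{ if parameters.extractToMnt }}:
      destinationFolder: '/mnt/rocm'
    ${{ else }}:
      destinationFolder: '$(Agent.BuildDirectory)/rocm'
    cleanDestinationFolder: false
    overwriteExistingFiles: true
```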

View File

@@ -5,15 +5,9 @@ parameters:
- name: extraBuildFlags
type: string
default: ''
- name: multithreadFlag
type: string
default: ''
- name: cmakeBuildDir
type: string
default: 'build'
- name: cmakeSourceDir
type: string
default: '..'
- name: cmakeTarget
type: string
default: 'install'
@@ -23,12 +17,6 @@ parameters:
- name: installDir
type: string
default: '$(Build.BinariesDirectory)'
- name: customInstallPath
type: boolean
default: true
- name: installEnabled
type: boolean
default: true
steps:
# create workingDirectory if it does not exist and change into it
@@ -37,27 +25,19 @@ steps:
displayName: '${{parameters.componentName }} CMake Flags'
inputs:
workingDirectory: ${{ parameters.cmakeBuildDir }}
${{ if eq(parameters.customInstallPath, true) }}:
cmakeArgs: -DCMAKE_INSTALL_PREFIX=${{ parameters.installDir }} ${{ parameters.extraBuildFlags }} ${{ parameters.cmakeSourceDir }}
${{ else }}:
cmakeArgs: ${{ parameters.extraBuildFlags }} ..
- script: df -h
displayName: Disk space before build
cmakeArgs: -DCMAKE_INSTALL_PREFIX=${{ parameters.installDir }} ${{ parameters.extraBuildFlags }} ..
# equivalent to running make $cmakeTargetDir from $cmakeBuildDir
# i.e., cd $cmakeBuildDir; make $cmakeTargetDir
- task: CMake@1
displayName: '${{parameters.componentName }} Build'
inputs:
workingDirectory: ${{ parameters.cmakeBuildDir }}
cmakeArgs: '--build ${{ parameters.cmakeTargetDir }} ${{ parameters.multithreadFlag }}'
cmakeArgs: '--build ${{ parameters.cmakeTargetDir }}'
retryCountOnTaskFailure: 10
- script: df -h
displayName: Disk space after build
# equivalent to running make $cmakeTarget from $cmakeBuildDir
# e.g., make install
- ${{ if eq(parameters.installEnabled, true) }}:
- task: CMake@1
displayName: '${{parameters.componentName }} ${{ parameters.cmakeTarget }}'
inputs:
workingDirectory: ${{ parameters.cmakeBuildDir }}
cmakeArgs: '--build ${{ parameters.cmakeTargetDir }} --target ${{ parameters.cmakeTarget }}'
- task: CMake@1
displayName: '${{parameters.componentName }} ${{ parameters.cmakeTarget }}'
inputs:
workingDirectory: ${{ parameters.cmakeBuildDir }}
cmakeArgs: '--build ${{ parameters.cmakeTargetDir }} --target ${{ parameters.cmakeTarget }}'
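
A minimal sketch of how a component job might call this template with the parameters introduced above. The parameter names come from the hunk; `componentName` and the flag values are illustrative only.

```yaml
# Sketch only: invoking the build-cmake template with the extended parameter set.
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
  parameters:
    componentName: exampleComponent      # hypothetical name, used for display only
    cmakeBuildDir: build
    cmakeSourceDir: ..
    installEnabled: true                 # run the install target after the build
    customInstallPath: true              # prepend -DCMAKE_INSTALL_PREFIX=<installDir>
    installDir: $(Build.BinariesDirectory)
    multithreadFlag: -- -j32             # forwarded to `cmake --build` for parallel builds
    extraBuildFlags: >-
      -DCMAKE_BUILD_TYPE=Release
      -GNinja
```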

View File

@@ -12,31 +12,23 @@ steps:
displayName: 'sudo apt-get update'
inputs:
targetType: inline
script: sudo apt-get --yes update
env:
DEBIAN_FRONTEND: noninteractive
script: sudo apt-get update
- task: Bash@3
displayName: 'sudo apt-get upgrade'
inputs:
targetType: inline
script: sudo apt-get --yes upgrade
env:
DEBIAN_FRONTEND: noninteractive
script: sudo apt-get update
- task: Bash@3
displayName: 'sudo apt-get fix'
inputs:
targetType: inline
script: sudo apt --yes --fix-broken install
env:
DEBIAN_FRONTEND: noninteractive
- ${{ if gt(length(parameters.aptPackages), 0) }}:
- task: Bash@3
displayName: 'sudo apt-get install ...'
inputs:
targetType: inline
script: sudo apt-get --yes install ${{ join(' ', parameters.aptPackages) }}
env:
DEBIAN_FRONTEND: noninteractive
- ${{ if gt(length(parameters.pipModules), 0) }}:
- task: Bash@3
displayName: 'pip install ...'

View File

@@ -11,9 +11,6 @@ parameters:
- staging
- tag-builds
- fixed
- name: extractToMnt
type: boolean
default: false
# required values for fixed selection
- name: fixedPipelineIdentifier
type: string
@@ -26,112 +23,70 @@ parameters:
- name: stagingPipelineIdentifiers
type: object
default:
AMDMIGraphX: $(amdmigraphx-pipeline-id)
amdsmi: $(amdsmi-pipeline-id)
aomp-extras: $(aomp-extras-pipeline-id)
aomp: $(aomp-pipeline-id)
aomp-extras: $(aomp-extras-pipeline-id)
AMDMIGraphX: $(amdmigraphx-pipeline-id)
clr: $(clr-pipeline-id)
composable_kernel: $(composable-kernel-pipeline-id)
half: $(half-pipeline-id)
HIP: $(hip-pipeline-id)
hipBLAS: $(hipblas-pipeline-id)
hipBLASLt: $(hipblaslt-pipeline-id)
hipCUB: $(hipcub-pipeline-id)
hipFFT: $(hipfft-pipeline-id)
hipfort: $(hipfort-pipeline-id)
HIPIFY: $(hipify-pipeline-id)
hipRAND: $(hiprand-pipeline-id)
hipSOLVER: $(hipsolver-pipeline-id)
hipSPARSE: $(hipsparse-pipeline-id)
hipSPARSELt: $(hipsparselt-pipeline-id)
hipTensor: $(hiptensor-pipeline-id)
llvm-project: $(llvm-project-pipeline-id)
MIOpen: $(miopen-pipeline-id)
MIVisionX: $(mivisionx-pipeline-id)
rccl: $(rccl-pipeline-id)
rdc: $(rdc-pipeline-id)
rocAL: $(rocal-pipeline-id)
rocALUTION: $(rocalution-pipeline-id)
rocBLAS: $(rocblas-pipeline-id)
ROCdbgapi : $(rocdbgapi-pipeline-id)
rocDecode: $(rocdecode-pipeline-id)
rocFFT: $(rocfft-pipeline-id)
ROCgdb: $(rocgdb-pipeline-id)
rocm-cmake: $(rocm-cmake-pipeline-id)
rocm-core: $(rocm-core-pipeline-id)
rocm-examples: $(rocm-examples-pipeline-id)
rocm_smi_lib: $(rocm-smi-lib-pipeline-id)
rocminfo: $(rocminfo-pipeline-id)
rocMLIR: $(rocmlir-pipeline-id)
ROCmValidationSuite: $(rocmvalidationsuite-pipeline-id)
rocm_bandwidth_test: $(rocm-bandwidth-test-pipeline-id)
rocm_smi_lib: $(rocm-smi-lib-pipeline-id)
rocPRIM: $(rocprim-pipeline-id)
rocprofiler-register: $(rocprofiler-register-pipeline-id)
rocprofiler: $(rocprofiler-pipeline-id)
ROCR-Runtime: $(rocr-runtime-pipeline-id)
rocRAND: $(rocrand-pipeline-id)
rocr_debug_agent: $(rocr-debug-agent-pipeline-id)
rocSOLVER: $(rocsolver-pipeline-id)
rocSPARSE: $(rocsparse-pipeline-id)
ROCT-Thunk-Interface: $(roct-thunk-interface-pipeline-id)
rocThrust: $(rocthrust-pipeline-id)
roctracer: $(roctracer-pipeline-id)
rocWMMA: $(rocwmma-pipeline-id)
rpp: $(rpp-pipeline-id)
- name: taggedPipelineIdentifiers
type: object
default:
AMDMIGraphX: $(amdmigraphx-tagged-pipeline-id)
amdsmi: $(amdsmi-tagged-pipeline-id)
aomp-extras: $(aomp-extras-tagged-pipeline-id)
aomp: $(aomp-tagged-pipeline-id)
aomp-extras: $(aomp-extras-tagged-pipeline-id)
AMDMIGraphX: $(amdmigraphx-tagged-pipeline-id)
clr: $(clr-tagged-pipeline-id)
composable_kernel: $(composable-kernel-tagged-pipeline-id)
half: $(half-tagged-pipeline-id)
HIP: $(hip-tagged-pipeline-id)
hipBLAS: $(hipblas-tagged-pipeline-id)
hipBLASLt: $(hipblaslt-tagged-pipeline-id)
hipCUB: $(hipcub-tagged-pipeline-id)
hipFFT: $(hipfft-tagged-pipeline-id)
hipfort: $(hipfort-tagged-pipeline-id)
HIPIFY: $(hipify-tagged-pipeline-id)
hipRAND: $(hiprand-tagged-pipeline-id)
hipSOLVER: $(hipsolver-tagged-pipeline-id)
hipSPARSE: $(hipsparse-tagged-pipeline-id)
hipSPARSELt: $(hipsparselt-tagged-pipeline-id)
hipTensor: $(hiptensor-tagged-pipeline-id)
llvm-project: $(llvm-project-tagged-pipeline-id)
MIOpen: $(miopen-tagged-pipeline-id)
MIVisionX: $(mivisionx-tagged-pipeline-id)
rccl: $(rccl-tagged-pipeline-id)
rdc: $(rdc-tagged-pipeline-id)
rocAL: $(rocal-tagged-pipeline-id)
rocALUTION: $(rocalution-tagged-pipeline-id)
rocBLAS: $(rocblas-tagged-pipeline-id)
ROCdbgapi : $(rocdbgapi-tagged-pipeline-id)
rocDecode: $(rocdecode-tagged-pipeline-id)
rocFFT: $(rocfft-tagged-pipeline-id)
ROCgdb: $(rocgdb-tagged-pipeline-id)
rocm-cmake: $(rocm-cmake-tagged-pipeline-id)
rocm-core: $(rocm-core-tagged-pipeline-id)
rocm-examples: $(rocm-examples-tagged-pipeline-id)
rocm_smi_lib: $(rocm-smi-lib-tagged-pipeline-id)
rocminfo: $(rocminfo-tagged-pipeline-id)
rocMLIR: $(rocmlir-tagged-pipeline-id)
ROCmValidationSuite: $(rocmvalidationsuite-tagged-pipeline-id)
rocm_bandwidth_test: $(rocm-bandwidth-test-tagged-pipeline-id)
rocm_smi_lib: $(rocm-smi-lib-tagged-pipeline-id)
rocPRIM: $(rocprim-tagged-pipeline-id)
rocprofiler-register: $(rocprofiler-register-tagged-pipeline-id)
rocprofiler: $(rocprofiler-tagged-pipeline-id)
ROCR-Runtime: $(rocr-runtime-tagged-pipeline-id)
rocRAND: $(rocrand-tagged-pipeline-id)
rocr_debug_agent: $(rocr-debug-agent-tagged-pipeline-id)
rocSOLVER: $(rocsolver-tagged-pipeline-id)
rocSPARSE: $(rocsparse-tagged-pipeline-id)
ROCT-Thunk-Interface: $(roct-thunk-interface-tagged-pipeline-id)
rocThrust: $(rocthrust-tagged-pipeline-id)
roctracer: $(roctracer-tagged-pipeline-id)
rocWMMA: $(rocwmma-tagged-pipeline-id)
rpp: $(rpp-tagged-pipeline-id)
# set to true if you're calling this template file multiple files in same pipeline
# only leave last call false to optimize sequence
@@ -147,45 +102,31 @@ steps:
parameters:
componentName: ${{ dependency }}
pipelineId: ${{ parameters.stagingPipelineIdentifiers[dependency] }}
extractToMnt: ${{ parameters.extractToMnt }}
- ${{ if eq(parameters.dependencySource, 'tag-builds') }}:
- template: artifact-download.yml
parameters:
componentName: ${{ dependency }}
pipelineId: ${{ parameters.taggedPipelineIdentifiers[dependency] }}
extractToMnt: ${{ parameters.extractToMnt }}
# fixed case only accepts one component at a time, so no array input
- ${{ if eq(parameters.dependencySource, 'fixed') }}:
- template: artifact-download.yml
parameters:
componentName: ${{ parameters.fixedComponentName }}
pipelineId: ${{ parameters.fixedPipelineIdentifier }}
extractToMnt: ${{ parameters.extractToMnt }}
- task: Bash@3
displayName: 'list downloaded ROCm files'
inputs:
targetType: inline
${{ if eq(parameters.extractToMnt, true) }}:
script: ls -1R /mnt/rocm
${{ else }}:
script: ls -1R $(Agent.BuildDirectory)/rocm
script: ls -1R $(Agent.BuildDirectory)/rocm
- ${{ if eq(parameters.skipLibraryLinking, false) }}:
- task: Bash@3
displayName: 'link ROCm shared libraries'
inputs:
targetType: inline
# OS ignores if the ROCm lib folder shows up more than once
${{ if eq(parameters.extractToMnt, true) }}:
script: |
echo /mnt/rocm/lib | sudo tee -a /etc/ld.so.conf
echo /mnt/rocm/llvm/lib | sudo tee -a /etc/ld.so.conf
sudo cat /etc/ld.so.conf
sudo ldconfig -v
ldconfig -p
${{ else }}:
script: |
echo $(Agent.BuildDirectory)/rocm/lib | sudo tee -a /etc/ld.so.conf
echo $(Agent.BuildDirectory)/rocm/llvm/lib | sudo tee -a /etc/ld.so.conf
sudo cat /etc/ld.so.conf
sudo ldconfig -v
ldconfig -p
script: |
echo $(Agent.BuildDirectory)/rocm/lib | sudo tee -a /etc/ld.so.conf
echo $(Agent.BuildDirectory)/rocm/llvm/lib | sudo tee -a /etc/ld.so.conf
sudo cat /etc/ld.so.conf
sudo ldconfig -v
ldconfig -p

View File

@@ -21,8 +21,6 @@ variables:
value: rocm-ci_ultra_build_pool
- name: ON_PREM_BUILD_POOL
value: rocm-ci_build_pool
- name: LARGE_DISK_BUILD_POOL
value: rocm-ci_larger_base_disk_pool
- name: LATEST_RELEASE_TAG
value: rocm-6.1.0
- name: DOCKER_IMAGE_NAME

View File

@@ -3,19 +3,20 @@
version: 2
sphinx:
configuration: docs/conf.py
formats: [htmlzip]
python:
install:
- requirements: docs/sphinx/requirements.txt
build:
os: ubuntu-22.04
tools:
python: "3.10"
apt_packages:
- "doxygen"
- "gfortran" # For pre-processing fortran sources
- "graphviz" # For dot graphs in doxygen
python:
install:
- requirements: docs/sphinx/requirements.txt
sphinx:
configuration: docs/conf.py
formats: []
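
For orientation, a sketch of the general shape of a Read the Docs v2 config assembled from the keys that appear in this hunk. The key order and the `formats` value differ between the two sides of the diff, so this is one plausible reading rather than the exact post-change file.

```yaml
# Sketch only: Read the Docs v2 config built from the keys shown above.
version: 2

build:
  os: ubuntu-22.04
  tools:
    python: "3.10"
  apt_packages:
    - "doxygen"
    - "gfortran"   # for pre-processing Fortran sources
    - "graphviz"   # for dot graphs in doxygen

python:
  install:
    - requirements: docs/sphinx/requirements.txt

sphinx:
  configuration: docs/conf.py

formats: []        # the hunk shows both "[htmlzip]" and "[]"; only one applies after the change
```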

View File

@@ -2,7 +2,6 @@ AAC
ABI
ACE
ACEs
ACS
AccVGPR
AccVGPRs
ALU
@@ -13,7 +12,6 @@ AMDMIGraphX
AMI
AOCC
AOMP
APBDIS
APIC
APIs
APU
@@ -26,13 +24,11 @@ ATI
AddressSanitizer
AlexNet
Arb
BARs
BLAS
BMC
BitCode
Blit
Bluefield
Bootloader
CCD
CDNA
CIFAR
@@ -47,7 +43,6 @@ CPF
CPP
CPU
CPUs
Cron
CSC
CSE
CSV
@@ -67,10 +62,7 @@ CommonMark
Concretized
Conda
ConnectX
DDR
DF
DGEMM
DIMM
DKMS
DL
DMA
@@ -99,9 +91,7 @@ FFmpeg
FHS
FMA
FP
FX
Filesystem
FindDb
Flang
Fortran
Fuyu
@@ -134,7 +124,6 @@ GitHub
Gitpod
HBM
HCA
HGX
HIPCC
HIPExtension
HIPIFY
@@ -144,14 +133,12 @@ HPE
HPL
HSA
HWE
HWS
Haswell
Higgs
Hyperparameters
ICV
IDE
IDEs
IFWI
IMDb
IOMMU
IOP
@@ -161,7 +148,6 @@ IRQ
ISA
ISV
ISVs
ITL
ImageNet
InfiniBand
Inlines
@@ -173,7 +159,6 @@ JSON
Jupyter
KFD
KiB
KV
KVM
Keras
Khronos
@@ -208,7 +193,6 @@ MVFFR
Makefile
Makefiles
Matplotlib
Megatrends
Megatron
Mellanox
Mellanox's
@@ -224,7 +208,6 @@ NIC
NICs
NLI
NLP
NPKit
NPS
NSP
NUMA
@@ -254,22 +237,18 @@ OpenCV
OpenFabrics
OpenGL
OpenMP
OpenMPI
OpenSSL
OpenVX
PCC
PCI
PCIe
PEFT
PIL
PILImage
POR
PRNG
PRs
PaLM
Pageable
PeerDirect
PerfDb
Perfetto
PipelineParallel
PnP
@@ -308,7 +287,6 @@ SBIOS
SCA
SDK
SDMA
SDPA
SDRAM
SENDMSG
SGPR
@@ -330,12 +308,10 @@ SRAMECC
SVD
SWE
SerDes
ShareGPT
Shlens
Skylake
Softmax
Spack
SplitK
Supermicro
Szegedy
TCA
@@ -346,12 +322,8 @@ TCP
TCR
TF
TFLOPS
TP
TPU
TPUs
TSME
Tagram
TensileLite
TensorBoard
TensorFlow
TensorParallel
@@ -372,7 +344,6 @@ USM
UTCL
UTIL
Uncached
Unittests
Unhandled
VALU
VBIOS
@@ -461,7 +432,6 @@ cuLIB
cuRAND
cuSOLVER
cuSPARSE
cTDP
dataset
datasets
dataspace
@@ -496,7 +466,6 @@ executables
ffmpeg
filesystem
fortran
fp
galb
gcc
gdb
@@ -510,7 +479,6 @@ gzip
heterogenous
hipBLAS
hipBLASLt
hipBLASLt's
hipCUB
hipFFT
hipLIB
@@ -527,8 +495,6 @@ hipfort
hipify
hipsolver
hipsparse
hotspotting
hpc
hpp
hsa
hsakmt
@@ -536,7 +502,6 @@ hyperparameter
ib_core
inband
incrementing
inductor
inferencing
inflight
init
@@ -594,8 +559,6 @@ prebuilt
precompiled
prefetch
prefetchable
prefill
prefills
preprocess
preprocessed
preprocessing
@@ -668,7 +631,6 @@ subexpression
subfolder
subfolders
supercomputing
td
tensorfloat
th
tokenization
@@ -720,8 +682,7 @@ writebacks
wrreq
wzo
xargs
xGMI
xz
yaml
ysvmadyb
zypper
zypper

CHANGELOG.md (new file, 9101 lines): file diff suppressed because it is too large.

View File

@@ -1,6 +1,6 @@
MIT License
Copyright (c) 2023 - 2024 Advanced Micro Devices, Inc. All rights reserved.
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal

View File

@@ -76,8 +76,8 @@ The Build time will reduce significantly if we limit the GPU Architecture/s agai
mkdir -p ~/WORKSPACE/ # Or any folder name other than WORKSPACE
cd ~/WORKSPACE/
export ROCM_VERSION=6.1.0 # or 6.1.1 6.1.2
~/bin/repo init -u http://github.com/ROCm/ROCm.git -b roc-6.1.x -m tools/rocm-build/rocm-${ROCM_VERSION}.xml
export ROCM_VERSION=6.1.0 # or 6.1.1
~/bin/repo init -u http://github.com/ROCm/ROCm.git -b roc-6.1.x -m rocm-build/rocm-${ROCM_VERSION}.xml
~/bin/repo sync
# --------------------------------------
@@ -86,9 +86,9 @@ export ROCM_VERSION=6.1.0 # or 6.1.1 6.1.2
# Option 1: Start a docker container
# Pulling required base docker images:
# Ubuntu20.04 built from ROCm/tools/rocm-build/docker/ubuntu20/Dockerfile
# Ubuntu20.04 built from ROCm/rocm-build/docker/ubuntu20/Dockerfile
docker pull rocm/rocm-build-ubuntu-20.04:6.1
# Ubuntu22.04 built from ROCm/tools/rocm-build/docker/ubuntu22/Dockerfile
# Ubuntu22.04 built from ROCm/rocm-build/docker/ubuntu22/Dockerfile
docker pull rocm/rocm-build-ubuntu-22.04:6.1
# Start docker container and mount the source code folder:
@@ -107,10 +107,10 @@ docker run -ti \
# Option 2: Install required packages into the host machine
# For ubuntu20.04 system
cd ROCm/tools/rocm-build/docker/ubuntu20
cd ROCm/rocm-build/docker/ubuntu20
bash install-prerequisites.sh
# For ubuntu22.04 system
cd ROCm/tools/rocm-build/docker/ubuntu22
cd ROCm/rocm-build/docker/ubuntu22
bash install-prerequisities.sh
# --------------------------------------
@@ -126,13 +126,13 @@ export GPU_ARCHS="gfx940;gfx941;gfx942" # Example
# Pick and run build commands in the docker container:
# Build rocm-dev packages
make -f ROCm/tools/rocm-build/ROCm.mk -j ${NPROC:-$(nproc)} rocm-dev
make -f ROCm/rocm-build/ROCm.mk -j ${NPROC:-$(nproc)} rocm-dev
# Build all ROCm packages
make -f ROCm/tools/rocm-build/ROCm.mk -j ${NPROC:-$(nproc)} all
make -f ROCm/rocm-build/ROCm.mk -j ${NPROC:-$(nproc)} all
# list all ROCm components to find required components
make -f ROCm/tools/rocm-build/ROCm.mk list_components
make -f ROCm/rocm-build/ROCm.mk list_components
# Build a single ROCm packages
make -f ROCm/tools/rocm-build/ROCm.mk T_rocblas
make -f ROCm/rocm-build/ROCm.mk T_rocblas
# Find built packages in ubuntu20.04:
out/ubuntu-20.04/20.04/deb/
@@ -151,7 +151,7 @@ out/ubuntu-22.04/22.04/logs/rocblas.inprogress # Example
out/ubuntu-22.04/22.04/logs/rocblas # Example
```
Note: [Overview for ROCm.mk](tools/rocm-build/README.md)
Note: [Overview for ROCm.mk](rocm-build/README.md)
## ROCm documentation

View File

@@ -1,4 +1,4 @@
# ROCm 6.1.5 release notes
# ROCm 6.1.2 release notes
<!-- Do not edit this file! This file is autogenerated with -->
<!-- tools/autotag/tag_script.py -->
@@ -11,369 +11,136 @@
<!-- spellcheck-disable -->
The release notes provide a summary of notable changes since the previous ROCm release.
ROCm 6.1.2 includes enhancements to SMI tools and improvements to some libraries.
- [Release highlights](#release-highlights)
### OS support
- [Operating system support](#operating-system-support)
ROCm 6.1.2 has been tested against a pre-release version of Ubuntu 22.04.5 (kernel: 5.15 [GA], 6.8 [HWE]).
- [ROCm components versioning](#rocm-components)
### AMD SMI
- [ROCm known issues](#rocm-known-issues)
AMD SMI for ROCm 6.1.2
- [ROCm upcoming changes](#rocm-upcoming-changes)
#### Additions
* Added process isolation and clean shader APIs and CLI commands.
* `amdsmi_get_gpu_process_isolation()`
* `amdsmi_set_gpu_process_isolation()`
* `amdsmi_set_gpu_clear_sram_data()`
* Added the `MIN_POWER` metric to output provided by `amd-smi static --limit`.
#### Optimizations
* Updated the `amd-smi monitor --pcie` output to prevent delays with the `monitor` command.
#### Changes
* Updated `amismi_get_power_cap_info` to return values in uW instead of W.
* Updated Python library return types for `amdsmi_get_gpu_memory_reserved_pages` and `amdsmi_get_gpu_bad_page_info`.
* Updated the output of `amd-smi metric --ecc-blocks` to show counters available from blocks.
#### Fixes
* `amdsmi_get_gpu_board_info()` no longer returns junk character strings.
* `amd-smi metric --power` now correctly details power output for RDNA3, RDNA2, and MI1x devices.
* Fixed the `amdsmitstReadWrite.TestPowerCapReadWrite` test for RDNA3, RDNA2, and MI100 devices.
* Fixed an issue with the `amdsmi_get_gpu_memory_reserved_pages` and `amdsmi_get_gpu_bad_page_info` Python interface calls.
#### Removals
* Removed the `amdsmi_get_gpu_process_info` API from the Python library. It was removed from the C library in an earlier release.
```{note}
If youre using Radeon™ PRO or Radeon GPUs in a workstation setting with a display connected, see the [Use ROCm on Radeon GPUs](https://rocm.docs.amd.com/projects/radeon/en/docs-6.1.3/docs/compatibility/native_linux/native_linux_compatibility.html)
documentation to verify compatibility and system requirements.
See the AMD SMI [detailed changelog](https://github.com/ROCm/amdsmi/blob/rocm-6.1.x/CHANGELOG.md) with code samples for more information.
```
## Release highlights
### ROCm SMI
The following is the notable improvement in ROCm 6.1.5.
ROCm SMI for ROCm 6.1.2
### Fixed compatibility issue with third-party profiling tools using ROCprofiler-SDK backend
#### Additions
[rocprofiler-register](https://github.com/ROCm/rocprofiler-register) library has resolved the profiling tools compatibility issue where applications potentially failed with the error message `rocprofiler_configure not found. Tried to dlopen`. This prevents a failure when profiling ROCm 6.1 applications using third-party profiling tools upgraded to use [ROCprofiler-SDK](https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/) in ROCm 6.2.0 or later.
* Added the ring hang event to the `amdsmi_evt_notification_type_t` enum.
## Operating system support
#### Fixes
ROCm 6.1.5 no longer supports CentOS 7.9. All other operating system support from ROCm 6.1.2 remains unchanged in this release.
* Fixed an issue causing ROCm SMI to incorrectly report GPU utilization for RDNA3 GPUs. See the issue on [GitHub](https://github.com/ROCm/ROCm/issues/3112).
* Fixed the parsing of `pp_od_clk_voltage` in `get_od_clk_volt_info` to work better with MI-series hardware.
ROCm 6.1.5 requires the [native package manager](https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.1.5/how-to/native-install/index.html).
## Library changes in ROCm 6.1.2
See the [Compatibility
matrix](../../docs/compatibility/compatibility-matrix.rst)
for more information about operating system compatibility.
| Library | Version |
|---------|---------|
| AMDMIGraphX | [2.9](https://github.com/ROCm/AMDMIGraphX/releases/tag/rocm-6.1.2) |
| composable_kernel | [0.2.0](https://github.com/ROCm/composable_kernel/releases/tag/rocm-6.1.2) |
| hipBLAS | [2.1.0](https://github.com/ROCm/hipBLAS/releases/tag/rocm-6.1.2) |
| hipBLASLt | [0.7.0](https://github.com/ROCm/hipBLASLt/releases/tag/rocm-6.1.2) |
| hipCUB | [3.1.0](https://github.com/ROCm/hipCUB/releases/tag/rocm-6.1.2) |
| hipFFT | [1.0.14](https://github.com/ROCm/hipFFT/releases/tag/rocm-6.1.2) |
| hipRAND | [2.10.17](https://github.com/ROCm/hipRAND/releases/tag/rocm-6.1.2) |
| hipSOLVER | [2.1.1](https://github.com/ROCm/hipSOLVER/releases/tag/rocm-6.1.2) |
| hipSPARSE | [3.0.1](https://github.com/ROCm/hipSPARSE/releases/tag/rocm-6.1.2) |
| hipSPARSELt | [0.2.0](https://github.com/ROCm/hipSPARSELt/releases/tag/rocm-6.1.2) |
| hipTensor | [1.2.0](https://github.com/ROCm/hipTensor/releases/tag/rocm-6.1.2) |
| MIOpen | [3.1.0](https://github.com/ROCm/MIOpen/releases/tag/rocm-6.1.2) |
| MIVisionX | [2.5.0](https://github.com/ROCm/MIVisionX/releases/tag/rocm-6.1.2) |
| rccl | [2.18.6](https://github.com/ROCm/rccl/releases/tag/rocm-6.1.2) |
| rocALUTION | [3.1.1](https://github.com/ROCm/rocALUTION/releases/tag/rocm-6.1.2) |
| rocBLAS | 4.1.0 ⇒ [4.1.2](https://github.com/ROCm/rocBLAS/releases/tag/rocm-6.1.2) |
| rocDecode | 0.5.0 ⇒ [0.6.0](https://github.com/ROCm/rocDecode/releases/tag/rocm-6.1.2) |
| rocFFT | [1.0.27](https://github.com/ROCm/rocFFT/releases/tag/rocm-6.1.2) |
| rocm-cmake | [0.12.0](https://github.com/ROCm/rocm-cmake/releases/tag/rocm-6.1.2) |
| rocPRIM | [3.1.0](https://github.com/ROCm/rocPRIM/releases/tag/rocm-6.1.2) |
| rocRAND | [3.0.1](https://github.com/ROCm/rocRAND/releases/tag/rocm-6.1.2) |
| rocSOLVER | [3.25.0](https://github.com/ROCm/rocSOLVER/releases/tag/rocm-6.1.2) |
| rocSPARSE | [3.1.2](https://github.com/ROCm/rocSPARSE/releases/tag/rocm-6.1.2) |
| rocThrust | [3.0.1](https://github.com/ROCm/rocThrust/releases/tag/rocm-6.1.2) |
| rocWMMA | [1.4.0](https://github.com/ROCm/rocWMMA/releases/tag/rocm-6.1.2) |
| rpp | [1.5.0](https://github.com/ROCm/rpp/releases/tag/rocm-6.1.2) |
| Tensile | [4.40.0](https://github.com/ROCm/Tensile/releases/tag/rocm-6.1.2) |
## ROCm components
### RCCL
The following table lists the versions of ROCm components for ROCm 6.1.5.
Click {fab}`github` to go to the component's source code on GitHub.
RCCL 2.18.6 for ROCm 6.1.2
<div class="pst-scrollable-table-container">
<table id="rocm-rn-components" class="table">
<thead>
<tr>
<th>Category</th>
<th>Group</th>
<th>Name</th>
<th>Version</th>
<th></th>
</tr>
</thead>
<colgroup>
<col span="1">
<col span="1">
</colgroup>
<tbody class="rocm-components-libs rocm-components-ml">
<tr>
<th rowspan="6">Libraries</th>
<th rowspan="6">Machine learning and computer vision</th>
<td><a href="https://rocm.docs.amd.com/projects/composable_kernel/en/docs-6.1.5">Composable Kernel</a>
</td>
<td>1.1.0</td>
<td><a href="https://github.com/ROCm/composable_kernel/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/AMDMIGraphX/en/docs-6.1.5">MIGraphX</a></td>
<td>2.9</td>
<td><a href="https://github.com/ROCm/AMDMIGraphX/releases/tag/rocm-6.1.5"><i class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/MIOpen/en/docs-6.1.5">MIOpen</a></td>
<td>3.1.0</td>
<td><a href="https://github.com/ROCm/MIOpen/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/MIVisionX/en/docs-6.1.5">MIVisionX</a></td>
<td>2.5.0</td>
<td><a href="https://github.com/ROCm/MIVisionX/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocDecode/en/docs-6.1.5">rocDecode</a></td>
<td>0.6.0</td>
<td><a href="https://github.com/ROCm/rocDecode/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rpp/en/docs-6.1.5">RPP</a></td>
<td>1.5.0</td>
<td><a href="https://github.com/ROCm/rpp/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
</tbody>
<tbody class="rocm-components-libs rocm-components-communication">
<tr>
<th rowspan="1"></th>
<th rowspan="1">Communication</th>
<td><a href="https://rocm.docs.amd.com/projects/rccl/en/docs-6.1.5">RCCL</a></td>
<td>2.18.6</td>
<td><a href="https://github.com/ROCm/rccl/releases/tag/rocm-6.1.5"><i class="fab fa-github fa-lg"></i></a></td>
</tr>
</tbody>
<tbody class="rocm-components-libs rocm-components-math tbody-reverse-zebra">
<tr>
<th rowspan="16"></th>
<th rowspan="16">Math</th>
<td><a href="https://rocm.docs.amd.com/projects/hipBLAS/en/docs-6.1.5">hipBLAS</a></td>
<td>2.1.0</td>
<td><a href="https://github.com/ROCm/hipBLAS/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/hipBLASLt/en/docs-6.1.5">hipBLASLt</a></td>
<td>0.7.0</td>
<td><a href="https://github.com/ROCm/hipBLASLt/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/hipFFT/en/docs-6.1.5">hipFFT</a></td>
<td>1.0.14</td>
<td><a href="https://github.com/ROCm/hipFFT/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/hipfort/en/docs-6.1.5">hipfort</a></td>
<td>0.4.0</td>
<td><a href="https://github.com/ROCm/hipfort/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/hipRAND/en/docs-6.1.5">hipRAND</a></td>
<td>2.10.16</td>
<td><a href="https://github.com/ROCm/hipRAND/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/hipSOLVER/en/docs-6.1.5">hipSOLVER</a></td>
<td>2.1.1</td>
<td><a href="https://github.com/ROCm/hipSOLVER/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/hipSPARSE/en/docs-6.1.5">hipSPARSE</a></td>
<td>3.0.1</td>
<td><a href="https://github.com/ROCm/hipSPARSE/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/hipSPARSELt/en/docs-6.1.5">hipSPARSELt</a></td>
<td>0.2.0</td>
<td><a href="https://github.com/ROCm/hipSPARSELt/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocALUTION/en/docs-6.1.5">rocALUTION</a></td>
<td>3.1.1</td>
<td><a href="https://github.com/ROCm/rocALUTION/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocBLAS/en/docs-6.1.5">rocBLAS</a></td>
<td>4.1.2</td>
<td><a href="https://github.com/ROCm/rocBLAS/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocFFT/en/docs-6.1.5">rocFFT</a></td>
<td>1.0.27</td>
<td><a href="https://github.com/ROCm/rocFFT/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocRAND/en/docs-6.1.5">rocRAND</a></td>
<td>3.0.1</td>
<td><a href="https://github.com/ROCm/rocRAND/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocSOLVER/en/docs-6.1.5">rocSOLVER</a></td>
<td>3.25.0</td>
<td><a href="https://github.com/ROCm/rocSOLVER/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocSPARSE/en/docs-6.1.5">rocSPARSE</a></td>
<td>3.1.2</td>
<td><a href="https://github.com/ROCm/rocSPARSE/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocWMMA/en/docs-6.1.5">rocWMMA</a></td>
<td>1.4.0</td>
<td><a href="https://github.com/ROCm/rocWMMA/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://github.com/ROCm/tensile/">Tensile</a></td>
<td>4.40.0</td>
<td><a href="https://github.com/ROCm/tensile/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
</tbody>
<tbody class="rocm-components-libs rocm-components-primitives tbody-reverse-zebra">
<tr>
<th rowspan="4"></th>
<th rowspan="4">Primitives</th>
<td><a href="https://rocm.docs.amd.com/projects/hipCUB/en/docs-6.1.5">hipCUB</a></td>
<td>3.1.0</td>
<td><a href="https://github.com/ROCm/hipCUB/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/hipTensor/en/docs-6.1.5">hipTensor</a></td>
<td>1.2.0</td>
<td><a href="https://github.com/ROCm/hipTensor/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocPRIM/en/docs-6.1.5">rocPRIM</a></td>
<td>3.1.0</td>
<td><a href="https://github.com/ROCm/rocPRIM/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocThrust/en/docs-6.1.5">rocThrust</a></td>
<td>3.0.1</td>
<td><a href="https://github.com/ROCm/rocThrust/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
</tbody>
<tbody class="rocm-components-tools rocm-components-system tbody-reverse-zebra">
<tr>
<th rowspan="5">Tools</th>
<th rowspan="5">System management</th>
<td><a href="https://rocm.docs.amd.com/projects/amdsmi/en/docs-6.1.5">AMD SMI</a></td>
<td>24.5.1</td>
<td><a href="https://github.com/ROCm/amdsmi/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocminfo/en/docs-6.1.5">rocminfo</a></td>
<td>1.0.0</td>
<td><a href="https://github.com/ROCm/rocminfo/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rdc/en/docs-6.1.5">ROCm Data Center Tool</a></td>
<td>0.3.0</td>
<td><a href="https://github.com/ROCm/rdc/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocm_smi_lib/en/docs-6.1.5">ROCm SMI</a></td>
<td>7.2.0</td>
<td><a href="https://github.com/ROCm/rocm_smi_lib/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/ROCmValidationSuite/en/docs-6.1.5">ROCm Validation Suite</a></td>
<td>1.0.0</td>
<td><a href="https://github.com/ROCm/ROCmValidationSuite/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
</tbody>
<tbody class="rocm-components-tools rocm-components-perf">
<tr>
<th rowspan="3"></th>
<th rowspan="3">Performance</th>
<td><a href="https://rocm.docs.amd.com/projects/rocm_bandwidth_test/en/docs-6.1.5">ROCm Bandwidth
Test</a></td>
<td>1.4.0</td>
<td><a href="https://github.com/ROCm/rocm_bandwidth_test/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocprofiler/en/docs-6.1.5/">ROCProfiler</a></td>
<td>2.0.0</td>
<td><a href="https://github.com/ROCm/ROCProfiler/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/roctracer/en/docs-6.1.5/">ROCTracer</a></td>
<td>4.1.0</td>
<td><a href="https://github.com/ROCm/ROCTracer/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
</tbody>
<tbody class="rocm-components-tools rocm-components-dev tbody-reverse-zebra">
<tr>
<th rowspan="5"></th>
<th rowspan="5">Development</th>
<td><a href="https://rocm.docs.amd.com/projects/HIPIFY/en/docs-6.1.5/">HIPIFY</a></td>
<td>17.0.0</td>
<td><a href="https://github.com/ROCm/HIPIFY/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/ROCdbgapi/en/docs-6.1.5">ROCdbgapi</a></td>
<td>0.71.0</td>
<td><a href="https://github.com/ROCm/ROCdbgapi/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/ROCmCMakeBuildTools/en/docs-6.1.5/">ROCm CMake</a></td>
<td>0.12.0</td>
<td><a href="https://github.com/ROCm/rocm-cmake/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/ROCgdb/en/docs-6.1.5">ROCm Debugger (ROCgdb)</a>
</td>
<td>14.1</td>
<td><a href="https://github.com/ROCm/ROCgdb/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocr_debug_agent/en/docs-6.1.5">ROCr Debug Agent</a>
</td>
<td>2.0.3</td>
<td><a href="https://github.com/ROCm/rocr_debug_agent/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
</tbody>
<tbody class="rocm-components-compilers">
<tr>
<th rowspan="2" colspan="2">Compilers</th>
<td><a href="https://rocm.docs.amd.com/projects/HIPCC/en/docs-6.1.5">HIPCC</a></td>
<td>1.0.0</td>
<td><a href="https://github.com/ROCm/llvm-project/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://github.com/ROCm/llvm-project/">llvm-project</a></td>
<td>17.0.0</td>
<td><a href="https://github.com/ROCm/llvm-project/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
</tbody>
<tbody class="rocm-components-runtimes">
<tr>
<th rowspan="2" colspan="2">Runtimes</th>
<td><a href="https://rocm.docs.amd.com/projects/HIP/en/docs-6.1.5">HIP</a></td>
<td>6.1</td>
<td><a href="https://github.com/ROCm/HIP/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/ROCR-Runtime/en/docs-6.1.5">ROCr Runtime</a></td>
<td>1.13.0</td>
<td><a href="https://github.com/ROCm/ROCR-Runtime"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
</tbody>
</table>
</div>
## ROCm known issues
ROCm known issues are tracked on [GitHub](https://github.com/ROCm/ROCm/labels/Verified%20Issue).
## ROCm upcoming changes
#### Changes
* Reduced `NCCL_TOPO_MAX_NODES` to limit stack usage and avoid stack overflow.
### rocBLAS
rocBLAS 4.1.2 for ROCm 6.1.2
#### Optimizations
* Tuned BBS TN and TT operations on the CDNA3 architecture.
#### Fixes
* Fixed an issue related to obtaining solutions for BF16 TT operations.
### rocDecode
rocDecode 0.6.0 for ROCm 6.1.2
#### Additions
* Added support for FFmpeg v5.x.
#### Optimizations
* Updated error checking in the `rocDecode-setup.py` script.
#### Changes
* Updated core dependencies.
* Updated to support the use of public LibVA headers.
#### Fixes
* Fixed some package dependencies.
## Upcoming changes
* A future release will enable the use of the HIPCC compiled binaries `hipcc.bin` and `hipconfig.bin` by default. No action is needed by users; you may continue calling the high-level Perl scripts `hipcc` and `hipconfig`, which will invoke `hipcc.bin` and `hipconfig.bin`. To revert to the previous behavior and invoke `hipcc.pl` and `hipconfig.pl`, set the `HIP_USE_PERL_SCRIPTS` environment variable to `1`.
* A subsequent release will remove the high-level HIPCC Perl scripts `hipcc` and `hipconfig`. That release will also remove the `HIP_USE_PERL_SCRIPTS` environment variable and rename `hipcc.bin` and `hipconfig.bin` to `hipcc` and `hipconfig`, respectively. No action is needed by users. To revert to the previous behavior, invoke `hipcc.pl` and `hipconfig.pl` explicitly.

View File

@@ -77,7 +77,8 @@ Obtain the value of `gpu-arch` by running the following command:
[//]: # (dated link below, needs updating)
See the complete list of [compiler command-line references](https://github.com/ROCm/llvm-project/blob/amd-staging/openmp/docs/CommandLineArgumentReference.rst).
See the complete list of compiler command-line references
[here](https://github.com/ROCm/llvm-project/blob/amd-stg-open/clang/docs/CommandGuide/clang.rst).
### Using `rocprof` with OpenMP

View File

@@ -17,7 +17,7 @@ following section.
## ROCm component licenses
ROCm is released by Advanced Micro Devices, Inc. (AMD) and is licensed per component separately.
ROCm is released by Advanced Micro Devices, Inc. and is licensed per component separately.
The following table is a list of ROCm components with links to their respective license
terms. These components may include third party components subject to
additional licenses. Please review individual repositories for more information.
@@ -25,71 +25,66 @@ additional licenses. Please review individual repositories for more information.
<!-- spellcheck-disable -->
| Component | License |
|:---------------------|:-------------------------|
| [HIP](https://github.com/ROCm/HIP/) | [MIT](https://github.com/ROCm/HIP/blob/develop/LICENSE.txt) |
| [HIPCC](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/hipcc) | [MIT](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/hipcc/LICENSE.txt) |
| [HIPIFY](https://github.com/ROCm/HIPIFY/) | [MIT](https://github.com/ROCm/HIPIFY/blob/amd-staging/LICENSE.txt) |
| [AMDMIGraphX](https://github.com/ROCm/AMDMIGraphX/) | [MIT](https://github.com/ROCm/AMDMIGraphX/blob/develop/LICENSE) |
| [MIOpen](https://github.com/ROCm/MIOpen/) | [MIT](https://github.com/ROCm/MIOpen/blob/develop/LICENSE.txt) |
| [MIVisionX](https://github.com/ROCm/MIVisionX/) | [MIT](https://github.com/ROCm/MIVisionX/blob/develop/LICENSE.txt) |
| [AMD Common Language Runtime (CLR)](https://github.com/ROCm/clr) | [MIT](https://github.com/ROCm/clr/blob/develop/LICENCE) |
| [ROCm-Core](https://github.com/ROCm/rocm-core) | [MIT](https://github.com/ROCm/rocm-core/blob/master/copyright) |
| [hipamd](https://github.com/ROCm/clr/tree/develop/hipamd) | [MIT](https://github.com/ROCm/clr/blob/develop/hipamd/LICENSE.txt) |
| [ROCm-OpenCL-Runtime](https://github.com/ROCm/clr/tree/develop/opencl) | [MIT](https://github.com/ROCm/clr/blob/develop/opencl/LICENSE.txt) |
| [Tensile](https://github.com/ROCm/Tensile/) | [MIT](https://github.com/ROCm/Tensile/blob/develop/LICENSE.md) |
| [aomp](https://github.com/ROCm/aomp/) | [Apache 2.0](https://github.com/ROCm/aomp/blob/aomp-dev/LICENSE) |
| [aomp-extras](https://github.com/ROCm/aomp-extras/) | [MIT](https://github.com/ROCm/aomp-extras/blob/aomp-dev/LICENSE) |
| [llvm-project](https://github.com/ROCm/llvm-project/) | [Apache](https://github.com/ROCm/llvm-project/blob/amd-staging/LICENSE.TXT) |
| [llvm-project/flang](https://github.com/ROCm/llvm-project/tree/amd-staging/flang) | [Apache 2.0](https://github.com/ROCm/llvm-project/blob/amd-staging/flang/LICENSE.TXT) |
| [Code Object Manager (Comgr)](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/comgr) | [The University of Illinois/NCSA](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/comgr/LICENSE.txt) |
| [ROCm-Device-Libs](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/device-libs) | [The University of Illinois/NCSA](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/device-libs/LICENSE.TXT) |
| [clang-ocl](https://github.com/ROCm/clang-ocl/) | [MIT](https://github.com/ROCm/clang-ocl/blob/master/LICENSE) |
| [HIPCC](https://github.com/ROCm/HIPCC/blob/develop/LICENSE.txt) | [MIT](https://github.com/ROCm/HIPCC/blob/develop/LICENSE.txt) |
| [HIPIFY](https://github.com/ROCm/HIPIFY/) | [MIT](https://github.com/ROCm/HIPIFY/blob/amd-staging/LICENSE.txt) |
| [HIP](https://github.com/ROCm/HIP/) | [MIT](https://github.com/ROCm/HIP/blob/develop/LICENSE.txt) |
| [MIOpenGEMM](https://github.com/ROCm/MIOpenGEMM/) | [MIT](https://github.com/ROCm/MIOpenGEMM/blob/master/LICENSE.txt) |
| [MIOpen](https://github.com/ROCm/MIOpen/) | [MIT](https://github.com/ROCm/MIOpen/blob/master/LICENSE.txt) |
| [MIVisionX](https://github.com/ROCm/MIVisionX/) | [MIT](https://github.com/ROCm/MIVisionX/blob/master/LICENSE.txt) |
| [RCP](https://github.com/GPUOpen-Tools/radeon_compute_profiler/) | [MIT](https://github.com/GPUOpen-Tools/radeon_compute_profiler/blob/master/LICENSE) |
| [ROCK-Kernel-Driver](https://github.com/ROCm/ROCK-Kernel-Driver/) | [GPL 2.0 WITH Linux-syscall-note](https://github.com/ROCm/ROCK-Kernel-Driver/blob/master/COPYING) |
| [ROCT-Thunk-Interface](https://github.com/ROCm/ROCT-Thunk-Interface/) | [MIT](https://github.com/ROCm/ROCT-Thunk-Interface/blob/master/LICENSE.md) |
| [ROCR-Runtime](https://github.com/ROCm/ROCR-Runtime/) | [The University of Illinois/NCSA](https://github.com/ROCm/ROCR-Runtime/blob/master/LICENSE.txt) |
| [ROCR Debug Agent](https://github.com/ROCm/rocr_debug_agent/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocr_debug_agent/blob/amd-staging/LICENSE.txt) |
| [Composable Kernel](https://github.com/ROCm/composable_kernel) | [MIT](https://github.com/ROCm/composable_kernel/blob/develop/LICENSE) |
| [half](https://github.com/ROCm/half/) | [MIT](https://github.com/ROCm/half/blob/rocm/LICENSE.txt) |
| [ROCT-Thunk-Interface](https://github.com/ROCm/ROCT-Thunk-Interface/) | [MIT](https://github.com/ROCm/ROCT-Thunk-Interface/blob/master/LICENSE.md) |
| [ROCclr](https://github.com/ROCm/ROCclr/) | [MIT](https://github.com/ROCm/ROCclr/blob/develop/LICENSE.txt) |
| [ROCdbgapi](https://github.com/ROCm/ROCdbgapi/) | [MIT](https://github.com/ROCm/ROCdbgapi/blob/amd-master/LICENSE.txt) |
| [ROCgdb](https://github.com/ROCm/ROCgdb/) | [GNU General Public License v2.0](https://github.com/ROCm/ROCgdb/blob/amd-master/COPYING) |
| [ROCm-CompilerSupport](https://github.com/ROCm/ROCm-CompilerSupport/) | [The University of Illinois/NCSA](https://github.com/ROCm/ROCm-CompilerSupport/blob/amd-stg-open/LICENSE.txt) |
| [ROCm-Device-Libs](https://github.com/ROCm/ROCm-Device-Libs/) | [The University of Illinois/NCSA](https://github.com/ROCm/ROCm-Device-Libs/blob/amd-stg-open/LICENSE.TXT) |
| [ROCm-OpenCL-Runtime/api/opencl/khronos/icd](https://github.com/KhronosGroup/OpenCL-ICD-Loader/) | [Apache 2.0](https://github.com/KhronosGroup/OpenCL-ICD-Loader/blob/main/LICENSE) |
| [ROCm-OpenCL-Runtime](https://github.com/ROCm/ROCm-OpenCL-Runtime/) | [MIT](https://github.com/ROCm/ROCm-OpenCL-Runtime/blob/develop/LICENSE.txt) |
| [ROCmValidationSuite](https://github.com/ROCm/ROCmValidationSuite/) | [MIT](https://github.com/ROCm/ROCmValidationSuite/blob/master/LICENSE) |
| [Tensile](https://github.com/ROCm/Tensile/) | [MIT](https://github.com/ROCm/Tensile/blob/develop/LICENSE.md) |
| [aomp-extras](https://github.com/ROCm/aomp-extras/) | [MIT](https://github.com/ROCm/aomp-extras/blob/aomp-dev/LICENSE) |
| [aomp](https://github.com/ROCm/aomp/) | [Apache 2.0](https://github.com/ROCm/aomp/blob/aomp-dev/LICENSE) |
| [atmi](https://github.com/ROCm/atmi/) | [MIT](https://github.com/ROCm/atmi/blob/master/LICENSE.txt) |
| [clang-ocl](https://github.com/ROCm/clang-ocl/) | [MIT](https://github.com/ROCm/clang-ocl/blob/master/LICENSE) |
| [flang](https://github.com/ROCm/flang/) | [Apache 2.0](https://github.com/ROCm/flang/blob/master/LICENSE.txt) |
| [half](https://github.com/ROCm/half/) | [MIT](https://github.com/ROCm/half/blob/master/LICENSE.txt) |
| [hipBLAS](https://github.com/ROCm/hipBLAS/) | [MIT](https://github.com/ROCm/hipBLAS/blob/develop/LICENSE.md) |
| [hipBLASLt](https://github.com/ROCm/hipBLASLt/) | [MIT](https://github.com/ROCm/hipBLASLt/blob/develop/LICENSE.md) |
| [hipCUB](https://github.com/ROCm/hipCUB/) | [Custom](https://github.com/ROCm/hipCUB/blob/develop/LICENSE.txt) |
| [hipFFT](https://github.com/ROCm/hipFFT/) | [MIT](https://github.com/ROCm/hipFFT/blob/develop/LICENSE.md) |
| [hipFORT](https://github.com/ROCm/hipfort/) | [MIT](https://github.com/ROCm/hipfort/blob/develop/LICENSE) |
| [hipRAND](https://github.com/ROCm/hipRAND/) | [MIT](https://github.com/ROCm/hipRAND/blob/develop/LICENSE.txt) |
| [hipSOLVER](https://github.com/ROCm/hipSOLVER/) | [MIT](https://github.com/ROCm/hipSOLVER/blob/develop/LICENSE.md) |
| [hipSPARSE](https://github.com/ROCm/hipSPARSE/) | [MIT](https://github.com/ROCm/hipSPARSE/blob/develop/LICENSE.md) |
| [hipSPARSELt](https://github.com/ROCm/hipSPARSELt/) | [MIT](https://github.com/ROCm/hipSPARSELt/blob/develop/LICENSE.md) |
| [hipSPARSE](https://github.com/ROCm/hipSPARSE/) | [MIT](https://github.com/ROCm/hipSPARSE/blob/develop/LICENSE.md) |
| [hipTensor](https://github.com/ROCm/hipTensor) | [MIT](https://github.com/ROCm/hipTensor/blob/develop/LICENSE) |
| [rocAL](https://github.com/ROCm/rocAL) | [MIT](https://github.com/ROCm/rocAL/blob/develop/LICENSE.txt) |
| [hipamd](https://github.com/ROCm/hipamd/) | [MIT](https://github.com/ROCm/hipamd/blob/develop/LICENSE.txt) |
| [hipfort](https://github.com/ROCm/hipfort/) | [MIT](https://github.com/ROCm/hipfort/blob/master/LICENSE) |
| [llvm-project](https://github.com/ROCm/llvm-project/) | [Apache](https://github.com/ROCm/llvm-project/blob/main/LICENSE.TXT) |
| [rccl](https://github.com/ROCm/rccl/) | [Custom](https://github.com/ROCm/rccl/blob/develop/LICENSE.txt) |
| [rdc](https://github.com/ROCm/rdc/) | [MIT](https://github.com/ROCm/rdc/blob/master/LICENSE) |
| [rocALUTION](https://github.com/ROCm/rocALUTION/) | [MIT](https://github.com/ROCm/rocALUTION/blob/develop/LICENSE.md) |
| [rocBLAS](https://github.com/ROCm/rocBLAS/) | [MIT](https://github.com/ROCm/rocBLAS/blob/develop/LICENSE.md) |
| [rocDecode](https://github.com/ROCm/rocDecode) | [MIT](https://github.com/ROCm/rocDecode/blob/develop/LICENSE) |
| [rocFFT](https://github.com/ROCm/rocFFT/) | [MIT](https://github.com/ROCm/rocFFT/blob/develop/LICENSE.md) |
| [rocPRIM](https://github.com/ROCm/rocPRIM/) | [MIT](https://github.com/ROCm/rocPRIM/blob/develop/LICENSE.txt) |
| [ROCm Performance Primitives (RPP)](https://github.com/ROCm/rpp) | [MIT](https://github.com/ROCm/rpp/blob/develop/LICENSE) |
| [rocRAND](https://github.com/ROCm/rocRAND/) | [MIT](https://github.com/ROCm/rocRAND/blob/develop/LICENSE.txt) |
| [rocSOLVER](https://github.com/ROCm/rocSOLVER/) | [BSD-2-Clause](https://github.com/ROCm/rocSOLVER/blob/develop/LICENSE.md) |
| [rocSPARSE](https://github.com/ROCm/rocSPARSE/) | [MIT](https://github.com/ROCm/rocSPARSE/blob/develop/LICENSE.md) |
| [rocThrust](https://github.com/ROCm/rocThrust/) | [Apache 2.0](https://github.com/ROCm/rocThrust/blob/develop/LICENSE) |
| [rocWMMA](https://github.com/ROCm/rocWMMA/) | [MIT](https://github.com/ROCm/rocWMMA/blob/develop/LICENSE.md) |
| [ROCm Communication Collectives Library (RCCL)](https://github.com/ROCm/rccl/) | [Custom](https://github.com/ROCm/rccl/blob/develop/LICENSE.txt) |
| [ROCm Data Center (RDC)](https://github.com/ROCm/rdc/) | [MIT](https://github.com/ROCm/rdc/blob/develop/LICENSE) |
| [ROCm CMake](https://github.com/ROCm/rocm-cmake/) | [MIT](https://github.com/ROCm/rocm-cmake/blob/develop/LICENSE) |
| [ROCdbgapi](https://github.com/ROCm/ROCdbgapi/) | [MIT](https://github.com/ROCm/ROCdbgapi/blob/amd-staging/LICENSE.txt) |
| [ROCgdb](https://github.com/ROCm/ROCgdb/) | [GNU General Public License v2.0](https://github.com/ROCm/ROCgdb/blob/amd-master/COPYING) |
| [ROCm SMI Lib](https://github.com/ROCm/rocm_smi_lib/) | [MIT](https://github.com/ROCm/rocm_smi_lib/blob/develop/License.txt) |
| [AMD SMI](https://github.com/ROCm/amdsmi) | [MIT](https://github.com/ROCm/amdsmi/blob/develop/LICENSE) |
| [rocminfo](https://github.com/ROCm/rocminfo/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocminfo/blob/amd-staging/License.txt) |
| [ROCProfiler](https://github.com/ROCm/rocprofiler/) | [MIT](https://github.com/ROCm/rocprofiler/blob/amd-master/LICENSE) |
| [ROCTracer](https://github.com/ROCm/roctracer/) | [MIT](https://github.com/ROCm/roctracer/blob/amd-master/LICENSE) |
| [ROCm Bandwidth Test](https://github.com/ROCm/rocm_bandwidth_test/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocm_bandwidth_test/blob/master/LICENSE.txt) |
| [TransferBench](https://github.com/ROCm/TransferBench) | [MIT](https://github.com/ROCm/TransferBench/blob/develop/LICENSE.md) |
| [ROCmValidationSuite](https://github.com/ROCm/ROCmValidationSuite/) | [MIT](https://github.com/ROCm/ROCmValidationSuite/blob/master/LICENSE) |
| hsa-amd-aqlprofile | [AMD Software EULA](https://www.amd.com/en/legal/eula/amd-software-eula.html) |
| [rocm-cmake](https://github.com/ROCm/rocm-cmake/) | [MIT](https://github.com/ROCm/rocm-cmake/blob/develop/LICENSE) |
| [rocm_bandwidth_test](https://github.com/ROCm/rocm_bandwidth_test/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocm_bandwidth_test/blob/master/LICENSE.txt) |
| [rocm_smi_lib](https://github.com/ROCm/rocm_smi_lib/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocm_smi_lib/blob/master/License.txt) |
| [rocminfo](https://github.com/ROCm/rocminfo/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocminfo/blob/master/License.txt) |
| [rocprofiler](https://github.com/ROCm/rocprofiler/) | [MIT](https://github.com/ROCm/rocprofiler/blob/amd-master/LICENSE) |
| [rocr_debug_agent](https://github.com/ROCm/rocr_debug_agent/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocr_debug_agent/blob/master/LICENSE.txt) |
| [roctracer](https://github.com/ROCm/roctracer/) | [MIT](https://github.com/ROCm/roctracer/blob/amd-master/LICENSE) |
| rocm-llvm-alt | [AMD Proprietary License](https://www.amd.com/en/support/amd-software-eula) |
Open sourced ROCm components are released via public GitHub
repositories, packages on [https://repo.radeon.com](https://repo.radeon.com) and other distribution channels.
Proprietary products are only available on [https://repo.radeon.com](https://repo.radeon.com). Currently, only
one component of ROCm, `rocm-llvm-alt` is governed by a proprietary license.
repositories, packages on https://repo.radeon.com and other distribution channels.
Proprietary products are only available on https://repo.radeon.com. Currently, only
one component of ROCm, rocm-llvm-alt is governed by a proprietary license.
Proprietary components are organized in a proprietary subdirectory in the package
repositories to distinguish from open sourced packages.
@@ -97,7 +92,7 @@ repositories to distinguish from open sourced packages.
The following additional terms and conditions apply to your use of ROCm technical documentation.
```
©2023 - 2024 Advanced Micro Devices, Inc. All rights reserved.
©2023 Advanced Micro Devices, Inc. All rights reserved.
The information presented in this document is for informational purposes only
and may contain technical inaccuracies, omissions, and typographical errors. The
@@ -130,8 +125,8 @@ companies.
:::{attention}
AQL Profiler and AOCC CPU optimization are both provided in binary form, each
subject to the license agreement enclosed in the directory for the binary available
in `/opt/rocm/share/doc/hsa-amd-aqlprofile/EULA`. By using, installing,
subject to the license agreement enclosed in the directory for the binary and is
available here: `/opt/rocm/share/doc/rocm-llvm-alt/EULA`. By using, installing,
copying or distributing AQL Profiler and/or AOCC CPU Optimizations, you agree to
the terms and conditions of this license agreement. If you do not agree to the
terms of this agreement, do not install, copy or use the AQL Profiler and/or the
@@ -139,8 +134,9 @@ AOCC CPU Optimizations.
:::
For the rest of the ROCm packages, you can find the licensing information at the
following location: `/opt/rocm/share/doc/<component-name>/` or in the locations
specified in the preceding table.
following location: `/opt/rocm/share/doc/<component-name>/`
For example, you can fetch the licensing information of the `amd_comgr`
component (Code Object Manager) from the `/opt/rocm/share/doc/amd_comgr/LICENSE.txt` file.
For example, you can fetch the licensing information of the `_amd_comgr_`
component (Code Object Manager) from the `amd_comgr` folder. A file named
`LICENSE.txt` contains the license details at:
`/opt/rocm-5.4.3/share/doc/amd_comgr/LICENSE.txt`

View File

@@ -8,121 +8,119 @@ Compatibility matrix
Use this matrix to view the ROCm compatibility across successive major and minor releases.
.. container:: format-big-table
.. csv-table::
:header: "ROCm Version", "6.1.5", "6.1.2", "6.0.0"
:header: "ROCm Version", "6.1.0", "6.0.0"
:stub-columns: 1
:doc:`Operating Systems <rocm-install-on-linux:reference/system-requirements>`,"Ubuntu 22.04.5 [#Ubuntu220405]_, 22.04.4, 22.04.3","Ubuntu 22.04.5 [#Ubuntu220405]_, 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3"
,"Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5"
,"RHEL 9.4 [#red-hat94]_, 9.3, 9.2","RHEL 9.4 [#red-hat94]_, 9.3, 9.2","RHEL 9.3, 9.2"
,"RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8"
,"SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4"
,,CentOS 7.9,CentOS 7.9
,Oracle Linux 8.9 [#oracle89]_,Oracle Linux 8.9 [#oracle89]_,
,.. _architecture-support-compatibility-matrix:,,
:doc:`GFX Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA3,CDNA3,CDNA3
,CDNA2,CDNA2,CDNA2
,CDNA,CDNA,CDNA
,RDNA3,RDNA3,RDNA3
,RDNA2,RDNA2,RDNA2
,.. _gpu-support-compatibility-matrix:,,
:doc:`GFX Card <rocm-install-on-linux:reference/system-requirements>`,gfx1100,gfx1100,gfx1100
,gfx1030,gfx1030,gfx1030
,gfx942 [#mi300_612]_,gfx942 [#mi300_612]_,gfx942 [#mi300_600]_
,gfx90a,gfx90a,gfx90a
,gfx908,gfx908,gfx908
,,,
ECOSYSTEM SUPPORT,.. _framework-support-compatibility-matrix:,,
:doc:`PyTorch <rocm-install-on-linux:install/3rd-party/pytorch-install>`,"2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13"
:doc:`TensorFlow <rocm-install-on-linux:install/3rd-party/tensorflow-install>`,"2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.14.0, 2.13.1, 2.12.1"
:doc:`JAX <rocm-install-on-linux:install/3rd-party/jax-install>`,0.4.26,0.4.26,0.4.26
`ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.17.3,1.17.3,1.14.1
,,,
THIRD PARTY COMMS,.. _thirdpartycomms-support-compatibility-matrix:,,
`UCC <https://github.com/ROCm/ucc>`_,>=1.3.0,>=1.3.0,>=1.2.0
`UCX <https://github.com/ROCm/ucx>`_,>=1.14.1,>=1.14.1,>=1.14.1
,,,
THIRD PARTY ALGORITHM,.. _thirdpartyalgorithm-support-compatibility-matrix:,,
Thrust,2.1.0,2.1.0,2.0.1
CUB,2.1.0,2.1.0,2.0.1
,,,
ML & COMPUTER VISION,.. _mllibs-support-compatibility-matrix:,,
:doc:`Composable Kernel <composable_kernel:index>`,1.1.0,1.1.0,1.1.0
:doc:`MIGraphX <amdmigraphx:index>`,2.9.0,2.9.0,2.8.0
:doc:`MIOpen <miopen:index>`,3.1.0,3.1.0,3.0.0
:doc:`MIVisionX <mivisionx:index>`,2.5.0,2.5.0,2.5.0
:doc:`rocDecode <rocdecode:index>`,0.6.0,0.6.0,N/A
:doc:`RPP <rpp:index>`,1.5.0,1.5.0,1.4.0
,,,
COMMUNICATION,.. _commlibs-support-compatibility-matrix:,,
:doc:`RCCL <rccl:index>`,2.18.6,2.18.6,2.18.3
,,,
MATH LIBS,.. _mathlibs-support-compatibility-matrix:,,
`half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0
:doc:`hipBLAS <hipblas:index>`,2.1.0,2.1.0,2.0.0
:doc:`hipBLASLt <hipblaslt:index>`,0.7.0,0.7.0,0.6.0
:doc:`hipFFT <hipfft:index>`,1.0.14,1.0.14,1.0.13
:doc:`hipFORT <hipfort:index>`,0.4.0,0.4.0,0.4.0
:doc:`hipRAND <hiprand:index>`,2.10.16,2.10.16,2.10.16
:doc:`hipSOLVER <hipsolver:index>`,2.1.1,2.1.1,2.0.0
:doc:`hipSPARSE <hipsparse:index>`,3.0.1,3.0.1,3.0.0
:doc:`hipSPARSELt <hipsparselt:index>`,0.2.0,0.2.0,0.1.0
:doc:`rocALUTION <rocalution:index>`,3.1.1,3.1.1,3.0.3
:doc:`rocBLAS <rocblas:index>`,4.1.2,4.1.2,4.0.0
:doc:`rocFFT <rocfft:index>`,1.0.27,1.0.27,1.0.23
:doc:`rocRAND <rocrand:index>`,3.0.1,3.0.1,2.10.17
:doc:`rocSOLVER <rocsolver:index>`,3.25.0,3.25.0,3.24.0
:doc:`rocSPARSE <rocsparse:index>`,3.1.2,3.1.2,3.0.2
:doc:`rocWMMA <rocwmma:index>`,1.4.0,1.4.0,1.3.0
`Tensile <https://github.com/ROCm/Tensile>`_,4.40.0,4.40.0,4.39.0
,,,
PRIMITIVES,.. _primitivelibs-support-compatibility-matrix:,,
:doc:`hipCUB <hipcub:index>`,3.1.0,3.1.0,3.0.0
:doc:`hipTensor <hiptensor:index>`,1.2.0,1.2.0,1.1.0
:doc:`rocPRIM <rocprim:index>`,3.1.0,3.1.0,3.0.0
:doc:`rocThrust <rocthrust:index>`,3.0.1,3.0.1,3.0.0
,,,
SUPPORT LIBS,,,
`hipother <https://github.com/ROCm/hipother>`_,6.1.40093,6.1.40093,6.1.32830
:doc:`ROCm CMake <rocmcmakebuildtools:index>`,0.12.0,0.12.0,0.11.0
`rocm-core <https://github.com/ROCm/rocm-core>`_,6.1.5,6.1.2,6.0.0
`ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,20240125.5.08,20240125.5.08,20231016.2.245
,,,
TOOLS,.. _tools-support-compatibility-matrix:,,
:doc:`AMD SMI <amdsmi:index>`,24.5.1,24.5.1,23.4.2
:doc:`HIPIFY <hipify:index>`,17.0.0.24193,17.0.0.24193,17.0.0.23483
:doc:`ROCdbgapi <rocdbgapi:index>`,0.71.0,0.71.0,0.71.0
:doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0
:doc:`ROCProfiler <rocprofiler:index>`,2.0.60105,2.0.60102,2.0.60000
`rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.3.0,0.3.0,N/A
:doc:`ROCTracer <roctracer:index>`,4.1.60105,4.1.60102,4.1.60000
:doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,1.4.0,1.4.0,1.4.0
:doc:`ROCm Data Center Tool <rdc:index>`,0.3.0,0.3.0,0.3.0
:doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,14.1.0,14.1.0,13.2.0
:doc:`ROCm SMI <rocm_smi_lib:index>`,7.2.0,7.2.0,6.0.0
:doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,rocm-6.1.5,rocm-6.1.2,rocm-6.0.0
:doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.0.3,2.0.3,2.0.3
,,,
COMPILERS,.. _compilers-support-compatibility-matrix:,,
`clang-ocl <https://github.com/ROCm/clang-ocl>`_,0.5.0,0.5.0,0.5.0
:doc:`hipCC <hipcc:index>`,1.0.0,1.0.0,1.0.0
`Flang <https://github.com/ROCm/flang>`_,17.0.0.24193,17.0.0.24193,17.0.0.23483
:doc:`llvm-project <llvm-project:index>`,17.0.0.24193,17.0.0.24193,17.0.0.23483
`OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,17.0.0.24193,17.0.0.24193,17.0.0.23483
,,,
RUNTIMES,.. _runtime-support-compatibility-matrix:,,
:doc:`AMD CLR <hip:understand/amd_clr>`,6.1.40093,6.1.40093,6.1.32830
:doc:`HIP <hip:index>`,6.1.40093,6.1.40093,6.1.32830
`OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0,2.0.0
:doc:`ROCR-Runtime <rocr-runtime:index>`,1.13.0,1.13.0,1.12.0
:doc:`Operating Systems <rocm-install-on-linux:reference/system-requirements>`, "Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3"
,"Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5"
,"RHEL 9.3, 9.2","RHEL 9.3, 9.2"
,"RHEL 8.9, 8.8","RHEL 8.9, 8.8"
,"SLES 15 SP5, SP4","SLES 15 SP5, SP4"
,CentOS 7.9,CentOS 7.9
,,
:doc:`GFX Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA3,CDNA3
,CDNA2,CDNA2
,CDNA,CDNA
,RDNA3,RDNA3
,RDNA2,RDNA2
,,
:doc:`GFX Card <rocm-install-on-linux:reference/system-requirements>`,gfx1100,gfx1100
,gfx1030,gfx1030
,gfx942 [#]_, gfx942 [#]_
,gfx90a,gfx90a
,gfx908,gfx908
,,
ECOSYSTEM SUPPORT:,,
:doc:`PyTorch <rocm-install-on-linux:how-to/3rd-party/pytorch-install>`,"2.1, 2.0, 1.13","2.1, 2.0, 1.13"
:doc:`Tensorflow <rocm-install-on-linux:how-to/3rd-party/tensorflow-install>`,"2.15, 2.14, 2.13","2.14, 2.13, 2.12"
:doc:`JAX <rocm-install-on-linux:how-to/3rd-party/jax-install>`,0.4.26,0.4.26
`ONNX-RT <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.17.3,1.14.1
,,
3RD PARTY COMMUNICATION LIBS:,,
`UCC <https://github.com/ROCm/ucc>`_,>=1.2.0,>=1.2.0
`UCX <https://github.com/ROCm/ucx>`_,>=1.14.1,>=1.14.1
,,
3RD PARTY ALGORITHM LIBS:,,
Thrust,2.1.0,2.0.1
CUB,2.1.0,2.0.1
,,
ML & COMPUTER VISION LIBS:,,
:doc:`Composable Kernel <composable_kernel:index>`,1.1.0,1.1.0
:doc:`MIGraphX <amdmigraphx:index>`,2.9.0,2.8.0
:doc:`MIOpen <miopen:index>`,3.1.0,3.0.0
:doc:`MIVisionX <mivisionx:doxygen/html/index>`,2.5.0,2.5.0
:doc:`rocDecode <rocdecode:index>`,0.5.0,N/A
:doc:`RPP <rpp:index>`,1.5.0,1.4.0
,,
COMMUNICATION:,,
:doc:`rccl <rccl:index>`,2.18.6,2.18.3
,,
MATH LIBS:,,
`half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0
:doc:`hipBLAS <hipblas:index>`,2.1.0,2.0.0
:doc:`hipBLASLt <hipblaslt:index>`,0.7.0,0.6.0
:doc:`hipFFT <hipfft:index>`,1.0.14,1.0.13
:doc:`hipFORT <hipfort:index>`,0.4.0,0.4.0
:doc:`hipRAND <hiprand:index>`,2.10.16,2.10.16
:doc:`hipSOLVER <hipsolver:index>`,2.1.0,2.0.0
:doc:`hipSPARSE <hipsparse:index>`,3.0.1,3.0.0
:doc:`hipSPARSELt <hipsparselt:index>`,0.1.0,0.1.0
:doc:`rocALUTION <rocalution:index>`,3.1.1,3.0.3
:doc:`rocBLAS <rocblas:index>`,4.1.0,4.0.0
:doc:`rocFFT <rocfft:index>`,1.0.27,1.0.23
:doc:`rocRAND <rocrand:index>`,3.0.1,2.10.17
:doc:`rocSOLVER <rocsolver:index>`,3.25.0,3.24.0
:doc:`rocSPARSE <rocsparse:index>`,3.1.2,3.0.2
:doc:`rocWMMA <rocwmma:index>`,1.4.0,1.3.0
`Tensile <https://github.com/ROCm/Tensile>`_,4.40.0,4.39.0
,,
PRIMITIVES:,,
:doc:`hipCUB <hipcub:index>`,3.1.0,3.0.0
:doc:`hipTensor <hiptensor:index>`,1.2.0,1.1.0
:doc:`rocPRIM <rocprim:index>`,3.1.0,3.0.0
:doc:`rocThrust <rocthrust:index>`,3.0.1,3.0.0
,,
SUPPORT LIBS:,,
`hipother <https://github.com/ROCm/hipother>`_,6.1.40091,6.0.32830
`rocm-cmake <https://github.com/ROCm/rocm-cmake>`_,0.12.0,0.11.0
`rocm-core <https://github.com/ROCm/rocm-core>`_,6.1.0,6.0.0
`ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,20240125.3.30,20231016.2.245
,,
TOOLS:,,
:doc:`AMD SMI <amdsmi:index>`,24.4.1,23.4.2
:doc:`HIPIFY <hipify:index>`,17.0.0,17.0.0
:doc:`ROCdbgapi <rocdbgapi:index>`,0.71.0,0.71.0
`ROCdebug-Agent <https://github.com/ROCm/rocr_debug_agent>`_,2.0.3,2.0.3
:doc:`rocGDB <rocgdb:index>`,14.1.0,13.2.0
:doc:`rocProfiler <rocprofiler:profiler_home_page>`,2.0.60100,2.0.0
`rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.3.0,N/A
:doc:`rocTracer <roctracer:index>`,4.1.60100,4.1.0
`rocm_bandwidth_test <https://github.com/ROCm/rocm_bandwidth_test>`_,1.4.0,1.4.0
:doc:`ROCm Data Center Tool <rdc:index>`,0.3.0,0.3.0
`rocminfo <https://github.com/ROCm/rocminfo>`_,1.0.0,1.0.0
:doc:`ROCm SMI Lib <rocm_smi_lib:index>`,7.0.0,6.0.0
:doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,rocm-6.1.0,rocm-6.0.0
:doc:`TransferBench <transferbench:index>`,1.48,1.46
,,
COMPILERS:,,
`AOMP <https://github.com/ROCm/aomp>`_,17.60.0,17.60.0
`clang-ocl <https://github.com/ROCm/clang-ocl>`_,0.5.0,0.5.0
`Flang <https://github.com/ROCm/flang>`_,17.0.0.24103,17.0.0.23483
`llvm-project <https://github.com/ROCm/llvm-project>`_,17.0.0.24103,17.0.0.23483
`OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,17.0.0.24103,17.0.0.23483
,,
RUNTIMES:,,
:doc:`HIP <hip:index>`,6.1.40091,6.0.32830
`OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0
`ROCR Runtime <https://github.com/ROCm/ROCR-Runtime>`_,1.13.0,1.12.0
.. rubric:: Footnotes
.. [#] **For ROCm 6.1** - MI300A (gfx942) is supported on Ubuntu 22.04.4, RHEL 9.3 & 8.9 and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.4.
.. [#] **For ROCm 6.0** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9 and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.3.
.. [#Ubuntu220405] Preview support of Ubuntu 22.04.5 only.
.. [#red-hat94] **For ROCm 6.1** - RHEL 9.4 is supported only on AMD Instinct MI300A.
.. [#oracle89] **For ROCm 6.1.1** - Oracle Linux is supported only on AMD Instinct MI300X.
.. [#mi300_612] **For ROCm 6.1** - MI300A (gfx942) is supported on Ubuntu 22.04.4, RHEL 9.4, RHEL 9.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.4.
.. [#mi300_600] **For ROCm 6.0** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9 and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.3.

View File

@@ -416,7 +416,7 @@ description, refer to the corresponding library data type support page.
- -/✅
- -/✅
*
- hipRAND (:doc:`details <hiprand:api-reference/data-type-support>`)
- hipRAND (:doc:`details <hiprand:data-type-support>`)
- -/✅
- -/✅
- -/✅
@@ -428,7 +428,7 @@ description, refer to the corresponding library data type support page.
- ✅/✅
- ✅/✅
*
- hipCUB (:doc:`details <hipcub:api-reference/data-type-support>`)
- hipCUB (:doc:`details <hipcub:data-type-support>`)
- ✅/✅
- ✅/✅
- ✅/✅
@@ -474,7 +474,7 @@ description, refer to the corresponding library data type support page.
- -/✅
- -/✅
*
- hipRAND (:doc:`details <hiprand:api-reference/data-type-support>`)
- hipRAND (:doc:`details <hiprand:data-type-support>`)
- -/❌
- -/❌
- -/✅
@@ -492,7 +492,7 @@ description, refer to the corresponding library data type support page.
- ✅/✅
- ✅/✅
*
- hipCUB (:doc:`details <hipcub:api-reference/data-type-support>`)
- hipCUB (:doc:`details <hipcub:data-type-support>`)
- ❌/❌
- ❌/❌
- ✅/✅

View File

@@ -33,8 +33,8 @@ Units (CU). The MI250 GCD has 104 active CUs. Each compute unit is further
subdivided into four SIMD units that process SIMD instructions of 16 data
elements per instruction (for the FP64 data type). This enables the CU to
process 64 work items (a so-called “wavefront”) at a peak clock frequency of 1.7
GHz. Therefore, the theoretical maximum FP64 peak performance per GCD is 22.6
TFLOPS for vector instructions. This equates to 45.3 TFLOPS for vector instructions for both GCDs together. The MI250 compute units also provide specialized
GHz. Therefore, the theoretical maximum FP64 peak performance per GCD is 45.3
TFLOPS for vector instructions. The MI250 compute units also provide specialized
execution units (also called matrix cores), which are geared toward executing
matrix operations like matrix-matrix multiplications. For FP64, the peak
performance of these units amounts to 90.5 TFLOPS.
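As a quick sanity check (this arithmetic is ours, not part of the original page), the per-GCD vector figure follows directly from the counts quoted above: 104 CUs, 4 SIMD units of 16 FP64 lanes each, 2 FLOPs per FMA, and a 1.7 GHz peak clock.

```math
104 \times 4 \times 16 \times 2\,\mathrm{FLOP} \times 1.7\,\mathrm{GHz} \approx 22.6\;\mathrm{TFLOPS\ per\ GCD},
\qquad 2 \times 22.6 \approx 45.3\;\mathrm{TFLOPS\ per\ MI250}.
```

Doubling for the two GCDs gives the 45.3 TFLOPS figure for the full MI250, and the matrix-core figure of 90.5 TFLOPS is four times the per-GCD vector rate.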

View File

@@ -10,7 +10,7 @@ GPU computational elements of the processor along with the lower levels of the c
The following image depicts the structure of a single XCD in the AMD Instinct MI300 accelerator series.
```{figure} ../../data/shared/xcd-sys-arch.png
```{figure} ../../data/conceptual/gpu-arch/image007.png
---
name: mi300-xcd
align: center
@@ -103,7 +103,7 @@ MI300 series system architecture showing MI300A (left) with 6 XCDs and 3 CCDs, w
## Node-level architecture
```{figure} ../../data/shared/mi300-node-level-arch.png
```{figure} ../../data/conceptual/gpu-arch/image009.png
---
name: mi300-node

View File

@@ -51,7 +51,7 @@ In HIP, pinned memory allocations are coherent by default (`hipHostMallocDefault
There are additional pinned memory flags (e.g. `hipHostMallocMapped` and `hipHostMallocPortable`).
On MI200 these options do not impact performance.
<!-- TODO: link to programming_manual#memory-allocation-flags -->
For more information, see the section *memory allocation flags* in the HIP Programming Guide: {doc}`hip:how-to/programming_manual`.
For more information, see the section *memory allocation flags* in the HIP Programming Guide: {doc}`hip:user_guide/programming_manual`.
:::
Much like how a process can be locked to a CPU core by setting affinity, a pinned memory allocator does this with the memory storage system.
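As a small illustration of the allocation flags discussed in the note above, the sketch below pins a host buffer with `hipHostMalloc`. It is a minimal example of ours, not taken from the original page; the buffer size and the plain `hipMemcpy` are arbitrary choices, and error handling is reduced to a single check.

```cpp
// Minimal sketch: pinned (page-locked) host memory in HIP.
#include <hip/hip_runtime.h>
#include <cstdio>

int main() {
    const size_t bytes = 1 << 20;   // 1 MiB, arbitrary example size
    void* host_ptr = nullptr;
    void* dev_ptr  = nullptr;

    // Coherent pinned allocation; hipHostMallocDefault is implied when no
    // flags are passed. hipHostMallocMapped additionally maps the buffer
    // into the device address space (no performance impact on MI200).
    if (hipHostMalloc(&host_ptr, bytes, hipHostMallocMapped) != hipSuccess) {
        fprintf(stderr, "hipHostMalloc failed\n");
        return 1;
    }
    hipMalloc(&dev_ptr, bytes);

    // Copies from pinned memory can be performed by DMA without an extra
    // staging copy through pageable host memory.
    hipMemcpy(dev_ptr, host_ptr, bytes, hipMemcpyHostToDevice);

    hipFree(dev_ptr);
    hipHostFree(host_ptr);
    return 0;
}
```

The flag choice here is only to show the call shape; as the note says, on MI200 the additional flags do not change performance.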

View File

@@ -0,0 +1,47 @@
.. meta::
   :description: Setting the number of CUs
   :keywords: AMD, ROCm, cu, number of cus
.. _env-variables-reference:
*************************************************************
Setting the number of CUs
*************************************************************
When using GPUs to accelerate compute workloads, it sometimes becomes necessary
to configure the hardware's usage of Compute Units (CU). This is a more advanced
option, so please read this page before experimentation.
The GPU driver provides two environment variables to set the number of CUs used. The
first one is ``HSA_CU_MASK`` and the second one is ``ROC_GLOBAL_CU_MASK``. The main
difference is that ``ROC_GLOBAL_CU_MASK`` sets the CU mask on queues created by the HIP
or the OpenCL runtimes, while ``HSA_CU_MASK`` sets the mask at a lower level of queue
creation in the driver; a mask set through ``HSA_CU_MASK`` also applies to queues being profiled.
The environment variables have the following syntax:
::
   ID       = [0-9][0-9]*                      ex. base 10 numbers
   ID_list  = (ID | ID-ID)[, (ID | ID-ID)]*    ex. 0,2-4,7
   GPU_list = ID_list                          ex. 0,2-4,7
   CU_list  = 0x[0-F]* | ID_list               ex. 0x337F OR 0,2-4,7
   CU_Set   = GPU_list : CU_list               ex. 0,2-4,7:0-15,32-47 OR 0,2-4,7:0x337F
   HSA_CU_MASK = CU_Set [; CU_Set]*            ex. 0,2-4,7:0-15,32-47; 3-9:0x337F
The GPU indices are taken after ``ROCR_VISIBLE_DEVICES`` reordering. For the GPUs listed,
the listed or masked CUs are enabled and the rest are disabled. GPUs that are not listed
are unaffected; all of their CUs remain enabled.
Parsing of the variable stops when a syntax error occurs; the erroneous set and any sets
that follow it are ignored. Repeated GPU or CU IDs are a syntax error, as is specifying
a mask with no usable CUs (a CU_list of 0x0). To exclude entire GPU devices, use
``ROCR_VISIBLE_DEVICES``.
These environment variables only affect ROCm software, not graphics applications.
It's important to know that not all CU configurations are valid on all devices. For
instance, on devices where two CUs can be combined into a WGP (for kernels running in
WGP mode), it is not valid to disable only a single CU in a WGP. `This paper
<https://www.cs.unc.edu/~otternes/papers/rtsj2022.pdf>`_ can provide more information
about what to expect when disabling CUs.

View File

@@ -13,9 +13,7 @@ This document provides documentation on using ROCm ASan.
For information about LLVM ASan, see the [LLVM documentation](https://clang.llvm.org/docs/AddressSanitizer.html).
:::{note}
The beta release of LLVM ASan for ROCm is currently tested and validated on Ubuntu 20.04.
:::
**Note:** The beta release of LLVM ASan for ROCm is currently tested and validated on Ubuntu 20.04.
## Compiling for ASan
@@ -36,13 +34,9 @@ Recommendations for doing this are:
Other architectures are allowed, but their device code will not be instrumented and a warning will be emitted.
:::{tip}
It is not an error to compile some files without ASan instrumentation, but doing so reduces the ability of the process to detect addressing errors. However, if the main program "`a.out`" does not directly depend on the ASan runtime (`libclang_rt.asan-x86_64.so`) after the build completes (check by running `ldd` (List Dynamic Dependencies) or `readelf`), the application will immediately report an error at runtime as described in the next section.
:::
**Note:** It is not an error to compile some files without ASan instrumentation, but doing so reduces the ability of the process to detect addressing errors. However, if the main program "`a.out`" does not directly depend on the ASan runtime (`libclang_rt.asan-x86_64.so`) after the build completes (check by running `ldd` (List Dynamic Dependencies) or `readelf`), the application will immediately report an error at runtime as described in the next section.
:::{note}
When compiling OpenMP programs with ASan instrumentation, it is currently necessary to set the environment variable `LIBRARY_PATH` to `/opt/rocm-<version>/lib/llvm/lib/asan:/opt/rocm-<version>/lib/asan`. At runtime, it may be necessary to add `/opt/rocm-<version>/lib/llvm/lib/asan` to `LD_LIBRARY_PATH`.
:::
**Note:** When compiling OpenMP programs with ASan instrumentation, it is currently necessary to set the environment variable `LIBRARY_PATH` to `/opt/rocm-<version>/lib/llvm/lib/asan:/opt/rocm-<version>/lib/asan`. At runtime, it may be necessary to add `/opt/rocm-<version>/lib/llvm/lib/asan` to `LD_LIBRARY_PATH`.
### About compilation time
@@ -98,23 +92,15 @@ If it does not appear, when executed the application will quickly output an ASan
There is an environment variable, `ASAN_OPTIONS`, that can be used to adjust the runtime behavior of the ASan runtime itself. There are more than a hundred "flags" that can be adjusted (see an old list at [flags](https://github.com/google/sanitizers/wiki/AddressSanitizerFlags)) but the default settings are correct and should be used in most cases. It must be noted that these options only affect the host ASan runtime. The device runtime only currently supports the default settings for the few relevant options.
There are three `ASAN_OPTIONS` flags of note.
There are two `ASAN_OPTIONS` flags of particular note.
* `halt_on_error=0/1 default 1`.
This tells the ASan runtime to halt the application immediately after detecting and reporting an addressing error. The default makes sense because the application has entered the realm of undefined behavior. If the developer wishes to have the application continue anyway, this option can be set to zero. However, the application and libraries should then be compiled with the additional option `-fsanitize-recover=address`. Note that the ROCm optional ASan instrumented libraries are not compiled with this option and if an error is detected within one of them, but halt_on_error is set to 0, more undefined behavior will occur.
* `detect_leaks=0/1 default 1`.
This option directs the ASan runtime to enable the [Leak Sanitizer](https://clang.llvm.org/docs/LeakSanitizer.html) (LSan). For heterogeneous applications, this default results in significant output from the leak sanitizer when the application exits due to allocations made by the language runtime which are not considered to be leaks. This output can be avoided by adding `detect_leaks=0` to the `ASAN_OPTIONS`, or alternatively by producing an LSan suppression file (syntax described [here](https://github.com/google/sanitizers/wiki/AddressSanitizerLeakSanitizer)) and activating it with environment variable `LSAN_OPTIONS=suppressions=/path/to/suppression/file`. When using a suppression file, a suppression report is printed by default. The suppression report can be disabled by using the `LSAN_OPTIONS` flag `print_suppressions=0`.
* `quarantine_size_mb=N default 256`
This option defines the number of megabytes (MB) `N` of memory that the ASan runtime will hold after it is `freed` to detect use-after-free situations. This memory is unavailable for other purposes. The default of 256 MB may be too small to detect some use-after-free situations, especially given that the large size of many GPU memory allocations may push `freed` allocations out of quarantine before the attempted use.
:::{note}
Setting the value of `quarantine_size_mb` larger may enable more problematic uses to be detected, but at the cost of reducing memory available for other purposes.
:::
This option directs the ASan runtime to enable the [Leak Sanitizer](https://clang.llvm.org/docs/LeakSanitizer.html) (LSan). Unfortunately, for heterogeneous applications, this default will result in significant output from the leak sanitizer when the application exits due to allocations made by the language runtime which are not considered to be leaks. This output can be avoided by adding `detect_leaks=0` to the `ASAN_OPTIONS`, or alternatively by producing an LSan suppression file (syntax described [here](https://github.com/google/sanitizers/wiki/AddressSanitizerLeakSanitizer)) and activating it with environment variable `LSAN_OPTIONS=suppressions=/path/to/suppression/file`. When using a suppression file, a suppression report is printed by default. The suppression report can be disabled by using the `LSAN_OPTIONS` flag `print_suppressions=0`.
## Runtime overhead
@@ -200,7 +186,7 @@ or
currently may include one or two surprising CPU side tracebacks mentioning "hostcall". This is due to how `malloc` and `free` are implemented for GPU code and these call stacks can be ignored.
## Running ASan with `rocgdb`
### Running with `rocgdb`
`rocgdb` can be used to further investigate ASan detected errors, with some preparation.
@@ -252,7 +238,7 @@ $ rocgdb <path to application>
(gdb) c
```
## Using ASan with a short HIP application
### Using ASan with a short HIP application
Consider the following simple and short demo of using the Address Sanitizer with a HIP application:
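The demo itself is elided from this diff view. As a rough stand-in (our sketch, not the original demo), a deliberately broken HIP program of the kind ASan catches might look like the following; it assumes the program is built with the ASan instrumentation options described earlier on this page:

```cpp
// Sketch only: the last thread writes one element past the end of a device
// allocation, which an ASan-instrumented build should report as a
// heap-buffer-overflow instead of silently corrupting memory.
#include <hip/hip_runtime.h>

__global__ void write_past_end(int* data) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    data[i + 1] = i;   // off-by-one for the highest thread index
}

int main() {
    const int n = 256;
    int* d = nullptr;
    hipMalloc(&d, n * sizeof(int));   // red-zoned when ASan is active
    write_past_end<<<1, n>>>(d);      // expect an ASan report here
    hipDeviceSynchronize();
    hipFree(d);
    return 0;
}
```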
@@ -416,7 +402,7 @@ Shadow byte legend (one shadow byte represents 8 application bytes):
==2817==ABORTING
```
## Known issues with using GPU sanitizer
### Known issues with using GPU sanitizer
* Red zones must have limited size. It is possible for an invalid access to completely miss a red zone and not be detected.
@@ -424,8 +410,4 @@ Shadow byte legend (one shadow byte represents 8 application bytes):
* Lack of detection on the GPU might also be due to the implementation not instrumenting accesses to all GPU specific address spaces. For example, in the current implementation accesses to "private" or "stack" variables on the GPU are not instrumented, and accesses to HIP shared variables (also known as "local data store" or "LDS") are also not instrumented.
* It can also be the case that a memory fault is reported for an invalid address even with the instrumentation. This is usually caused by the invalid address being so wild that its shadow address is outside any memory region, and the fault actually occurs on the access to the shadow address. It is also possible to hit a memory fault for the `NULL` pointer. While address 0 does have a shadow location, it is not poisoned by the runtime.
* There is currently a bug which can result in memory faults being reported when running instrumented device code which makes use of `malloc`, `free`, `new`, or `delete`.
* There is currently a bug which can result in undefined symbols being reported at compile time when instrumented device code makes use of `new` and `delete`.
* It can also be the case that a memory fault is hit for an invalid address even with the instrumentation. This is usually caused by the invalid address being so wild that its shadow address is outside any memory region, and the fault actually occurs on the access to the shadow address. It is also possible to hit a memory fault for the `NULL` pointer. While address 0 does have a shadow location, it is not poisoned by the runtime.

View File

@@ -5,9 +5,25 @@
# https://www.sphinx-doc.org/en/master/usage/configuration.html
import shutil
import jinja2
import os
# Environment to process Jinja templates.
jinja_env = jinja2.Environment(loader=jinja2.FileSystemLoader("."))
# Jinja templates to render out.
templates = []
# Render templates and output files without the last extension.
# For example: 'install.md.jinja' becomes 'install.md'.
for template in templates:
    rendered = jinja_env.get_template(template).render()
    with open(os.path.splitext(template)[0], 'w') as file:
        file.write(rendered)
shutil.copy2('../RELEASE.md','./about/release-notes.md')
# Keep capitalization due to similar linking on GitHub's markdown preview.
shutil.copy2("../RELEASE.md", "./about/release-notes.md")
shutil.copy2('../CHANGELOG.md','./about/changelog.md')
latex_engine = "xelatex"
latex_elements = {
@@ -21,71 +37,58 @@ latex_elements = {
# configurations for PDF output by Read the Docs
project = "ROCm Documentation"
author = "Advanced Micro Devices, Inc."
copyright = "Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved."
version = "6.1.5"
release = "6.1.5"
copyright = "Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved."
version = "6.1.2"
release = "6.1.2"
setting_all_article_info = True
all_article_info_os = ["linux", "windows"]
all_article_info_author = ""
# pages with specific settings
article_pages = [
{"file": "about/release-notes", "os": ["linux"], "date": "2025-03-04"},
{"file": "compatibility/compatibility-matrix", "os": ["linux"]},
{"file": "how-to/deep-learning-rocm", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/index", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/install", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/train-a-model", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/deploy-your-model", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/hugging-face-models", "os": ["linux"]},
{"file": "how-to/rocm-for-hpc/index", "os": ["linux"]},
{"file": "how-to/llm-fine-tuning-optimization/index", "os": ["linux"]},
{"file": "how-to/llm-fine-tuning-optimization/overview", "os": ["linux"]},
{
"file": "how-to/llm-fine-tuning-optimization/fine-tuning-and-inference",
"os": ["linux"],
"file":"about/release-notes",
"os":["linux", "windows"],
"date":"2024-06-04"
},
{
"file": "how-to/llm-fine-tuning-optimization/single-gpu-fine-tuning-and-inference",
"os": ["linux"],
"file":"about/changelog",
"os":["linux", "windows"],
"date":"2024-06-04"
},
{
"file": "how-to/llm-fine-tuning-optimization/multi-gpu-fine-tuning-and-inference",
"os": ["linux"],
},
{
"file": "how-to/llm-fine-tuning-optimization/llm-inference-frameworks",
"os": ["linux"],
},
{
"file": "how-to/llm-fine-tuning-optimization/model-acceleration-libraries",
"os": ["linux"],
},
{"file": "how-to/llm-fine-tuning-optimization/model-quantization", "os": ["linux"]},
{
"file": "how-to/llm-fine-tuning-optimization/optimizing-with-composable-kernel",
"os": ["linux"],
},
{
"file": "how-to/llm-fine-tuning-optimization/optimizing-triton-kernel",
"os": ["linux"],
},
{
"file": "how-to/llm-fine-tuning-optimization/profiling-and-debugging",
"os": ["linux"],
},
{"file": "how-to/system-optimization/index", "os": ["linux"]},
{"file": "how-to/system-optimization/mi300x", "os": ["linux"]},
{"file": "how-to/system-optimization/mi200", "os": ["linux"]},
{"file": "how-to/system-optimization/mi100", "os": ["linux"]},
{"file": "how-to/system-optimization/w6000-v620", "os": ["linux"]},
{"file": "how-to/tuning-guides/mi300x/index", "os": ["linux"]},
{"file": "how-to/tuning-guides/mi300x/system", "os": ["linux"]},
{"file": "how-to/tuning-guides/mi300x/workload", "os": ["linux"]},
{"file": "how-to/system-debugging", "os": ["linux"]},
{"file": "how-to/gpu-enabled-mpi", "os": ["linux"]},
{"file":"install/windows/install-quick", "os":["windows"]},
{"file":"install/linux/install-quick", "os":["linux"]},
{"file":"install/linux/install", "os":["linux"]},
{"file":"install/linux/install-options", "os":["linux"]},
{"file":"install/linux/prerequisites", "os":["linux"]},
{"file":"install/docker", "os":["linux"]},
{"file":"install/magma-install", "os":["linux"]},
{"file":"install/pytorch-install", "os":["linux"]},
{"file":"install/tensorflow-install", "os":["linux"]},
{"file":"install/windows/install", "os":["windows"]},
{"file":"install/windows/prerequisites", "os":["windows"]},
{"file":"install/windows/cli/index", "os":["windows"]},
{"file":"install/windows/gui/index", "os":["windows"]},
{"file":"about/compatibility/docker-image-support-matrix", "os":["linux"]},
{"file":"about/compatibility/user-kernel-space-compat-matrix", "os":["linux"]},
{"file":"reference/library-index", "os":["linux"]},
{"file":"how-to/deep-learning-rocm", "os":["linux"]},
{"file":"how-to/gpu-enabled-mpi", "os":["linux"]},
{"file":"how-to/system-debugging", "os":["linux"]},
{"file":"how-to/tuning-guides", "os":["linux", "windows"]},
{"file":"rocm-a-z", "os":["linux", "windows"]},
]
exclude_patterns = ['temp']
external_toc_path = "./sphinx/_toc.yml"
extensions = ["rocm_docs", "sphinx_reredirects"]
@@ -96,12 +99,14 @@ html_theme = "rocm_docs_theme"
html_theme_options = {"flavor": "rocm-docs-home"}
html_static_path = ["sphinx/static/css"]
html_css_files = ["rocm_custom.css", "rocm_rn.css"]
html_css_files = ["rocm_custom.css"]
html_title = "ROCm Documentation"
html_theme_options = {"link_main_doc": False}
html_theme_options = {
    "link_main_doc": False
}
redirects = {"reference/openmp/openmp": "../../about/compatibility/openmp.html"}
numfig = False
redirects = {
    "reference/openmp/openmp": "../../about/compatibility/openmp.html"
}

View File

@@ -12,7 +12,8 @@ There are four standard ways to provide feedback on this repository.
All contributions to ROCm documentation should arrive via the
[GitHub Flow](https://docs.github.com/en/get-started/quickstart/github-flow)
targeting the develop branch of the repository.
targeting the develop branch of the repository. If you are unable to contribute
via the GitHub Flow, feel free to email us at [rocm-feedback@amd.com](mailto:rocm-feedback@amd.com?subject=Documentation%20Feedback).
For more in-depth information on creating a pull request (PR), see
[Contributing](./contributing.md).
@@ -29,3 +30,7 @@ and follow along via public announcements.
Issues on existing or absent documentation can be filed in
[GitHub Issues](https://github.com/ROCm/ROCm/issues).
## Email
Send other feedback or questions to [rocm-feedback@amd.com](mailto:rocm-feedback@amd.com?subject=Documentation%20Feedback).

View File

(Image-only changes: the remaining entries in this diff are figure files under the documentation data directories. Most show identical sizes before and after, consistent with the figure path moves noted above; one new image of about 108 KiB is added and several older images are removed. The diff viewer displays only image dimensions and "Binary file not shown" for these files.)

View File

@@ -8,18 +8,48 @@ Installing deep learning frameworks for ROCm
ROCm provides a comprehensive ecosystem for deep learning development, including
:ref:`libraries <artificial-intelligence-apis>` for optimized deep learning operations and ROCm-aware versions of popular
deep learning frameworks and libraries such as PyTorch, TensorFlow, and JAX. ROCm works closely with these
deep learning frameworks and libraries such as PyTorch, TensorFlow, JAX, and MAGMA. ROCm works closely with these
frameworks to ensure that framework-specific optimizations take advantage of AMD accelerator and GPU architectures.
The following guides cover installation processes for ROCm-aware deep learning frameworks.
* :doc:`PyTorch for ROCm <rocm-install-on-linux:how-to/3rd-party/pytorch-install>`
* :doc:`TensorFlow for ROCm <rocm-install-on-linux:how-to/3rd-party/tensorflow-install>`
* :doc:`JAX for ROCm <rocm-install-on-linux:how-to/3rd-party/jax-install>`
.. grid::
.. grid-item::
:columns: 3
:doc:`PyTorch for ROCm <rocm-install-on-linux:how-to/3rd-party/pytorch-install>`
.. grid-item::
:columns: 3
:doc:`TensorFlow for ROCm <rocm-install-on-linux:how-to/3rd-party/tensorflow-install>`
.. grid-item::
:columns: 3
.. grid-item::
:columns: 3
.. grid-item::
:columns: 3
:doc:`JAX for ROCm <rocm-install-on-linux:how-to/3rd-party/jax-install>`
.. grid-item::
:columns: 3
:doc:`MAGMA for ROCm <rocm-install-on-linux:how-to/3rd-party/magma-install>`
.. grid-item::
:columns: 3
.. grid-item::
:columns: 3
The following chart steps through typical workflows for installing deep learning frameworks for ROCm.
.. image:: ../data/how-to/framework_install_2024_07_04.png
.. image:: ../data/how-to/framework_install_2024_05_23.png
:alt: Flowchart for installing ROCm-aware machine learning frameworks
:align: center
@@ -35,4 +65,4 @@ through the following guides.
* :doc:`rocm-for-ai/index`
* :doc:`llm-fine-tuning-optimization/index`
* :doc:`fine-tuning-llms/index`

View File

@@ -28,9 +28,18 @@ graphs, tensor parallel multi-GPU, GPTQ, AWQ, and token speculation.
Installing vLLM
---------------
1. To install vLLM, run the following commands.
.. code-block:: shell
# Install from source
git clone https://github.com/ROCm/vllm.git
cd vllm
PYTORCH_ROCM_ARCH=gfx942 python setup.py install #MI300 series
.. _fine-tuning-llms-vllm-rocm-docker-image:
1. Run the following commands to build a Docker image ``vllm-rocm``.
2. Run the following commands to build a Docker image ``vllm-rocm``.
.. code-block:: shell
@@ -43,7 +52,7 @@ Installing vLLM
.. tab-item:: vLLM on a single-accelerator system
:sync: single
2. To use vLLM as an API server to serve reference requests, first start a container using the :ref:`vllm-rocm
3. To use vLLM as an API server to serve reference requests, first start a container using the :ref:`vllm-rocm
Docker image <fine-tuning-llms-vllm-rocm-docker-image>`.
.. code-block:: shell
@@ -60,7 +69,7 @@ Installing vLLM
vllm-rocm \
bash
3. Inside the container, start the API server to run on a single accelerator on port 8000 using the following command.
4. Inside the container, start the API server to run on a single accelerator on port 8000 using the following command.
.. code-block:: shell
@@ -68,61 +77,10 @@ Installing vLLM
The following log message displayed in your command line indicates that the server is listening for requests.
.. image:: ../../data/how-to/llm-fine-tuning-optimization/vllm-single-gpu-log.png
.. image:: ../../data/how-to/fine-tuning-llms/vllm-single-gpu-log.png
:alt: vLLM API server log message
:align: center
4. To test, send it a curl request containing a prompt.
.. code-block:: shell
curl http://localhost:8000/generate -H "Content-Type: application/json" -d '{"prompt": "What is AMD Instinct?", "max_tokens": 80, "temperature": 0.0 }'
You should receive a response like the following.
.. code-block:: text
{"text":["What is AMD Instinct?\nAmd Instinct is a brand new line of high-performance computing (HPC) processors from Advanced Micro Devices (AMD). These processors are designed to deliver unparalleled performance for HPC workloads, including scientific simulations, data analytics, and machine learning.\nThe Instinct lineup includes a range of processors, from the entry-level Inst"]}
.. tab-item:: vLLM on a multi-accelerator system
:sync: multi
2. To use vLLM as an API server to serve reference requests, first start a container using the :ref:`vllm-rocm
Docker image <fine-tuning-llms-vllm-rocm-docker-image>`.
.. code-block:: shell
docker run -it \
--network=host \
--group-add=video \
--ipc=host \
--cap-add=SYS_PTRACE \
--security-opt seccomp=unconfined \
--device /dev/kfd \
--device /dev/dri \
-v <path/to/model>:/app/model \
vllm-rocm \
bash
3. To run the API server on multiple GPUs, use the ``-tp`` or ``--tensor-parallel-size`` parameter. For example, to use two
GPUs, start the API server using the following command.
.. code-block:: shell
python -m vllm.entrypoints.api_server --model /app/model --dtype float16 -tp 2 --port 8000 &
4. To run multiple instances of the API server, specify different ports for each server, and use ``ROCR_VISIBLE_DEVICES`` to
isolate each instance to a different set of accelerators.
For example, to run two API servers, one on port 8000 using GPUs 0 and 1 and one on port 8001 using GPUs 2 and 3, use
a command like the following.
.. code-block:: shell
ROCR_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.api_server --model /data/llama-2-7b-chat-hf --dtype float16 -tp 2 --port 8000 &
ROCR_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.api_server --model /data/llama-2-7b-chat-hf --dtype float16 -tp 2 --port 8001 &
5. To test, send it a curl request containing a prompt.
.. code-block:: shell
@@ -134,8 +92,57 @@ Installing vLLM
.. code-block:: text
{"text":["What is AMD Instinct?\nAmd Instinct is a brand new line of high-performance computing (HPC) processors from Advanced Micro Devices (AMD). These processors are designed to deliver unparalleled performance for HPC workloads, including scientific simulations, data analytics, and machine learning.\nThe Instinct lineup includes a range of processors, from the entry-level Inst"]}
.. tab-item:: vLLM on a multi-accelerator system
:sync: multi
Refer to :ref:`mi300x-vllm-optimization` for performance optimization tips.
3. To use vLLM as an API server to serve reference requests, first start a container using the :ref:`vllm-rocm
Docker image <fine-tuning-llms-vllm-rocm-docker-image>`.
.. code-block:: shell
docker run -it \
--network=host \
--group-add=video \
--ipc=host \
--cap-add=SYS_PTRACE \
--security-opt seccomp=unconfined \
--device /dev/kfd \
--device /dev/dri \
-v <path/to/model>:/app/model \
vllm-rocm \
bash
4. To run the API server on multiple GPUs, use the ``-tp`` or ``--tensor-parallel-size`` parameter. For example, to use two
GPUs, start the API server using the following command.
.. code-block:: shell
python -m vllm.entrypoints.api_server --model /app/model --dtype float16 -tp 2 --port 8000 &
5. To run multiple instances of the API server, specify different ports for each server, and use ``ROCR_VISIBLE_DEVICES`` to
isolate each instance to a different set of accelerators.
For example, to run two API servers, one on port 8000 using GPUs 0 and 1 and one on port 8001 using GPUs 2 and 3, use
a command like the following.
.. code-block:: shell
ROCR_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.api_server --model /data/llama-2-7b-chat-hf --dtype float16 -tp 2 --port 8000 &
ROCR_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.api_server --model /data/llama-2-7b-chat-hf --dtype float16 -tp 2 --port 8001 &
6. To test, send it a curl request containing a prompt.
.. code-block:: shell
curl http://localhost:8000/generate -H "Content-Type: application/json" -d '{"prompt": "What is AMD Instinct?", "max_tokens": 80, "temperature": 0.0 }'
You should receive a response like the following.
.. code-block:: text
{"text":["What is AMD Instinct?\nAmd Instinct is a brand new line of high-performance computing (HPC) processors from Advanced Micro Devices (AMD). These processors are designed to deliver unparalleled performance for HPC workloads, including scientific simulations, data analytics, and machine learning.\nThe Instinct lineup includes a range of processors, from the entry-level Inst"]}
.. _fine-tuning-llms-tgi:
@@ -156,29 +163,27 @@ speculation.
Install TGI
-----------
1. Launch the TGI Docker container in the host machine.
1. To install the TGI Docker image, run the following commands.
.. code-block:: shell
docker run --name tgi --rm -it --cap-add=SYS_PTRACE --security-opt seccomp=unconfined
--device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 256g
--net host -v $PWD:/data
--entrypoint "/bin/bash"
--env HUGGINGFACE_HUB_CACHE=/data
ghcr.io/huggingface/text-generation-inference:latest-rocm
# Install from Dockerfile
git clone https://github.com/huggingface/text-generation-inference.git -b mi300-compat
cd text-generation-inference
docker build . -f Dockerfile.rocm
.. tab-set::
.. tab-item:: TGI on a single-accelerator system
:sync: single
2. Inside the container, launch a model using TGI server on a single accelerator.
2. Launch a model using TGI server on a single accelerator.
.. code-block:: shell
export ROCM_USE_FLASH_ATTN_V2_TRITON=True
text-generation-launcher --model-id NousResearch/Meta-Llama-3-70B --dtype float16 --port 8000 &
3. To test, send it a curl request containing a prompt.
.. code-block:: shell
@@ -186,26 +191,26 @@ Install TGI
curl http://localhost:8000/generate_stream -X POST -d '{"inputs":"What is AMD Instinct?","parameters":{"max_new_tokens":20}}' -H 'Content-Type: application/json'
You should receive a response like the following.
.. code-block:: shell
data:{"index":20,"token":{"id":304,"text":" in","logprob":-1.2822266,"special":false},"generated_text":" AMD Instinct is a new family of data center GPUs designed to accelerate the most demanding workloads in","details":null}
.. tab-item:: TGI on a multi-accelerator system
2. Inside the container, launch a model using TGI server on multiple accelerators (4 in this case).
2. Launch a model using TGI server on multiple accelerators (4 in this case).
.. code-block:: shell
export ROCM_USE_FLASH_ATTN_V2_TRITON=True
text-generation-launcher --model-id NousResearch/Meta-Llama-3-8B --dtype float16 --port 8000 --num-shard 4 &
3. To test, send it a curl request containing a prompt.
.. code-block:: shell
curl http://localhost:8000/generate_stream -X POST -d '{"inputs":"What is AMD Instinct?","parameters":{"max_new_tokens":20}}' -H 'Content-Type: application/json'
You should receive a response like the following.
.. code-block:: shell

View File

@@ -8,8 +8,6 @@ Model acceleration libraries
This section discusses model acceleration techniques and libraries to improve memory efficiency and performance.
.. _acceleration-flash-attention:
Flash Attention 2
=================
@@ -20,7 +18,7 @@ Attention (GQA), and Multi-Query Attention (MQA). This reduction in memory movem
time-to-first-token (TTFT) latency for large batch sizes and long prompt sequences, thereby enhancing overall
performance.
.. image:: ../../data/how-to/llm-fine-tuning-optimization/attention-module.png
.. image:: ../../data/how-to/fine-tuning-llms/attention-module.png
:alt: Attention module of a large language module utilizing tiling
:align: center
@@ -245,7 +243,7 @@ page describes the options.
Validator,ROCBLAS_VERSION,4.1.0-cefa4a9b-dirty
GemmTunableOp_float_TN,tn_200_100_20,Gemm_Rocblas_32323,0.00669595
.. image:: ../../data/how-to/llm-fine-tuning-optimization/tunableop.png
.. image:: ../../data/how-to/fine-tuning-llms/tunableop.png
:alt: GEMM and TunableOp
:align: center

View File

@@ -154,12 +154,11 @@ kernels by configuring the ``exllama_config`` parameter as the following.
.. code-block:: python
from transformers import AutoModelForCausalLM, GPTQConfig
#pretrained_model_dir = "meta-llama/Llama-2-7b"
base_model_name = "NousResearch/Llama-2-7b-hf"
gptq_config = GPTQConfig(bits=4, dataset="c4", exllama_config={"version":2})
pretrained_model_dir = "meta-llama/Llama-2-7b"
gptq_config = GPTQConfig(bits=4, exllama_config={"version":2})
quantized_model = AutoModelForCausalLM.from_pretrained(
base_model_name,
device_map="auto",
base_model_name,
device_map="auto",
quantization_config=gptq_config)
bitsandbytes

View File

@@ -0,0 +1,388 @@
.. meta::
:description: How to fine-tune LLMs with ROCm
:keywords: ROCm, LLM, fine-tuning, usage, tutorial, Triton, kernel, performance, optimization
*************************
Optimizing Triton kernels
*************************
This section introduces the general steps for `Triton <https://openai.com/index/triton/>`_ kernel optimization. Broadly,
Triton kernel optimization is similar to HIP and CUDA kernel optimization.
.. _fine-tuning-llms-triton-memory-access-efficiency:
Memory access efficiency
========================
The accelerator or GPU contains global memory, local data share (LDS), and registers. Global memory has high access
latency, but is large. LDS access has much lower latency, but is smaller. Register access is the fastest yet smallest
among the three.
Therefore, data in global memory should be loaded and stored as few times as possible. If different threads in a workgroup
need to access the same data, that data should first be transferred from global memory to LDS and then accessed by the
threads in the workgroup.
.. _fine-tuning-llms-triton-hardware-resource-utilization:
Hardware resource utilization
=============================
Each accelerator or GPU has multiple compute units (CUs) that perform computation in parallel. So, how many CUs can a
compute kernel distribute its work across? For the :doc:`AMD MI300X accelerator <../../reference/gpu-arch-specs>`, the
grid should have at least 1024 thread blocks or workgroups.
.. figure:: ../../data/how-to/fine-tuning-llms/compute-unit.png
Schematic representation of a CU in the CDNA2 or CDNA3 architecture.
To increase hardware utilization and maximize parallelism, it is necessary to design algorithms that can exploit more
parallelism. One approach to achieving this is by using larger split-K techniques for General Matrix Multiply (GEMM)
operations, which can further distribute the computation across more CUs, thereby enhancing performance.
.. tip::
You can query hardware resources with the command ``rocminfo`` (in the ``/opt/rocm/bin`` directory). For instance,
query the number of CUs, number of SIMD, and wavefront size using the following commands.
.. code-block:: shell
rocminfo | grep "Compute Unit"
rocminfo | grep "SIMD"
rocminfo | grep "Wavefront Size"
On an MI300X device, there are 304 CUs, 4 SIMD per CU, and the wavefront size (warp size) is 64. See :doc:`Hardware
specifications <../../reference/gpu-arch-specs>` for a full list of AMD accelerators and GPUs.
.. _fine-tuning-llms-triton-ir-analysis:
IR analysis
===========
In Triton, there are several layouts including *blocked*, *shared*, *sliced*, and *MFMA*.
From the Triton GPU IR (intermediate representation), you can determine in which memory each computation is
performed. The following is a snippet of IR from the Flash Attention decode ``int4`` key-value program. It
de-quantizes the ``int4`` key-value data from the ``int4`` data type to ``fp16``.
.. code-block::
%190 = tt.load %189 {cache = 1 : i32, evict = 1 : i32, isVolatile =
false} : tensor<1x64xi32, #blocked6> loc(#loc159)
%266 = arith.andi %190, %cst_28 : tensor<1x64xi32, #blocked6>
loc(#loc250)
%267 = arith.trunci %266 : tensor<1x64xi32, #blocked6> to
tensor<1x64xi16, #blocked6> loc(#loc251)
%268 = tt.bitcast %267 : tensor<1x64xi16, #blocked6> -> tensor<1x64xf16,
#blocked6> loc(#loc252)
%269 = triton_gpu.convert_layout %268 : (tensor<1x64xf16, #blocked6>) ->
tensor<1x64xf16, #shared1> loc(#loc252)
%270 = tt.trans %269 : (tensor<1x64xf16, #shared1>) -> tensor<64x1xf16,
#shared2> loc(#loc194)
%276 = triton_gpu.convert_layout %270 : (tensor<64x1xf16, #shared2>) ->
tensor<64x1xf16, #blocked5> loc(#loc254)
%293 = arith.mulf %276, %cst_30 : tensor<64x1xf16, #blocked5>
loc(#loc254)
%295 = arith.mulf %292, %294 : tensor<64x32xf16, #blocked5> loc(#loc264)
%297 = arith.addf %295, %296 : tensor<64x32xf16, #blocked5> loc(#loc255)
%298 = triton_gpu.convert_layout %297 : (tensor<64x32xf16, #blocked5>)
-> tensor<64x32xf16, #shared1> loc(#loc255)
%299 = tt.trans %298 : (tensor<64x32xf16, #shared1>) ->
tensor<32x64xf16, #shared2> loc(#loc196)
%300 = triton_gpu.convert_layout %299 : (tensor<32x64xf16, #shared2>) ->
tensor<32x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mfma, kWidth
= 4}>> loc(#loc197)
From the IR, you can see that ``i32`` data is loaded from global memory to registers. After a few element-wise operations
in registers, it is stored in shared memory (LDS) for the transpose operation, which requires data movement across
different threads. Once the transpose is done, the data is loaded from LDS back into registers, and after a few more
element-wise operations, it is stored in LDS again. The last step loads it from LDS to registers and converts it to the
dot-operand layout. In short, this kernel uses LDS twice: once for the transpose, and once to convert the blocked layout
to a dot-operand layout.
Assembly analysis
=================
In the ISA, ensure ``global_load_dwordx4`` is used, especially when the
load happens in a loop.
In most cases, LDS loads and stores should use ``_b128`` as well to
minimize the number of LDS access instructions. Note that upstream (or the backend) might not have ``_b128`` LDS reads and
writes, in which case it uses ``_b64``. In most cases, whether you use the ROCm fork or upstream Triton,
LDS accesses should have a ``_b64`` vector width.
The AMD ISA has the ``s_waitcnt`` instruction to synchronize the dependency
of memory access and computations. The ``s_waitcnt`` instruction can
have two signals, typically in the context of Triton:
* ``lgkmcnt(n)``: `lgkm` stands for LDS, GDS, Constant, and Message.
In this context, it is most often related to LDS access. The number ``n`` is how many such accesses can still be
outstanding before execution continues. For example, ``0`` means all ``lgkm`` accesses must finish before continuing, and
``1`` means only one ``lgkm`` access can still be running asynchronously before proceeding.
* ``vmcnt(n)``: `vm` means vector memory.
This applies when vector memory is accessed, for example, when a global load moves data from global memory to vector memory.
Again, the number ``n`` is how many such accesses can still be outstanding before execution continues.
Generally recommended guidelines are as follows.
* Vectorize memory access as much as possible.
* Ensure synchronization is done efficiently.
* Overlap instructions to hide latency; this requires thoughtful
analysis of the algorithm.
* If you find inefficiencies, trace them back to the LLVM IR, TTGIR,
and even TTIR to see where the problem comes from. If the problem is introduced
during compiler optimization, activate the MLIR dump and check which
optimization pass caused it.
.. _fine-tuning-llms-triton-kernel-occupancy:
Kernel occupancy
================
1. Get the VGPR count: search for ``.vgpr_count`` in the ISA (call this value ``N``).
2. Get the allocated LDS for the kernel by following these steps (call the result ``L``).
a. ``export MLIR_ENABLE_DUMP=1``
b. ``rm -rf ~/.triton/cache``
c. ``python kernel.py | grep "triton_gpu.shared = " | tail -n 1``
d. You should see something like ``triton_gpu.shared = 65536``, indicating 65536 bytes of LDS are allocated for the
kernel.
3. Get the number of waves per workgroup by following these steps (call the result ``nW``).
a. ``export MLIR_ENABLE_DUMP=1``
b. ``rm -rf ~/.triton/cache``
c. ``python kernel.py | grep "triton_gpu.num-warps " | tail -n 1``
d. You should see something like ``"triton_gpu.num-warps" = 8``, indicating 8 waves per workgroup.
4. Compute the occupancy limited by VGPR usage based on ``N`` according to the following table. Call the resulting waves
per EU ``occ_vgpr``.
.. _fine-tuning-llms-occupancy-vgpr-table:
.. figure:: ../../data/how-to/fine-tuning-llms/occupancy-vgpr.png
:alt: Occupancy related to VGPR usage in an Instinct MI300X accelerator.
:align: center
5. Compute the occupancy limited by LDS based on ``L``: ``occ_lds = floor(65536 / L)``.
6. Then the occupancy is ``occ = min(floor(occ_vgpr * 4 / nW), occ_lds) * nW / 4``.
a. ``occ_vgpr * 4`` gives the total number of waves on all 4 execution units (SIMDs)
per CU.
b. ``floor(occ_vgpr * 4 / nW)`` gives the occupancy in workgroups per CU
regarding VGPR usage.
c. The true ``occ`` is the minimum of the two.
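The following Python sketch applies these steps numerically. It assumes MI300X-like limits taken from this section (512 VGPRs per EU, VGPRs allocated in units of 16, and 65536 bytes of LDS), and the helper name ``estimate_occupancy`` is hypothetical; treat it as an illustration of the formula above, not a replacement for the VGPR table.

.. code-block:: python

   import math

   def estimate_occupancy(n_vgpr, lds_bytes, n_waves_per_wg):
       """Estimate waves per EU from VGPR count, LDS usage, and waves per workgroup."""
       # Assumption: VGPRs are allocated in units of 16, with 512 VGPRs available per EU (SIMD).
       vgpr_alloc = math.ceil(n_vgpr / 16) * 16
       occ_vgpr = 512 // vgpr_alloc          # waves per EU limited by VGPR usage
       occ_lds = 65536 // lds_bytes          # workgroups per CU limited by LDS usage
       # 4 EUs (SIMDs) per CU; combine both limits at workgroup granularity.
       occ_wg = min((occ_vgpr * 4) // n_waves_per_wg, occ_lds)
       return occ_wg * n_waves_per_wg / 4    # waves per EU

   # Example: 170 VGPRs round up to 176, so with 64 KiB of LDS and 8 waves per
   # workgroup the estimate is 2 waves per EU.
   print(estimate_occupancy(n_vgpr=170, lds_bytes=65536, n_waves_per_wg=8))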
.. _fine-tuning-llms-triton-kernel-configs-env-vars:
Auto-tunable kernel configurations and environment variables
============================================================
This section relates to the amount of :ref:`memory access <fine-tuning-llms-triton-memory-access-efficiency>` and
computation assigned to each CU. It is related to the usage of LDS, registers and the scheduling of different tasks on
a CU.
The following is a list of kernel arguments used for tuning.
``num_stages=n``
Adjusts the number of pipeline stages for different types of kernels. On AMD accelerators, set ``num_stages``
according to the following rules:
* For kernels with a single GEMM, set to ``0``.
* For kernels with two GEMMs fused (Flash Attention, or any other kernel
that fuses 2 GEMMs), set to ``1``.
* For kernels that fuse a single GEMM with another non-GEMM operator
(for example ReLU activation), set to ``0``.
* For kernels that have no GEMMs, set to ``1``.
``waves_per_eu=n``
Helps to manage Vector General Purpose Registers (VGPR) usage to achieve desired occupancy levels. This argument
hints to the compiler to reduce VGPR to achieve ``n`` occupancy. See
:ref:`Kernel occupancy <fine-tuning-llms-triton-kernel-occupancy>` for more information about how to compute
occupancy.
This argument is useful if:
* The occupancy of the kernel is limited by VGPR usage.
* The current VGPR usage is only a few above a boundary in
:ref:`Occupancy related to VGPR usage in an Instinct MI300X accelerator <fine-tuning-llms-occupancy-vgpr-table>`.
For example, according to the table, 512 VGPRs are available per Execution Unit (EU), and VGPRs are allocated in
units of 16. If the current VGPR usage is 170, the actual allocation is 176 VGPRs, so the
occupancy is only 2 waves per EU since :math:`176 \times 3 > 512`. So, if you set
``waves_per_eu`` to 3, the LLVM backend tries to bring VGPR usage down so
that 3 waves might fit per EU.
``BLOCK_M``, ``BLOCK_N``, ``BLOCK_K``
Tile sizes to be tuned to balance the memory-to-computation ratio. You want tile sizes large enough to
maximize the efficiency of memory-to-computation ratio, but small enough to parallelize the greatest number of
workgroups at the grid level.
``matrix_instr_nonkdim``
Experimental feature for Flash Attention-like kernels that determines the size of the Matrix Fused Multiply-Add
(MFMA) instruction used.
- ``Matrix_instr_nonkdim = 16``: ``mfma_16x16`` is used.
- ``Matrix_instr_nonkdim = 32``: ``mfma_32x32`` is used.
For GEMM kernels on an AMD MI300X accelerator, ``mfma_16x16`` typically outperforms ``mfma_32x32``, even for large
tile/GEMM sizes.
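To show how these arguments typically fit together, here is a minimal, hypothetical ``@triton.autotune`` configuration for a single-GEMM kernel. It assumes a ROCm-enabled Triton build that accepts ``waves_per_eu`` and ``matrix_instr_nonkdim`` as configuration keyword arguments, and the block sizes are placeholder values to tune rather than recommendations.

.. code-block:: python

   import triton
   import triton.language as tl

   # Hypothetical autotune space for a single-GEMM kernel (placeholder values).
   # num_stages=0 follows the single-GEMM rule above; waves_per_eu and
   # matrix_instr_nonkdim are assumed to be consumed by the AMD backend.
   @triton.autotune(
       configs=[
           triton.Config({"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 64,
                          "waves_per_eu": 2, "matrix_instr_nonkdim": 16},
                         num_warps=8, num_stages=0),
           triton.Config({"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 64,
                          "waves_per_eu": 3, "matrix_instr_nonkdim": 16},
                         num_warps=4, num_stages=0),
       ],
       key=["M", "N", "K"],  # re-tune when the problem shape changes
   )
   @triton.jit
   def gemm_kernel(a_ptr, b_ptr, c_ptr, M, N, K,
                   BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr,
                   BLOCK_K: tl.constexpr):
       pass  # kernel body omitted for brevity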
The following is an environment variable used for tuning.
``OPTIMIZE_EPILOGUE``
Setting this variable to ``1`` can improve performance by removing the ``convert_layout`` operation in the epilogue.
It should be turned on (set to ``1``) in most cases. Setting ``OPTIMIZE_EPILOGUE=1`` stores the MFMA instruction
results in the MFMA layout directly; this comes at the cost of reduced global store efficiency, but the impact on
kernel execution time is usually minimal.
By default (``0``), the results of MFMA instruction are converted to blocked layout, which leads to ``global_store``
with maximum vector length, that is ``global_store_dwordx4``.
This is done implicitly with LDS as the intermediate buffer to achieve
data exchange between threads. Padding is used in LDS to avoid bank
conflicts. This usually leads to extra LDS usage, which might reduce
occupancy.
.. note::
This variable is not turned on by default because it only
works with ``tt.store`` but not ``tt.atomic_add``, which is used in split-k and
stream-k GEMM kernels. In the future, it might be enabled with
``tt.atomic_add`` and turned on by default.
See :ref:`IR analysis <fine-tuning-llms-triton-ir-analysis>`.
TorchInductor with Triton tuning knobs
===========================================
The following are suggestions for optimizing matrix multiplication (GEMM) and convolution (``conv``) operations in PyTorch
using ``inductor``, a part of the PyTorch compilation framework. The goal is to leverage Triton to achieve better
performance.
Learn more about TorchInductor environment variables and usage in
`PyTorch documentation <https://pytorch.org/docs/2.3/torch.compiler_inductor_profiling.html>`_.
To enable ``gemm``/``conv`` lowering to Triton, you must use ``inductor``'s ``max_autotune`` mode. This benchmarks a
static list of Triton configurations (``conv`` configurations for max auto-tune plus ``matmul`` configurations for max
auto-tune) and uses the fastest for each shape. Note that Triton is not used if regular :doc:`MIOpen <miopen:index>`
or :doc:`rocBLAS <rocblas:index>` is faster for a specific operation.
* Set ``torch._inductor.config.max_autotune = True`` or ``TORCHINDUCTOR_MAX_AUTOTUNE=1``.
* Or, for more fine-grained control:
``torch._inductor.config.max_autotune.pointwise = True``
To enable tuning for ``pointwise``/``reduction`` ops.
``torch._inductor.config.max_autotune_gemm = True``
To enable tuning or lowering of ``mm``/``conv``\s.
``torch._inductor.max_autotune_gemm_backends/TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_BACKENDS``
To select the candidate backends for ``mm`` auto-tuning. Defaults to
``TRITON,ATEN,NV``. This also includes the ``CUTLASS`` tuning option. Limiting this to
``TRITON`` might improve performance by enabling more fused ``mm`` kernels
instead of going to rocBLAS.
* For ``mm`` tuning, tuning ``coordinate_descent`` might improve performance.
``torch._inductor.config.coordinate_descent_tuning = True`` or ``TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1``
* Inference can see large improvements on AMD GPUs by utilizing
``torch._inductor.config.freezing=True`` or the ``TORCHINDUCTOR_FREEZING=1`` variable, which
inlines weights as constants and enables constant-folding optimizations.
* Enabling ``inductor``'s ``cpp_wrapper`` might reduce overhead. This generates
C++ code that launches Triton binaries directly with
``hipModuleLaunchKernel`` and relies on hipification.
* For NHWC convolution workloads,
``torch._inductor.config.layout_optimization=True`` or ``TORCHINDUCTOR_LAYOUT_OPTIMIZATION=1``
can help by enforcing the ``channels_last`` format throughout the graph, avoiding
any additional transposes added by ``inductor``. Note that
``PYTORCH_MIOPEN_SUGGEST_NHWC=1`` is recommended if using this.
* To extract the generated Triton kernels, set ``TORCH_COMPILE_DEBUG=1``. This creates a
``torch_compile_debug/`` directory at the current path, and the ``output_code.py`` file inside
contains the code strings for the Triton kernels that were generated. Manual work is
then required to strip out a kernel and to compile and launch it via Triton directly.
* For advanced ``matmul`` or ``conv`` configuration tuning, the ``inductor-gemm-tuner`` can
help. It implements the Triton ``conv``/``mm`` implementations used upstream
and allows you to specify the inputs and the configuration tuning search space; new
tunings that are found can be added to the auto-tune list. A short sketch of these ``inductor`` settings follows.
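The following minimal sketch shows how a few of these knobs might be set in a script before compiling a model. The model, shapes, and chosen flags are placeholders; verify the exact ``torch._inductor.config`` attribute names against your PyTorch version.

.. code-block:: python

   import torch
   import torch._inductor.config as inductor_config

   # Enable Triton GEMM/conv lowering and related tuning knobs
   # (equivalent environment variables are listed above).
   inductor_config.max_autotune = True                # TORCHINDUCTOR_MAX_AUTOTUNE=1
   inductor_config.coordinate_descent_tuning = True   # TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1
   inductor_config.freezing = True                    # inference only; inlines weights as constants

   model = torch.nn.Sequential(
       torch.nn.Linear(4096, 4096),
       torch.nn.ReLU(),
       torch.nn.Linear(4096, 4096),
   ).half().cuda().eval()

   compiled = torch.compile(model, mode="max-autotune")

   x = torch.randn(64, 4096, dtype=torch.half, device="cuda")
   with torch.no_grad():
       y = compiled(x)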
Other guidelines
================
* HIP provides a performance-critical environment variable, ``export HIP_FORCE_DEV_KERNARG=1``,
that places HIP kernel arguments directly in
device memory to reduce the latency of accessing kernel arguments. It
can save 2 to 3 μs for some kernels. Setting this variable for the Flash Attention
decode path, which contains ``splitK`` and reduce kernels, lowers the total time
by around 6 μs in the benchmark test.
* Set the clock to deterministic mode. Use the command ``rocm-smi --setperfdeterminism 1900`` to set the maximum clock speed to
1900 MHz instead of the default 2100 MHz. Setting a lower cap reduces the chance of the clock speed dropping due to high
chip temperature. You can restore this setting to its default value with ``rocm-smi -r``.
* Set Non-Uniform Memory Access (NUMA) auto-balancing. Run the command ``cat /proc/sys/kernel/numa_balancing`` to check the
current setting. An output of ``0`` indicates the desired setting is already in place. If the output is ``1``, run the command
``sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'`` to set it.
For these settings, the ``env_check.sh`` script automates setting, resetting, and checking such
environment configurations. Find the script at `<https://github.com/ROCm/triton/blob/rocm_env/scripts/amd/env_check.sh>`__.
.. _fine-tuning-llms-triton-tunableop:
TunableOp
---------
`TunableOp <https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/cuda/tunable/README.md>`_
is a feature used to define and optimize kernels that can have tunable parameters. This is useful in
optimizing the performance of custom kernels by exploring different parameter configurations to find the most efficient
setup. See more about PyTorch TunableOp :ref:`Model acceleration libraries <fine-tuning-llms-pytorch-tunableop>`.
You can easily manipulate the behavior of TunableOp through environment variables, though you could use the C++ interface
``at::cuda::tunable::getTuningContext()``. A Python interface to the ``TuningContext`` does not yet exist.
The default value is ``0``, which means only one iteration is attempted. Remember: there's an overhead to tuning. To try
and minimize the overhead, only a limited number of iterations of a given operation are attempted. If you set this to
``10``, each solution for a given operation can run as many iterations as possible within 10 ms. There is a hard-coded
upper limit of 100 iterations attempted per solution. This is a tuning parameter; if you want the tunings to be chosen
based on an average over multiple iterations, increase the allowed tuning duration.
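As an illustration, the following sketch enables TunableOp through environment variables before importing ``torch`` and runs a GEMM so that it gets tuned and recorded. The variable names follow the TunableOp README linked above, but verify them, and the result file name, against your PyTorch build.

.. code-block:: python

   import os

   # Configure TunableOp before importing torch (assumed variable names; see the
   # TunableOp README for the authoritative list).
   os.environ["PYTORCH_TUNABLEOP_ENABLED"] = "1"                       # turn the feature on
   os.environ["PYTORCH_TUNABLEOP_TUNING"] = "1"                        # search for new tunings
   os.environ["PYTORCH_TUNABLEOP_FILENAME"] = "tunableop_results.csv"  # where results are stored

   import torch

   a = torch.randn(4096, 4096, dtype=torch.float16, device="cuda")
   b = torch.randn(4096, 4096, dtype=torch.float16, device="cuda")

   # The first call triggers tuning for this GEMM shape; subsequent calls reuse
   # the recorded best solution from the results file.
   c = a @ b
   torch.cuda.synchronize()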

View File

@@ -6,7 +6,7 @@
# Optimizing with Composable Kernel
The AMD ROCm Composable Kernel (CK) library provides a programming model for writing performance-critical kernels for machine learning workloads. It generates a general-purpose kernel during the compilation phase through a C++ template, enabling developers to achieve operation fusions on different data precisions.
The AMD ROCm&trade; Composable Kernel (CK) library provides a programming model for writing performance-critical kernels for machine learning workloads. It generates a general-purpose kernel during the compilation phase through a C++ template, enabling developers to achieve operation fusions on different data precisions.
This article gives a high-level overview of CK General Matrix Multiplication (GEMM) kernel based on the design example of `03_gemm_bias_relu`. It also outlines the steps to construct the kernel and run it. Moreover, the article provides a detailed implementation of running SmoothQuant quantized INT8 models on AMD Instinct MI300X accelerators using CK.
@@ -32,7 +32,7 @@ The template parameters of the instance are grouped into four parameter types:
================
### Figure 2
================ -->
```{figure} ../../data/how-to/llm-fine-tuning-optimization/ck-template_parameters.jpg
```{figure} ../../data/how-to/fine-tuning-llms/ck-template_parameters.jpg
The template parameters of the selected GEMM kernel are classified into four groups. These template parameter groups should be defined properly before running the instance.
```
@@ -126,7 +126,7 @@ The row and column, and stride information of input matrices are also passed to
================
### Figure 3
================ -->
```{figure} ../../data/how-to/llm-fine-tuning-optimization/ck-kernel_launch.jpg
```{figure} ../../data/how-to/fine-tuning-llms/ck-kernel_launch.jpg
Templated kernel launching consists of kernel instantiation, making arguments by passing in actual application parameters, creating an invoker, and running the instance through the invoker.
```
@@ -155,7 +155,7 @@ The first operation in the process is to perform the multiplication of input mat
================
### Figure 4
================ -->
```{figure} ../../data/how-to/llm-fine-tuning-optimization/ck-operation_flow.jpg
```{figure} ../../data/how-to/fine-tuning-llms/ck-operation_flow.jpg
Operation flow.
```
@@ -171,7 +171,7 @@ Here, we use [DeviceBatchedGemmMultiD_Xdl](https://github.com/ROCm/composable_ke
================
### Figure 5
================ -->
```{figure} ../../data/how-to/llm-fine-tuning-optimization/ck-root_instance.jpg
```{figure} ../../data/how-to/fine-tuning-llms/ck-root_instance.jpg
Use the DeviceBatchedGemmMultiD_Xdl instance as a root.
```
@@ -421,7 +421,7 @@ Run `python setup.py install` to build and install the extension. It should look
================
### Figure 6
================ -->
```{figure} ../../data/how-to/llm-fine-tuning-optimization/ck-compilation.jpg
```{figure} ../../data/how-to/fine-tuning-llms/ck-compilation.jpg
Compilation and installation of the INT8 kernels.
```
@@ -433,7 +433,7 @@ The implementation architecture of running SmoothQuant models on MI300X GPUs is
================
### Figure 7
================ -->
```{figure} ../../data/how-to/llm-fine-tuning-optimization/ck-inference_flow.jpg
```{figure} ../../data/how-to/fine-tuning-llms/ck-inference_flow.jpg
The implementation architecture of running SmoothQuant models on AMD MI300X accelerators.
```
@@ -459,7 +459,7 @@ Figure 8 shows the performance comparisons between the original FP16 and the Smo
================
### Figure 8
================ -->
```{figure} ../../data/how-to/llm-fine-tuning-optimization/ck-comparisons.jpg
```{figure} ../../data/how-to/fine-tuning-llms/ck-comparisons.jpg
Performance comparisons between the original FP16 and the SmoothQuant-quantized INT8 models on a single MI300X accelerator.
```

View File

@@ -41,7 +41,7 @@ The weight update is as follows: :math:`W_{updated} = W + ΔW`.
If the weight matrix :math:`W` contains 7B parameters, then the weight update matrix :math:`ΔW` should also
contain 7B parameters. Therefore, the :math:`ΔW` calculation is computationally and memory intensive.
.. figure:: ../../data/how-to/llm-fine-tuning-optimization/weight-update.png
.. figure:: ../../data/how-to/fine-tuning-llms/weight-update.png
:alt: Weight update diagram
(a) Weight update in regular fine-tuning. (b) Weight update in LoRA where the product of matrix A (:math:`M\times K`)

View File

@@ -0,0 +1,217 @@
.. meta::
:description: How to fine-tune LLMs with ROCm
:keywords: ROCm, LLM, fine-tuning, usage, tutorial, profiling, debugging, performance, Triton
***********************
Profiling and debugging
***********************
This section discusses profiling and debugging tools and some of their common usage patterns with ROCm applications.
PyTorch Profiler
================
`PyTorch Profiler <https://pytorch.org/docs/stable/profiler.html>`_ can be invoked inside Python scripts, letting you
collect CPU and GPU performance metrics while the script is running. See the `PyTorch Profiler tutorial
<https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html>`_ for more information.
You can then visualize and view these metrics using an open-source profile visualization tool like
`Perfetto UI <https://ui.perfetto.dev>`_.
#. Use the following snippet to invoke PyTorch Profiler in your code.
.. code-block:: python
import torch
import torchvision.models as models
from torch.profiler import profile, record_function, ProfilerActivity
model = models.resnet18().cuda()
inputs = torch.randn(2000, 3, 224, 224).cuda()
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
with record_function("model_inference"):
model(inputs)
prof.export_chrome_trace("resnet18_profile.json")
#. Profile results in ``resnet18_profile.json`` can be viewed by the Perfetto visualization tool. Go to
`<https://ui.perfetto.dev>`__ and import the file. In your Perfetto visualization, you'll see that the upper section
shows transactions denoting the CPU activities that launch GPU kernels while the lower section shows the actual GPU
activities where it processes the ``resnet18`` inferences layer by layer.
.. figure:: ../../data/how-to/fine-tuning-llms/perfetto-trace.svg
Perfetto trace visualization example.
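In addition to exporting a Chrome trace, you can print a summary table directly from the ``prof`` object created in the snippet above; this is standard PyTorch Profiler usage and a quick way to spot the most expensive kernels before opening a full trace.

.. code-block:: python

   # Print the top operators by GPU time from the profiler run above.
   print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))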
ROCm profiling tools
====================
Heterogeneous systems, where programs run on both CPUs and GPUs, introduce additional complexities. Understanding the
critical path and kernel execution is all the more important, so performance tuning is a necessary component in the
benchmarking process.
With AMD's profiling tools, developers are able to gain important insight into how efficiently their application is
using hardware resources and effectively diagnose potential bottlenecks contributing to poor performance. Developers
working with AMD Instinct accelerators have multiple tools depending on their specific profiling needs; these are:
* :ref:`ROCProfiler <fine-tuning-llms-profiling-rocprof>`
* :ref:`Omniperf <fine-tuning-llms-profiling-omniperf>`
* :ref:`Omnitrace <fine-tuning-llms-profiling-omnitrace>`
.. _fine-tuning-llms-profiling-rocprof:
ROCProfiler
-----------
:doc:`ROCProfiler <rocprofiler:index>` is primarily a low-level API for accessing and extracting GPU hardware performance
metrics, commonly called *performance counters*. These counters quantify the performance of the underlying architecture
showcasing which pieces of the computational pipeline and memory hierarchy are being utilized.
Your ROCm installation contains a script or executable command called ``rocprof`` which provides the ability to list all
available hardware counters for your specific accelerator or GPU, and run applications while collecting counters during
their execution.
This ``rocprof`` utility also depends on the :doc:`ROCTracer and ROC-TX libraries <roctracer:index>`, giving it the
ability to collect timeline traces of the accelerator software stack as well as user-annotated code regions.
.. note::
``rocprof`` is a CLI-only utility, so its input and output take the form of ``.txt`` and CSV files. These
formats provide a raw view of the data and put the onus on the user to parse and analyze it. Therefore, ``rocprof``
gives the user full access to and control of raw performance profiling data, but requires extra effort to analyze the
collected data.
.. _fine-tuning-llms-profiling-omniperf:
Omniperf
--------
`Omniperf <https://rocm.github.io/omniperf>`_ is a system performance profiler for high-performance computing (HPC) and
machine learning (ML) workloads using Instinct accelerators. Under the hood, Omniperf uses
:ref:`ROCProfiler <fine-tuning-llms-profiling-rocprof>` to collect hardware performance counters. The Omniperf tool performs
system profiling based on all approved hardware counters for Instinct
accelerator architectures. It provides high level performance analysis features including System Speed-of-Light, IP
block Speed-of-Light, Memory Chart Analysis, Roofline Analysis, Baseline Comparisons, and more.
Omniperf takes the guesswork out of profiling by removing the need to provide text input files with lists of counters
to collect, or to analyze raw CSV output files, as is the case with ROCProfiler. Instead, Omniperf automates the collection
of all available hardware counters in one command and provides a graphical interface to help users understand and
analyze bottlenecks and stressors for their computational workloads on AMD Instinct accelerators.
.. note::
Omniperf collects hardware counters in multiple passes, and will therefore re-run the application during each pass
to collect different sets of metrics.
.. figure:: ../../data/how-to/fine-tuning-llms/omniperf-analysis.png
Omniperf memory chart analysis panel.
In brief, Omniperf provides details about hardware activity for a particular GPU kernel. It also supports both
a web-based GUI or command-line analyzer, depending on your preference.
.. _fine-tuning-llms-profiling-omnitrace:
Omnitrace
---------
`Omnitrace <https://rocm.github.io/omnitrace>`_ is a comprehensive profiling and tracing tool for parallel applications,
including HPC and ML packages, written in C, C++, Fortran, HIP, OpenCL, and Python which execute on the CPU or CPU and
GPU. It is capable of gathering the performance information of functions through any combination of binary
instrumentation, call-stack sampling, user-defined regions, and Python interpreter hooks.
Omnitrace supports interactive visualization of comprehensive traces in the web browser in addition to high-level
summary profiles with ``mean/min/max/stddev`` statistics. Beyond runtime
information, Omnitrace supports the collection of system-level metrics such as CPU frequency, GPU temperature, and GPU
utilization. Process and thread level metrics such as memory usage, page faults, context switches, and numerous other
hardware counters are also included.
.. tip::
When analyzing the performance of an application, it is best not to assume you know where the performance
bottlenecks are and why they are happening. Omnitrace is the ideal tool for characterizing where optimization would
have the greatest impact on the end-to-end execution of the application and to discover what else is happening on the
system during a performance bottleneck.
.. figure:: ../../data/how-to/fine-tuning-llms/omnitrace-timeline.png
Omnitrace timeline trace example.
For detailed usage and examples of these tools, refer to the
`Introduction to profiling tools for AMD hardware <https://rocm.blogs.amd.com/software-tools-optimization/profilers/README.html>`_
developer blog.
Debugging with ROCm Debug Agent
===============================
ROCm Debug Agent (:doc:`ROCdebug-agent <rocr_debug_agent:index>`) is a library that can be loaded by the ROCm platform
runtime (:doc:`ROCr <rocr-runtime:index>`) to provide the following functionalities for all AMD accelerators and GPUs
supported by the ROCm Debugger API (:doc:`ROCdbgapi <rocdbgapi:index>`).
* Print the state of all AMD accelerator or GPU wavefronts that caused a queue error; for example, causing a memory
violation, executing an ``s_trap2``, or executing an illegal instruction.
* Print the state of all AMD accelerator or GPU wavefronts by sending a ``SIGQUIT`` signal to the process in question;
for example, by pressing ``Ctrl + \`` while the process is executing.
Debugging memory access faults
------------------------------
Identifying a faulting kernel is often enough to triage a memory access fault. To that end, the
`ROCm Debug Agent <https://github.com/ROCm/rocr_debug_agent/>`_ can trap a memory access fault and provide a dump of all
active wavefronts that caused the error as well as the name of the kernel. The
`AMD ROCm Debug Agent Library README <https://github.com/ROCm/rocr_debug_agent/blob/master/README.md>`_ provides full
instructions, but in brief:
* Compiling with ``-ggdb -O0`` is recommended but not required.
* ``HSA_TOOLS_LIB=/opt/rocm/lib/librocm-debug-agent.so.2 HSA_ENABLE_DEBUG=1 ./my_program``
When the debug agent traps the fault, it will produce an extremely
verbose output of all wavefront registers and memory content.
Importantly, it also prints something like:
.. code-block:: shell
Disassembly for function vector_add_assert_trap(int*, int*, int*):
code object:
file:////rocm-debug-agent/build/test/rocm-debug-agent-test#offset=14309&size=31336
loaded at: [0x7fd4f100c000-0x7fd4f100e070]
The kernel name and the code object file should be listed. In the
example above, the kernel name is ``vector_add_assert_trap``, but this might
also look like:
.. code-block:: shell
Disassembly for function memory:///path/to/codeobject#offset=1234&size=567:
In this case, it is an in-memory kernel that was generated at runtime.
Using the following environment variable, the debug agent will save all code objects to the current directory (use
``--save-code-objects=[DIR]`` to place them in another location). The code objects will be renamed from the URI format
with special characters replaced by ``_``.
.. code-block:: shell
ROCM_DEBUG_AGENT_OPTIONS="--all --save-code-objects"
Use the ``llvm-objdump`` command to disassemble the indicated in-memory
code object that has now been saved to disk. The name of the kernel is
often found inside the disassembled code object.
.. code-block:: shell
llvm-objdump --disassemble-all path/to/code-object.co
Consider turning off memory caching strategies both within the ROCm
stack and PyTorch where possible. This will give the debug agent the
best chance at finding the memory fault where it originates. Otherwise,
it could be masked by writing past the end of a cached block within a
larger allocation.
.. code-block:: shell
PYTORCH_NO_HIP_MEMORY_CACHING=1
HSA_DISABLE_FRAGMENT_ALLOCATOR=1

Some files were not shown because too many files have changed in this diff.