Compare commits

...

11 Commits

Author SHA1 Message Date
Adel Johar
e6d089c5fa Docs: remove system_debugging.md 2025-05-19 13:54:14 +02:00
Daniel Su
0d7846fbab Ex CI: enable rocPRIM sparse checkout (#4743) 2025-05-15 14:39:28 -04:00
Daniel Su
156917e15d Ex CI: set absolute cmakeSourceDir paths (#4741) 2025-05-14 11:03:57 -04:00
Daniel Su
d7a9280008 Ex CI: set cmakeSourceDir for all components that set cmakeBuildDir (#4738) 2025-05-13 17:15:54 -04:00
Daniel Su
c1825ba41c Ex CI: skip docker creation on gfx942 (#4735) 2025-05-13 17:05:02 -04:00
Peter Park
0a77e7b3a5 docs: Add system health check doc under ROCm for AI (#4736)
* add initial draft

* add to toc and install page

* update wording

* improve documentation structure

* resturcture and expand content

* add to training section

* add to conf.py article_pages

* Update docs/how-to/rocm-for-ai/includes/system-health-benchmarks.rst

Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>

* Update docs/how-to/rocm-for-ai/includes/system-health-benchmarks.rst

Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>

* update wordlist.txt

* Update docs/how-to/rocm-for-ai/includes/system-health-benchmarks.rst

Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>

* inference --> AI workloads

* udpate toc

* update article_pages in conf.py

* Update system validation notes in training docs

* fix links in prerequisite-system-validation

* wording

* add note

* consistency

* remove extra files

* fix links

* add links to training index page

---------

Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>
2025-05-13 15:54:48 -04:00
Daniel Su
a940f3f090 Ex CI: add sparse option to checkout template (#4701)
* Ex CI: add sparse option to checkout template

* replace Pipeline.Workspace with Agent.BuildDirectory for consistency
2025-05-13 14:46:48 -04:00
Daniel Su
95415d5e70 Ex CI: remove firstRenderDeviceAccess demand from all components (#4734) 2025-05-13 13:08:27 -04:00
Istvan Kiss
d1772b9ca3 Fix unsupported section structure on JAX (#4733) 2025-05-13 17:39:25 +02:00
Istvan Kiss
f65e1412df Fix compatibility list (#4731) 2025-05-13 16:26:36 +02:00
Istvan Kiss
ea1072b11d JAX compatibility page upate (#4727) 2025-05-08 19:31:13 +02:00
34 changed files with 348 additions and 379 deletions

View File

@@ -77,7 +77,8 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: clr
cmakeBuildDir: 'clr/build'
cmakeBuildDir: '$(Build.SourcesDirectory)/clr/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/clr'
extraBuildFlags: >-
-DHIP_COMMON_DIR=$(Build.SourcesDirectory)/HIP
-DHIP_PLATFORM=amd
@@ -138,7 +139,8 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: clr
cmakeBuildDir: 'clr/build'
cmakeBuildDir: '$(Build.SourcesDirectory)/clr/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/clr'
extraBuildFlags: >-
-DHIP_COMMON_DIR=$(Build.SourcesDirectory)/HIP
-DHIP_PLATFORM=nvidia

View File

@@ -73,6 +73,7 @@ jobs:
parameters:
componentName: upstream-llvm
cmakeBuildDir: $(Pipeline.Workspace)/llvm-project/llvm/build
cmakeSourceDir: $(Pipeline.Workspace)/llvm-project/llvm
installDir: $(Pipeline.Workspace)/llvm
extraBuildFlags: >-
-DCMAKE_BUILD_TYPE=Release

View File

@@ -118,6 +118,7 @@ jobs:
parameters:
componentName: extras
cmakeBuildDir: '$(Build.SourcesDirectory)/aomp-extras/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/aomp-extras'
installDir: '$(Build.BinariesDirectory)/llvm'
extraBuildFlags: >-
-DLLVM_DIR=$(Agent.BuildDirectory)/rocm/llvm
@@ -129,6 +130,7 @@ jobs:
parameters:
componentName: openmp
cmakeBuildDir: '$(Build.SourcesDirectory)/llvm-project/openmp/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/llvm-project/openmp'
installDir: '$(Build.BinariesDirectory)/llvm'
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Build.BinariesDirectory)"
@@ -155,6 +157,7 @@ jobs:
parameters:
componentName: offload
cmakeBuildDir: '$(Build.SourcesDirectory)/llvm-project/offload/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/llvm-project/offload'
installDir: '$(Build.BinariesDirectory)/llvm'
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Build.BinariesDirectory)"

View File

@@ -92,7 +92,8 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: external
cmakeBuildDir: 'deps/build'
cmakeBuildDir: '$(Build.SourcesDirectory)/deps/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/deps'
installDir: '$(Pipeline.Workspace)/deps-install'
extraBuildFlags: >-
-DBUILD_BOOST=OFF

View File

@@ -83,7 +83,8 @@ jobs:
-DROCM_LLVM_BACKWARD_COMPAT_LINK=$(Build.BinariesDirectory)/llvm
-DROCM_LLVM_BACKWARD_COMPAT_LINK_TARGET=./lib/llvm
-GNinja
cmakeBuildDir: 'llvm/build'
cmakeBuildDir: '$(Build.SourcesDirectory)/llvm/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/llvm'
installDir: '$(Build.BinariesDirectory)/llvm'
# use llvm-lit to run unit tests for llvm, clang, and lld
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
@@ -121,7 +122,8 @@ jobs:
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH="$(Build.SourcesDirectory)/llvm/build"
-DCMAKE_BUILD_TYPE=Release
cmakeBuildDir: 'amd/device-libs/build'
cmakeBuildDir: '$(Build.SourcesDirectory)/amd/device-libs/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/amd/device-libs'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: comgr
@@ -129,7 +131,8 @@ jobs:
-DCMAKE_PREFIX_PATH="$(Build.SourcesDirectory)/llvm/build;$(Build.SourcesDirectory)/amd/device-libs/build"
-DCOMGR_DISABLE_SPIRV=1
-DCMAKE_BUILD_TYPE=Release
cmakeBuildDir: 'amd/comgr/build'
cmakeBuildDir: '$(Build.SourcesDirectory)/amd/comgr/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/amd/comgr'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: comgr
@@ -142,7 +145,8 @@ jobs:
extraBuildFlags: >-
-DCMAKE_BUILD_TYPE=Release
-DHIPCC_BACKWARD_COMPATIBILITY=OFF
cmakeBuildDir: 'amd/hipcc/build'
cmakeBuildDir: '$(Build.SourcesDirectory)/amd/hipcc/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/amd/hipcc'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml

View File

@@ -105,6 +105,7 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
cmakeBuildDir: $(Build.SourcesDirectory)/grpc/build
cmakeSourceDir: $(Build.SourcesDirectory)/grpc
installDir: $(Build.SourcesDirectory)/bin
extraBuildFlags: >-
-DgRPC_INSTALL=ON

View File

@@ -125,6 +125,7 @@ jobs:
parameters:
componentName: PyBind11
cmakeBuildDir: '$(Build.SourcesDirectory)/pybind11/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/pybind11'
customInstallPath: false
installEnabled: false
extraBuildFlags: >-
@@ -141,6 +142,7 @@ jobs:
parameters:
componentName: RapidJSON
cmakeBuildDir: '$(Build.SourcesDirectory)/rapidjson/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/rapidjson'
customInstallPath: false
installEnabled: false
extraBuildFlags: >-
@@ -200,7 +202,6 @@ jobs:
value: $(Agent.BuildDirectory)/rocm/include/rocal
pool:
name: ${{ job.target }}_test_pool
demands: firstRenderDeviceAccess
workspace:
clean: all
steps:

View File

@@ -108,7 +108,6 @@ jobs:
value: $(Agent.BuildDirectory)/rocm
pool:
name: ${{ job.target }}_test_pool
demands: firstRenderDeviceAccess
workspace:
clean: all
steps:

View File

@@ -114,7 +114,6 @@ jobs:
value: $(Agent.BuildDirectory)/rocm
pool:
name: ${{ job.target }}_test_pool
demands: firstRenderDeviceAccess
workspace:
clean: all
steps:

View File

@@ -5,6 +5,12 @@ parameters:
- name: checkoutRef
type: string
default: ''
- name: sparseCheckout
type: boolean
default: false
- name: sparseCheckoutDir
type: string
default: ''
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -66,6 +72,8 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckout: ${{ parameters.sparseCheckout }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}

View File

@@ -168,7 +168,6 @@ jobs:
value: $(Agent.BuildDirectory)/rocm
pool:
name: ${{ job.target }}_test_pool
demands: firstRenderDeviceAccess
workspace:
clean: all
steps:

View File

@@ -105,6 +105,7 @@ jobs:
-DLAPACKE=OFF
-GNinja
cmakeBuildDir: '$(Build.SourcesDirectory)/lapack/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/lapack'
installDir: '$(Pipeline.Workspace)/deps-install'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:

View File

@@ -167,7 +167,6 @@ jobs:
value: $(Agent.BuildDirectory)/rocm
pool:
name: ${{ job.target }}_test_pool
demands: firstRenderDeviceAccess
workspace:
clean: all
steps:

View File

@@ -38,6 +38,7 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
cmakeBuildDir: $(Agent.BuildDirectory)/grpc/build
cmakeSourceDir: $(Agent.BuildDirectory)/grpc
extraBuildFlags: >-
-DgRPC_INSTALL=ON
-DgRPC_BUILD_TESTS=OFF

View File

@@ -38,6 +38,7 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
cmakeBuildDir: $(Agent.BuildDirectory)/googletest/build
cmakeSourceDir: $(Agent.BuildDirectory)/googletest
extraBuildFlags: >-
-DGTEST_FORCE_SHARED_CRT=ON
-DCMAKE_DEBUG_POSTFIX=d

View File

@@ -10,10 +10,10 @@ parameters:
default: ''
- name: cmakeBuildDir
type: string
default: 'build'
default: $(Agent.BuildDirectory)/s/build
- name: cmakeSourceDir
type: string
default: '..'
default: $(Agent.BuildDirectory)/s
- name: customBuildTarget
type: string
default: ''
@@ -46,7 +46,7 @@ steps:
${{ if eq(parameters.customInstallPath, true) }}:
cmakeArgs: -DCMAKE_INSTALL_PREFIX=${{ parameters.installDir }} ${{ parameters.extraBuildFlags }} ${{ parameters.cmakeSourceDir }}
${{ else }}:
cmakeArgs: ${{ parameters.extraBuildFlags }} ..
cmakeArgs: ${{ parameters.extraBuildFlags }} ${{ parameters.cmakeSourceDir }}
- ${{ if parameters.printDiskSpace }}:
- script: df -h
displayName: Disk space before build

View File

@@ -4,6 +4,12 @@ parameters:
- name: checkoutRepo
type: string
default: 'self'
- name: sparseCheckout
type: boolean
default: false
- name: sparseCheckoutDir
type: string
default: ''
# submodule download behaviour
# change to 'recursive' for repos with submodules
- name: submoduleBehaviour
@@ -15,3 +21,13 @@ steps:
clean: true
submodules: ${{ parameters.submoduleBehaviour }}
retryCountOnTaskFailure: 3
fetchFilter: blob:none
${{ if eq(parameters.sparseCheckout, true) }}:
sparseCheckoutDirectories: ${{ parameters.sparseCheckoutDir }}
path: sparse
- ${{ if eq(parameters.sparseCheckout, true) }}:
- task: Bash@3
displayName: Symlink sparse checkout
inputs:
targetType: inline
script: ln -s $(Agent.BuildDirectory)/sparse/${{ parameters.sparseCheckoutDir }} $(Agent.BuildDirectory)/s

View File

@@ -106,6 +106,7 @@ parameters:
type: object
default:
- gfx90a
- gfx942
steps:
# these steps should only be run if there was a failure or warning

View File

@@ -34,6 +34,7 @@ Autocast
BARs
BLAS
BMC
BabelStream
Blit
Blockwise
Bluefield
@@ -138,6 +139,7 @@ GDR
GDS
GEMM
GEMMs
GFLOPS
GFortran
GFXIP
Gemma
@@ -641,6 +643,7 @@ hipSPARSELt
hipTensor
hipamd
hipblas
hipcc
hipcub
hipfft
hipfort

View File

@@ -14,17 +14,18 @@ JAX provides a NumPy-like API, which combines automatic differentiation and the
Accelerated Linear Algebra (XLA) compiler to achieve high-performance machine
learning at scale.
JAX uses composable transformations of Python and NumPy through just-in-time (JIT) compilation,
automatic vectorization, and parallelization. To learn about JAX, including profiling and
optimizations, see the official `JAX documentation
JAX uses composable transformations of Python and NumPy through just-in-time
(JIT) compilation, automatic vectorization, and parallelization. To learn about
JAX, including profiling and optimizations, see the official `JAX documentation
<https://jax.readthedocs.io/en/latest/notebooks/quickstart.html>`_.
ROCm support for JAX is upstreamed and users can build the official source code with ROCm
support:
ROCm support for JAX is upstreamed, and users can build the official source code
with ROCm support:
- ROCm JAX release:
- Offers AMD-validated and community :ref:`Docker images <jax-docker-compat>` with ROCm and JAX pre-installed.
- Offers AMD-validated and community :ref:`Docker images <jax-docker-compat>`
with ROCm and JAX preinstalled.
- ROCm JAX repository: `ROCm/jax <https://github.com/ROCm/jax>`_
@@ -36,8 +37,8 @@ support:
- Official JAX repository: `jax-ml/jax <https://github.com/jax-ml/jax>`_
- See the `AMD GPU (Linux) installation section
<https://jax.readthedocs.io/en/latest/installation.html#amd-gpu-linux>`_ in the JAX
documentation.
<https://jax.readthedocs.io/en/latest/installation.html#amd-gpu-linux>`_ in
the JAX documentation.
.. note::
@@ -46,6 +47,44 @@ support:
`Community ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax-community>`_
follow upstream JAX releases and use the latest available ROCm version.
Use cases and recommendations
================================================================================
* The `nanoGPT in JAX <https://rocm.blogs.amd.com/artificial-intelligence/nanoGPT-JAX/README.html>`_
blog explores the implementation and training of a Generative Pre-trained
Transformer (GPT) model in JAX, inspired by Andrej Karpathy's PyTorch-based
nanoGPT. Comparing how essential GPT components—such as self-attention
mechanisms and optimizers—are realized in PyTorch and JAX also highlights
JAX's unique features.
* The `Optimize GPT Training: Enabling Mixed Precision Training in JAX using
ROCm on AMD GPUs <https://rocm.blogs.amd.com/artificial-intelligence/jax-mixed-precision/README.html>`_
blog post provides a comprehensive guide on enhancing the training efficiency
of GPT models by implementing mixed precision techniques in JAX, specifically
tailored for AMD GPUs utilizing the ROCm platform.
* The `Supercharging JAX with Triton Kernels on AMD GPUs <https://rocm.blogs.amd.com/artificial-intelligence/jax-triton/README.html>`_
blog demonstrates how to develop a custom fused dropout-activation kernel for
matrices using Triton, integrate it with JAX, and benchmark its performance
using ROCm.
* The `Distributed fine-tuning with JAX on AMD GPUs <https://rocm.blogs.amd.com/artificial-intelligence/distributed-sft-jax/README.html>`_
outlines the process of fine-tuning a Bidirectional Encoder Representations
from Transformers (BERT)-based large language model (LLM) using JAX for a text
classification task. The blog post discusses techniques for parallelizing the
fine-tuning across multiple AMD GPUs and assesses the model's performance on a
holdout dataset. During the fine-tuning, a BERT-base-cased transformer model
and the General Language Understanding Evaluation (GLUE) benchmark dataset were
used on a multi-GPU setup.
* The `MI300X workload optimization guide <https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html>`_
provides detailed guidance on optimizing workloads for the AMD Instinct MI300X
accelerator using ROCm. The page is aimed at helping users achieve optimal
performance for deep learning and other high-performance computing tasks on
the MI300X GPU.
For more use cases and recommendations, see `ROCm JAX blog posts <https://rocm.blogs.amd.com/blog/tag/jax.html>`_.
.. _jax-docker-compat:
Docker image compatibility
@@ -57,7 +96,7 @@ Docker image compatibility
AMD validates and publishes ready-made `ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax>`_
with ROCm backends on Docker Hub. The following Docker image tags and
associated inventories are validated for
associated inventories represent the latest JAX version from the official Docker Hub and are validated for
`ROCm 6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`_. Click the |docker-icon|
icon to view the image on Docker Hub.
@@ -121,13 +160,12 @@ associated inventories are tested for `ROCm 6.3.2 <https://repo.radeon.com/rocm/
- Ubuntu 22.04
- `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
Critical ROCm libraries for JAX
Key ROCm libraries for JAX
================================================================================
The functionality of JAX with ROCm is determined by its underlying library
dependencies. These critical ROCm components affect the capabilities,
performance, and feature set available to developers. The versions described
are available in ROCm :version:`rocm_version`.
JAX functionality on ROCm is determined by its underlying library
dependencies. These ROCm components affect the capabilities, performance, and
feature set available to developers.
.. list-table::
:header-rows: 1
@@ -215,10 +253,10 @@ are available in ROCm :version:`rocm_version`.
distributed training, which involves parallel reductions or
operations like ``jax.numpy.cumsum`` can use rocThrust.
Supported and unsupported features
Supported features
===============================================================================
The following table maps GPU-accelerated JAX modules to their supported
The following table maps the public JAX API modules to their supported
ROCm and JAX versions.
.. list-table::
@@ -226,8 +264,8 @@ ROCm and JAX versions.
* - Module
- Description
- Since JAX
- Since ROCm
- As of JAX
- As of ROCm
* - ``jax.numpy``
- Implements the NumPy API, using the primitives in ``jax.lax``.
- 0.1.56
@@ -255,21 +293,11 @@ ROCm and JAX versions.
devices.
- 0.3.20
- 5.1.0
* - ``jax.dlpack``
- For exchanging tensor data between JAX and other libraries that support the
DLPack standard.
- 0.1.57
- 5.0.0
* - ``jax.distributed``
- Enables the scaling of computations across multiple devices on a single
machine or across multiple machines.
- 0.1.74
- 5.0.0
* - ``jax.dtypes``
- Provides utilities for working with and managing data types in JAX
arrays and computations.
- 0.1.66
- 5.0.0
* - ``jax.image``
- Contains image manipulation functions like resize, scale and translation.
- 0.1.57
@@ -283,27 +311,10 @@ ROCm and JAX versions.
array.
- 0.1.57
- 5.0.0
* - ``jax.profiler``
- Contains JAXs tracing and time profiling features.
- 0.1.57
- 5.0.0
* - ``jax.stages``
- Contains interfaces to stages of the compiled execution process.
- 0.3.4
- 5.0.0
* - ``jax.tree``
- Provides utilities for working with tree-like container data structures.
- 0.4.26
- 5.6.0
* - ``jax.tree_util``
- Provides utilities for working with nested data structures, or
``pytrees``.
- 0.1.65
- 5.0.0
* - ``jax.typing``
- Provides JAX-specific static type annotations.
- 0.3.18
- 5.1.0
* - ``jax.extend``
- Provides modules for access to JAX internal machinery. The
``jax.extend`` module defines a library view of some of JAX's internal
@@ -339,8 +350,8 @@ A SciPy-like API for scientific computing.
:header-rows: 1
* - Module
- Since JAX
- Since ROCm
- As of JAX
- As of ROCm
* - ``jax.scipy.cluster``
- 0.3.11
- 5.1.0
@@ -385,8 +396,8 @@ jax.scipy.stats module
:header-rows: 1
* - Module
- Since JAX
- Since ROCm
- As of JAX
- As of ROCm
* - ``jax.scipy.stats.bernoulli``
- 0.1.56
- 5.0.0
@@ -469,8 +480,8 @@ Modules for JAX extensions.
:header-rows: 1
* - Module
- Since JAX
- Since ROCm
- As of JAX
- As of ROCm
* - ``jax.extend.ffi``
- 0.4.30
- 6.0.0
@@ -484,190 +495,25 @@ Modules for JAX extensions.
- 0.4.15
- 5.5.0
jax.experimental module
-------------------------------------------------------------------------------
Experimental modules and APIs.
.. list-table::
:header-rows: 1
* - Module
- Since JAX
- Since ROCm
* - ``jax.experimental.checkify``
- 0.1.75
- 5.0.0
* - ``jax.experimental.compilation_cache.compilation_cache``
- 0.1.68
- 5.0.0
* - ``jax.experimental.custom_partitioning``
- 0.4.0
- 5.3.0
* - ``jax.experimental.jet``
- 0.1.56
- 5.0.0
* - ``jax.experimental.key_reuse``
- 0.4.26
- 5.6.0
* - ``jax.experimental.mesh_utils``
- 0.1.76
- 5.0.0
* - ``jax.experimental.multihost_utils``
- 0.3.2
- 5.0.0
* - ``jax.experimental.pallas``
- 0.4.15
- 5.5.0
* - ``jax.experimental.pjit``
- 0.1.61
- 5.0.0
* - ``jax.experimental.serialize_executable``
- 0.4.0
- 5.3.0
* - ``jax.experimental.shard_map``
- 0.4.3
- 5.3.0
* - ``jax.experimental.sparse``
- 0.1.75
- 5.0.0
.. list-table::
:header-rows: 1
* - API
- Since JAX
- Since ROCm
* - ``jax.experimental.enable_x64``
- 0.1.60
- 5.0.0
* - ``jax.experimental.disable_x64``
- 0.1.60
- 5.0.0
jax.experimental.pallas module
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Module for Pallas, a JAX extension for custom kernels.
.. list-table::
:header-rows: 1
* - Module
- Since JAX
- Since ROCm
* - ``jax.experimental.pallas.mosaic_gpu``
- 0.4.31
- 6.1.3
* - ``jax.experimental.pallas.tpu``
- 0.4.15
- 5.5.0
* - ``jax.experimental.pallas.triton``
- 0.4.32
- 6.1.3
jax.experimental.sparse module
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Experimental support for sparse matrix operations.
.. list-table::
:header-rows: 1
* - Module
- Since JAX
- Since ROCm
* - ``jax.experimental.sparse.linalg``
- 0.3.15
- 5.2.0
* - ``jax.experimental.sparse.sparsify``
- 0.3.25
- ❌
.. list-table::
:header-rows: 1
* - ``sparse`` data structure API
- Since JAX
- Since ROCm
* - ``jax.experimental.sparse.BCOO``
- 0.1.72
- 5.0.0
* - ``jax.experimental.sparse.BCSR``
- 0.3.20
- 5.1.0
* - ``jax.experimental.sparse.CSR``
- 0.1.75
- 5.0.0
* - ``jax.experimental.sparse.NM``
- 0.4.27
- 5.6.0
* - ``jax.experimental.sparse.COO``
- 0.1.75
- 5.0.0
Unsupported JAX features
------------------------
===============================================================================
The following are GPU-accelerated JAX features not currently supported by
ROCm.
The following GPU-accelerated JAX features are not supported by ROCm for
the listed supported JAX versions.
.. list-table::
:header-rows: 1
* - Feature
- Description
- Since JAX
* - Mixed Precision with TF32
- Mixed precision with TF32 is used for matrix multiplications,
convolutions, and other linear algebra operations, particularly in
deep learning workloads like CNNs and transformers.
- 0.2.25
* - RNN support
- Currently only LSTM with double bias is supported with float32 input
and weight.
- 0.3.25
* - XLA int4 support
- 4-bit integer (int4) precision in the XLA compiler.
- 0.4.0
* - ``jax.experimental.sparsify``
- Converts a dense matrix to a sparse matrix representation.
- Experimental
Use cases and recommendations
================================================================================
* The `nanoGPT in JAX <https://rocm.blogs.amd.com/artificial-intelligence/nanoGPT-JAX/README.html>`_
blog explores the implementation and training of a Generative Pre-trained
Transformer (GPT) model in JAX, inspired by Andrej Karpathys PyTorch-based
nanoGPT. By comparing how essential GPT components—such as self-attention
mechanisms and optimizers—are realized in PyTorch and JAX, also highlight
JAXs unique features.
* The `Optimize GPT Training: Enabling Mixed Precision Training in JAX using
ROCm on AMD GPUs <https://rocm.blogs.amd.com/artificial-intelligence/jax-mixed-precision/README.html>`_
blog post provides a comprehensive guide on enhancing the training efficiency
of GPT models by implementing mixed precision techniques in JAX, specifically
tailored for AMD GPUs utilizing the ROCm platform.
* The `Supercharging JAX with Triton Kernels on AMD GPUs <https://rocm.blogs.amd.com/artificial-intelligence/jax-triton/README.html>`_
blog demonstrates how to develop a custom fused dropout-activation kernel for
matrices using Triton, integrate it with JAX, and benchmark its performance
using ROCm.
* The `Distributed fine-tuning with JAX on AMD GPUs <https://rocm.blogs.amd.com/artificial-intelligence/distributed-sft-jax/README.html>`_
outlines the process of fine-tuning a Bidirectional Encoder Representations
from Transformers (BERT)-based large language model (LLM) using JAX for a text
classification task. The blog post discuss techniques for parallelizing the
fine-tuning across multiple AMD GPUs and assess the model's performance on a
holdout dataset. During the fine-tuning, a BERT-base-cased transformer model
and the General Language Understanding Evaluation (GLUE) benchmark dataset was
used on a multi-GPU setup.
* The `MI300X workload optimization guide <https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html>`_
provides detailed guidance on optimizing workloads for the AMD Instinct MI300X
accelerator using ROCm. The page is aimed at helping users achieve optimal
performance for deep learning and other high-performance computing tasks on
the MI300X GPU.
For more use cases and recommendations, see `ROCm JAX blog posts <https://rocm.blogs.amd.com/blog/tag/jax.html>`_.
* - MOSAIC (GPU)
- Mosaic is a library of kernel-building abstractions for JAX's Pallas system

View File

@@ -51,6 +51,8 @@ article_pages = [
{"file": "how-to/deep-learning-rocm", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/index", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/install", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/system-health-check", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/index", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/train-a-model", "os": ["linux"]},
@@ -67,7 +69,6 @@ article_pages = [
{"file": "how-to/rocm-for-ai/fine-tuning/multi-gpu-fine-tuning-and-inference", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/index", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/install", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/hugging-face-models", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/llm-inference-frameworks", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/vllm-benchmark", "os": ["linux"]},

View File

@@ -62,47 +62,52 @@ PyTorch inference performance testing
{% endfor %}
{% endfor %}
Getting started
===============
System validation
=================
Use the following procedures to reproduce the benchmark results on an
MI300X series accelerator with the prebuilt PyTorch Docker image.
Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.
.. _pytorch-benchmark-get-started:
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
might hang until the periodic balancing is finalized. For more information,
see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
1. Disable NUMA auto-balancing.
.. code-block:: shell
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
might hang until the periodic balancing is finalized. For more information,
see :ref:`AMD Instinct MI300X system optimization <mi300x-disable-numa>`.
# disable automatic NUMA balancing
sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
# check if NUMA balancing is disabled (returns 0 if disabled)
cat /proc/sys/kernel/numa_balancing
0
.. code-block:: shell
To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
# disable automatic NUMA balancing
sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
# check if NUMA balancing is disabled (returns 0 if disabled)
cat /proc/sys/kernel/numa_balancing
0
Pull the Docker image
=====================
.. container:: model-doc pyt_chai1_inference
2. Use the following command to pull the `ROCm PyTorch Docker image <https://hub.docker.com/layers/rocm/pytorch/rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0_triton_llvm_reg_issue/images/sha256-b736a4239ab38a9d0e448af6d4adca83b117debed00bfbe33846f99c4540f79b>`_ from Docker Hub.
Use the following command to pull the `ROCm PyTorch Docker image <https://hub.docker.com/layers/rocm/pytorch/rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0_triton_llvm_reg_issue/images/sha256-b736a4239ab38a9d0e448af6d4adca83b117debed00bfbe33846f99c4540f79b>`_ from Docker Hub.
.. code-block:: shell
.. code-block:: shell
docker pull rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0_triton_llvm_reg_issue
docker pull rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0_triton_llvm_reg_issue
.. note::
.. note::
The Chai-1 benchmark uses a specifically selected Docker image using ROCm 6.2.3 and PyTorch 2.3.0 to address an accuracy issue.
The Chai-1 benchmark uses a specifically selected Docker image using ROCm 6.2.3 and PyTorch 2.3.0 to address an accuracy issue.
.. container:: model-doc pyt_clip_inference
2. Use the following command to pull the `ROCm PyTorch Docker image <https://hub.docker.com/layers/rocm/pytorch/latest/images/sha256-05b55983e5154f46e7441897d0908d79877370adca4d1fff4899d9539d6c4969>`_ from Docker Hub.
Use the following command to pull the `ROCm PyTorch Docker image <https://hub.docker.com/layers/rocm/pytorch/latest/images/sha256-05b55983e5154f46e7441897d0908d79877370adca4d1fff4899d9539d6c4969>`_ from Docker Hub.
.. code-block:: shell
.. code-block:: shell
docker pull rocm/pytorch:latest
docker pull rocm/pytorch:latest
.. _pytorch-benchmark-get-started:
Benchmarking
============

View File

@@ -111,35 +111,37 @@ vLLM inference performance testing
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
see the developer's guide at `<https://github.com/ROCm/vllm/blob/main/docs/dev-docker/README.md>`__.
Getting started
===============
System validation
=================
Use the following procedures to reproduce the benchmark results on an
MI300X accelerator with the prebuilt vLLM Docker image.
Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.
.. _vllm-benchmark-get-started:
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
might hang until the periodic balancing is finalized. For more information,
see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
1. Disable NUMA auto-balancing.
.. code-block:: shell
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
might hang until the periodic balancing is finalized. For more information,
see :ref:`AMD Instinct MI300X system optimization <mi300x-disable-numa>`.
# disable automatic NUMA balancing
sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
# check if NUMA balancing is disabled (returns 0 if disabled)
cat /proc/sys/kernel/numa_balancing
0
.. code-block:: shell
To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
# disable automatic NUMA balancing
sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
# check if NUMA balancing is disabled (returns 0 if disabled)
cat /proc/sys/kernel/numa_balancing
0
Pull the Docker image
=====================
2. Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
Use the following command to pull the Docker image from Docker Hub.
Use the following command to pull the Docker image from Docker Hub.
.. code-block:: shell
.. code-block:: shell
docker pull {{ unified_docker.pull_tag }}
docker pull {{ unified_docker.pull_tag }}
Benchmarking
============

View File

@@ -30,7 +30,7 @@ ROCm supports multiple :doc:`installation methods <rocm-install-on-linux:install
* :doc:`Using the AMDGPU installer <rocm-install-on-linux:install/amdgpu-install>`
* :ref:`Multi-version installation <rocm-install-on-linux:installation-types>`.
* :ref:`Multi-version installation <rocm-install-on-linux:installation-types>`
.. grid:: 1
@@ -59,4 +59,8 @@ images with the framework pre-installed.
* :doc:`JAX for ROCm <rocm-install-on-linux:install/3rd-party/jax-install>`
The sections that follow in :doc:`Training a model <../training/train-a-model>` are geared for a ROCm with PyTorch installation.
Next steps
==========
After installing ROCm and your desired ML libraries -- and before running AI workloads -- run system health benchmarks
to verify that your AMD hardware is performing optimally. See :doc:`system-health-check` to get started.

View File

@@ -0,0 +1,104 @@
.. meta::
:description: System health checks with RVS, RCCL tests, BabelStream, and TransferBench to validate AMD hardware performance running AI workloads.
:keywords: gpu, accelerator, system, health, validation, bench, perf, performance, rvs, rccl, babel, mi300x, mi325x, flops, bandwidth, rbt, training, inference
.. _rocm-for-ai-system-health-bench:
************************
System health benchmarks
************************
Before running AI workloads, it is important to validate that your AMD hardware is configured correctly and is performing optimally. This topic outlines several system health benchmarks you can use to test key aspects like GPU compute capabilities (FLOPS), memory bandwidth, and interconnect performance. Many of these tests are part of the ROCm Validation Suite (RVS).
ROCm Validation Suite (RVS) tests
=================================
RVS provides a collection of tests, benchmarks, and qualification tools, each
targeting a specific subsystem of the system under test. It includes tests for
GPU stress and memory bandwidth.
.. _healthcheck-install-rvs:
Install ROCm Validation Suite
-----------------------------
To get started, install RVS. For example, on an Ubuntu system with ROCm already
installed, run the following command:
.. code-block:: shell
sudo apt update
sudo apt install rocm-validation-suite
See the `ROCm Validation Suite installation instructions <https://rocm.docs.amd.com/projects/ROCmValidationSuite/en/latest/install/installation.html>`_,
and `System validation tests <https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/system-validation.html#system-validation-tests>`_
in the Instinct documentation for more detailed instructions.
Benchmark, stress, and qualification tests
------------------------------------------
The GPU stress test runs various GEMM computations as workloads to stress the GPU FLOPS performance and check whether it
meets the configured target GFLOPS.
Run the benchmark, stress, and qualification tests included with RVS. See the `Benchmark, stress, qualification
<https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/system-validation.html#benchmark-stress-qualification>`_
section of the Instinct documentation for usage instructions.
BabelStream test
----------------
BabelStream is a synthetic GPU benchmark based on the STREAM benchmark for
CPUs, measuring memory transfer rates to and from global device memory.
BabelStream tests are included with the RVS package as part of the `BABEL module
<https://rocm.docs.amd.com/projects/ROCmValidationSuite/en/latest/conceptual/rvs-modules.html#babel-benchmark-test-babel-module>`_.
For more information, see `Performance benchmarking
<https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/performance-bench.html#babelstream-benchmarking-results>`_
in the Instinct documentation.
RCCL tests
==========
The ROCm Communication Collectives Library (RCCL) enables efficient multi-GPU
communication. The `<https://github.com/ROCm/rccl-tests>`__ suite benchmarks
the performance and verifies the correctness of these collective operations.
This helps ensure optimal scaling for multi-accelerator tasks.
1. To get started, build RCCL-tests using the official instructions in the README at
`<https://github.com/ROCm/rccl-tests?tab=readme-ov-file#build>`__ or use the
following commands:
.. code-block:: shell
git clone https://github.com/ROCm/rccl-tests.git
cd rccl-tests
make
2. Run the suggested RCCL tests -- see `RCCL benchmarking
<https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/performance-bench.html#rccl-benchmarking-results>`_
in the Instinct performance benchmarking documentation for instructions.
TransferBench test
==================
TransferBench is a standalone utility for benchmarking simultaneous data
transfer performance between various devices in the system, including
CPU-to-GPU and GPU-to-GPU (peer-to-peer). This helps identify potential
bottlenecks in data movement between the host system and the GPUs, or between
GPUs, which can impact end-to-end latency.
.. _healthcheck-install-transferbench:
1. To get started, use the instructions in the `TransferBench documentation
<https://rocm.docs.amd.com/projects/TransferBench/en/latest/install/install.html#install-transferbench>`_
or use the following commands:
.. code:: shell
git clone https://github.com/ROCm/TransferBench.git
cd TransferBench
CC=hipcc make
2. Run the suggested TransferBench tests -- see `TransferBench benchmarking
<https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/performance-bench.html#transferbench-benchmarking-results>`_
in the Instinct performance benchmarking documentation for instructions.

View File

@@ -79,11 +79,18 @@ across different input sequences. Support for packed input format is planned for
System validation
=================
If you have already validated your system settings, including NUMA
auto-balancing, skip this step. Otherwise, complete the :ref:`system validation
and optimization steps <train-a-model-system-validation>` to set up your system
Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
before starting training.
To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
Environment setup
=================
@@ -175,8 +182,8 @@ with RDMA, skip ahead to :ref:`amd-maxtext-download-docker`.
.. _amd-maxtext-download-docker:
Download the Docker image
-------------------------
Pull the Docker image
---------------------
1. Use the following command to pull the Docker image from Docker Hub.

View File

@@ -103,11 +103,18 @@ popular AI models.
System validation
=================
If you have already validated your system settings, including NUMA
auto-balancing, skip this step. Otherwise, complete the :ref:`system validation
and optimization steps <train-a-model-system-validation>` to set up your system
Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
before starting training.
To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
.. _mi300x-amd-megatron-lm-training:
Environment setup

View File

@@ -34,11 +34,18 @@ for MPT-30B with access to detailed logs and performance metrics.
System validation
=================
If you have already validated your system settings, including NUMA
auto-balancing, skip this step. Otherwise, complete the :ref:`system validation
and optimization steps <train-a-model-system-validation>` to set up your system
Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
before starting training.
To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
Getting started
===============

View File

@@ -77,11 +77,18 @@ popular AI models.
System validation
=================
If you have already validated your system settings, including NUMA
auto-balancing, skip this step. Otherwise, complete the :ref:`system validation
and optimization steps <train-a-model-system-validation>` to set up your system
Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
before starting training.
To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
This Docker image is optimized for specific model configurations outlined
below. Performance can vary for other training workloads, as AMD
doesn't validate configurations and run conditions outside those described.

View File

@@ -21,8 +21,12 @@ In this guide, you'll learn about:
- Training a model
- :doc:`Train a model with Megatron-LM <benchmark-docker/megatron-lm>`
- :doc:`With Megatron-LM <benchmark-docker/megatron-lm>`
- :doc:`Train a model with PyTorch <benchmark-docker/pytorch-training>`
- :doc:`With PyTorch <benchmark-docker/pytorch-training>`
- :doc:`With JAX MaxText <benchmark-docker/jax-maxtext>`
- :doc:`With LLM Foundry <benchmark-docker/mpt-llm-foundry>`
- :doc:`Scaling model training <scale-model-training>`

View File

@@ -5,12 +5,13 @@
:keywords: ROCm, AI, LLM, train, megatron, Llama, tutorial, docker, torch, pytorch, jax
.. _train-a-model-system-validation:
.. _rocm-for-ai-system-optimization:
**********************************************
Prerequisite system validation before training
**********************************************
**********************************************************
Prerequisite system validation before running AI workloads
**********************************************************
Complete the following system validation and optimization steps to set up your system before starting training.
Complete the following system validation and optimization steps to set up your system before starting training and inference.
Disable NUMA auto-balancing
---------------------------
@@ -26,7 +27,8 @@ the output is ``1``, run the following command to disable NUMA auto-balancing.
sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
See :ref:`mi300x-disable-numa` for more information.
See `Disable NUMA auto-balancing <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#disable-numa-auto-balancing>`_
in the Instinct documentation for more information.
Hardware verification with ROCm
-------------------------------
@@ -42,7 +44,8 @@ Run the command:
rocm-smi --setperfdeterminism 1900
See :ref:`mi300x-hardware-verification-with-rocm` for more information.
See `Hardware verification with ROCm <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#hardware-verification-with-rocm>`_
in the Instinct documentation for more information.
RCCL Bandwidth Test for multi-node setups
-----------------------------------------

View File

@@ -1,68 +0,0 @@
---
myst:
html_meta:
"description": "Learn more about common system-level debugging measures for ROCm."
"keywords": "env, var, sys, PCIe, troubleshooting, admin, error"
---
# System debugging
## ROCm language and system-level debug, flags, and environment variables
Kernel options to avoid the Ethernet port getting renamed every time you change graphics cards: `net.ifnames=0 biosdevname=0`
## ROCr error code
* 2 Invalid Dimension
* 4 Invalid Group Memory
* 8 Invalid (or Null) Code
* 32 Invalid Format
* 64 Group is too large
* 128 Out of VGPRs
* 0x80000000 Debug Options
## Command to dump firmware version and get Linux kernel version
`sudo cat /sys/kernel/debug/dri/1/amdgpu_firmware_info`
`uname -a`
## Debug flags
When developing or debugging the base ROCm driver, you can enable debug message printing from `libhsakmt.so` by setting the `HSAKMT_DEBUG_LEVEL` environment variable. Available debug levels are 3-7. The higher the level you set, the more messages are printed.
* `export HSAKMT_DEBUG_LEVEL=3` : Only pr_err() prints.
* `export HSAKMT_DEBUG_LEVEL=4` : pr_err() and pr_warn() print.
* `export HSAKMT_DEBUG_LEVEL=5` : “notice” is not currently implemented. Setting the level to 5 is the same as setting it to 4.
* `export HSAKMT_DEBUG_LEVEL=6` : pr_err(), pr_warn(), and pr_info() print.
* `export HSAKMT_DEBUG_LEVEL=7` : Everything, including pr_debug(), prints.
## ROCr level environment variables for debug
`HSA_ENABLE_SDMA=0`
`HSA_ENABLE_INTERRUPT=0`
`HSA_SVM_GUARD_PAGES=0`
`HSA_DISABLE_CACHE=1`
## Turn off page retry on GFX9/Vega devices
`sudo -s`
`echo 1 > /sys/module/amdkfd/parameters/noretry`
## HIP environment variables 3.x
### OpenCL debug flags
`AMD_OCL_WAIT_COMMAND=1 (0 = OFF, 1 = On)`
## PCIe-debug
For information on how to debug and profile HIP applications, see {doc}`hip:how-to/debugging`

View File

@@ -42,7 +42,6 @@ ROCm documentation is organized into the following categories:
* [Use ROCm for HPC](./how-to/rocm-for-hpc/index.rst)
* [System optimization](./how-to/system-optimization/index.rst)
* [AMD Instinct MI300X performance validation and tuning](./how-to/tuning-guides/mi300x/index.rst)
* [System debugging](./how-to/system-debugging.md)
* [Use advanced compiler features](./conceptual/compiler-topics.md)
* [Set the number of CUs](./how-to/setting-cus)
* [Troubleshoot BAR access limitation](./how-to/Bar-Memory.rst)

View File

@@ -36,6 +36,10 @@ subtrees:
title: Use ROCm for AI
subtrees:
- entries:
- file: how-to/rocm-for-ai/install.rst
title: Installation
- file: how-to/rocm-for-ai/system-health-check.rst
title: System health benchmarks
- file: how-to/rocm-for-ai/training/index.rst
title: Training
subtrees:
@@ -70,8 +74,6 @@ subtrees:
title: Inference
subtrees:
- entries:
- file: how-to/rocm-for-ai/inference/install.rst
title: Installation
- file: how-to/rocm-for-ai/inference/hugging-face-models.rst
title: Run models from Hugging Face
- file: how-to/rocm-for-ai/inference/llm-inference-frameworks.rst
@@ -107,7 +109,6 @@ subtrees:
title: System optimization
- file: how-to/gpu-performance/mi300x.rst
title: AMD Instinct MI300X performance guides
- file: how-to/system-debugging.md
- file: conceptual/compiler-topics.md
title: Use advanced compiler features
subtrees:
@@ -121,7 +122,7 @@ subtrees:
- file: how-to/setting-cus
title: Set the number of CUs
- file: how-to/Bar-Memory.rst
title: Troubleshoot BAR access limitation
title: Troubleshoot BAR access limitation
- url: https://github.com/amd/rocm-examples
title: ROCm examples