mirror of
https://github.com/ROCm/ROCm.git
synced 2026-01-10 15:18:11 -05:00
Compare commits
11 Commits
jax_comp_u
...
docs_remov
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
e6d089c5fa | ||
|
|
0d7846fbab | ||
|
|
156917e15d | ||
|
|
d7a9280008 | ||
|
|
c1825ba41c | ||
|
|
0a77e7b3a5 | ||
|
|
a940f3f090 | ||
|
|
95415d5e70 | ||
|
|
d1772b9ca3 | ||
|
|
f65e1412df | ||
|
|
ea1072b11d |
@@ -77,7 +77,8 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
|
||||
parameters:
|
||||
componentName: clr
|
||||
cmakeBuildDir: 'clr/build'
|
||||
cmakeBuildDir: '$(Build.SourcesDirectory)/clr/build'
|
||||
cmakeSourceDir: '$(Build.SourcesDirectory)/clr'
|
||||
extraBuildFlags: >-
|
||||
-DHIP_COMMON_DIR=$(Build.SourcesDirectory)/HIP
|
||||
-DHIP_PLATFORM=amd
|
||||
@@ -138,7 +139,8 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
|
||||
parameters:
|
||||
componentName: clr
|
||||
cmakeBuildDir: 'clr/build'
|
||||
cmakeBuildDir: '$(Build.SourcesDirectory)/clr/build'
|
||||
cmakeSourceDir: '$(Build.SourcesDirectory)/clr'
|
||||
extraBuildFlags: >-
|
||||
-DHIP_COMMON_DIR=$(Build.SourcesDirectory)/HIP
|
||||
-DHIP_PLATFORM=nvidia
|
||||
|
||||
@@ -73,6 +73,7 @@ jobs:
|
||||
parameters:
|
||||
componentName: upstream-llvm
|
||||
cmakeBuildDir: $(Pipeline.Workspace)/llvm-project/llvm/build
|
||||
cmakeSourceDir: $(Pipeline.Workspace)/llvm-project/llvm
|
||||
installDir: $(Pipeline.Workspace)/llvm
|
||||
extraBuildFlags: >-
|
||||
-DCMAKE_BUILD_TYPE=Release
|
||||
|
||||
@@ -118,6 +118,7 @@ jobs:
|
||||
parameters:
|
||||
componentName: extras
|
||||
cmakeBuildDir: '$(Build.SourcesDirectory)/aomp-extras/build'
|
||||
cmakeSourceDir: '$(Build.SourcesDirectory)/aomp-extras'
|
||||
installDir: '$(Build.BinariesDirectory)/llvm'
|
||||
extraBuildFlags: >-
|
||||
-DLLVM_DIR=$(Agent.BuildDirectory)/rocm/llvm
|
||||
@@ -129,6 +130,7 @@ jobs:
|
||||
parameters:
|
||||
componentName: openmp
|
||||
cmakeBuildDir: '$(Build.SourcesDirectory)/llvm-project/openmp/build'
|
||||
cmakeSourceDir: '$(Build.SourcesDirectory)/llvm-project/openmp'
|
||||
installDir: '$(Build.BinariesDirectory)/llvm'
|
||||
extraBuildFlags: >-
|
||||
-DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Build.BinariesDirectory)"
|
||||
@@ -155,6 +157,7 @@ jobs:
|
||||
parameters:
|
||||
componentName: offload
|
||||
cmakeBuildDir: '$(Build.SourcesDirectory)/llvm-project/offload/build'
|
||||
cmakeSourceDir: '$(Build.SourcesDirectory)/llvm-project/offload'
|
||||
installDir: '$(Build.BinariesDirectory)/llvm'
|
||||
extraBuildFlags: >-
|
||||
-DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Build.BinariesDirectory)"
|
||||
|
||||
@@ -92,7 +92,8 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
|
||||
parameters:
|
||||
componentName: external
|
||||
cmakeBuildDir: 'deps/build'
|
||||
cmakeBuildDir: '$(Build.SourcesDirectory)/deps/build'
|
||||
cmakeSourceDir: '$(Build.SourcesDirectory)/deps'
|
||||
installDir: '$(Pipeline.Workspace)/deps-install'
|
||||
extraBuildFlags: >-
|
||||
-DBUILD_BOOST=OFF
|
||||
|
||||
@@ -83,7 +83,8 @@ jobs:
|
||||
-DROCM_LLVM_BACKWARD_COMPAT_LINK=$(Build.BinariesDirectory)/llvm
|
||||
-DROCM_LLVM_BACKWARD_COMPAT_LINK_TARGET=./lib/llvm
|
||||
-GNinja
|
||||
cmakeBuildDir: 'llvm/build'
|
||||
cmakeBuildDir: '$(Build.SourcesDirectory)/llvm/build'
|
||||
cmakeSourceDir: '$(Build.SourcesDirectory)/llvm'
|
||||
installDir: '$(Build.BinariesDirectory)/llvm'
|
||||
# use llvm-lit to run unit tests for llvm, clang, and lld
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
|
||||
@@ -121,7 +122,8 @@ jobs:
|
||||
extraBuildFlags: >-
|
||||
-DCMAKE_PREFIX_PATH="$(Build.SourcesDirectory)/llvm/build"
|
||||
-DCMAKE_BUILD_TYPE=Release
|
||||
cmakeBuildDir: 'amd/device-libs/build'
|
||||
cmakeBuildDir: '$(Build.SourcesDirectory)/amd/device-libs/build'
|
||||
cmakeSourceDir: '$(Build.SourcesDirectory)/amd/device-libs'
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
|
||||
parameters:
|
||||
componentName: comgr
|
||||
@@ -129,7 +131,8 @@ jobs:
|
||||
-DCMAKE_PREFIX_PATH="$(Build.SourcesDirectory)/llvm/build;$(Build.SourcesDirectory)/amd/device-libs/build"
|
||||
-DCOMGR_DISABLE_SPIRV=1
|
||||
-DCMAKE_BUILD_TYPE=Release
|
||||
cmakeBuildDir: 'amd/comgr/build'
|
||||
cmakeBuildDir: '$(Build.SourcesDirectory)/amd/comgr/build'
|
||||
cmakeSourceDir: '$(Build.SourcesDirectory)/amd/comgr'
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
|
||||
parameters:
|
||||
componentName: comgr
|
||||
@@ -142,7 +145,8 @@ jobs:
|
||||
extraBuildFlags: >-
|
||||
-DCMAKE_BUILD_TYPE=Release
|
||||
-DHIPCC_BACKWARD_COMPATIBILITY=OFF
|
||||
cmakeBuildDir: 'amd/hipcc/build'
|
||||
cmakeBuildDir: '$(Build.SourcesDirectory)/amd/hipcc/build'
|
||||
cmakeSourceDir: '$(Build.SourcesDirectory)/amd/hipcc'
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
|
||||
|
||||
@@ -105,6 +105,7 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
|
||||
parameters:
|
||||
cmakeBuildDir: $(Build.SourcesDirectory)/grpc/build
|
||||
cmakeSourceDir: $(Build.SourcesDirectory)/grpc
|
||||
installDir: $(Build.SourcesDirectory)/bin
|
||||
extraBuildFlags: >-
|
||||
-DgRPC_INSTALL=ON
|
||||
|
||||
@@ -125,6 +125,7 @@ jobs:
|
||||
parameters:
|
||||
componentName: PyBind11
|
||||
cmakeBuildDir: '$(Build.SourcesDirectory)/pybind11/build'
|
||||
cmakeSourceDir: '$(Build.SourcesDirectory)/pybind11'
|
||||
customInstallPath: false
|
||||
installEnabled: false
|
||||
extraBuildFlags: >-
|
||||
@@ -141,6 +142,7 @@ jobs:
|
||||
parameters:
|
||||
componentName: RapidJSON
|
||||
cmakeBuildDir: '$(Build.SourcesDirectory)/rapidjson/build'
|
||||
cmakeSourceDir: '$(Build.SourcesDirectory)/rapidjson'
|
||||
customInstallPath: false
|
||||
installEnabled: false
|
||||
extraBuildFlags: >-
|
||||
@@ -200,7 +202,6 @@ jobs:
|
||||
value: $(Agent.BuildDirectory)/rocm/include/rocal
|
||||
pool:
|
||||
name: ${{ job.target }}_test_pool
|
||||
demands: firstRenderDeviceAccess
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
|
||||
@@ -108,7 +108,6 @@ jobs:
|
||||
value: $(Agent.BuildDirectory)/rocm
|
||||
pool:
|
||||
name: ${{ job.target }}_test_pool
|
||||
demands: firstRenderDeviceAccess
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
|
||||
@@ -114,7 +114,6 @@ jobs:
|
||||
value: $(Agent.BuildDirectory)/rocm
|
||||
pool:
|
||||
name: ${{ job.target }}_test_pool
|
||||
demands: firstRenderDeviceAccess
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
|
||||
@@ -5,6 +5,12 @@ parameters:
|
||||
- name: checkoutRef
|
||||
type: string
|
||||
default: ''
|
||||
- name: sparseCheckout
|
||||
type: boolean
|
||||
default: false
|
||||
- name: sparseCheckoutDir
|
||||
type: string
|
||||
default: ''
|
||||
# set to true if doing full build of ROCm stack
|
||||
# and dependencies are pulled from same pipeline
|
||||
- name: aggregatePipeline
|
||||
@@ -66,6 +72,8 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
|
||||
parameters:
|
||||
checkoutRepo: ${{ parameters.checkoutRepo }}
|
||||
sparseCheckout: ${{ parameters.sparseCheckout }}
|
||||
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
|
||||
parameters:
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
|
||||
@@ -168,7 +168,6 @@ jobs:
|
||||
value: $(Agent.BuildDirectory)/rocm
|
||||
pool:
|
||||
name: ${{ job.target }}_test_pool
|
||||
demands: firstRenderDeviceAccess
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
|
||||
@@ -105,6 +105,7 @@ jobs:
|
||||
-DLAPACKE=OFF
|
||||
-GNinja
|
||||
cmakeBuildDir: '$(Build.SourcesDirectory)/lapack/build'
|
||||
cmakeSourceDir: '$(Build.SourcesDirectory)/lapack'
|
||||
installDir: '$(Pipeline.Workspace)/deps-install'
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
|
||||
parameters:
|
||||
|
||||
@@ -167,7 +167,6 @@ jobs:
|
||||
value: $(Agent.BuildDirectory)/rocm
|
||||
pool:
|
||||
name: ${{ job.target }}_test_pool
|
||||
demands: firstRenderDeviceAccess
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
|
||||
@@ -38,6 +38,7 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
|
||||
parameters:
|
||||
cmakeBuildDir: $(Agent.BuildDirectory)/grpc/build
|
||||
cmakeSourceDir: $(Agent.BuildDirectory)/grpc
|
||||
extraBuildFlags: >-
|
||||
-DgRPC_INSTALL=ON
|
||||
-DgRPC_BUILD_TESTS=OFF
|
||||
|
||||
@@ -38,6 +38,7 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
|
||||
parameters:
|
||||
cmakeBuildDir: $(Agent.BuildDirectory)/googletest/build
|
||||
cmakeSourceDir: $(Agent.BuildDirectory)/googletest
|
||||
extraBuildFlags: >-
|
||||
-DGTEST_FORCE_SHARED_CRT=ON
|
||||
-DCMAKE_DEBUG_POSTFIX=d
|
||||
|
||||
@@ -10,10 +10,10 @@ parameters:
|
||||
default: ''
|
||||
- name: cmakeBuildDir
|
||||
type: string
|
||||
default: 'build'
|
||||
default: $(Agent.BuildDirectory)/s/build
|
||||
- name: cmakeSourceDir
|
||||
type: string
|
||||
default: '..'
|
||||
default: $(Agent.BuildDirectory)/s
|
||||
- name: customBuildTarget
|
||||
type: string
|
||||
default: ''
|
||||
@@ -46,7 +46,7 @@ steps:
|
||||
${{ if eq(parameters.customInstallPath, true) }}:
|
||||
cmakeArgs: -DCMAKE_INSTALL_PREFIX=${{ parameters.installDir }} ${{ parameters.extraBuildFlags }} ${{ parameters.cmakeSourceDir }}
|
||||
${{ else }}:
|
||||
cmakeArgs: ${{ parameters.extraBuildFlags }} ..
|
||||
cmakeArgs: ${{ parameters.extraBuildFlags }} ${{ parameters.cmakeSourceDir }}
|
||||
- ${{ if parameters.printDiskSpace }}:
|
||||
- script: df -h
|
||||
displayName: Disk space before build
|
||||
|
||||
@@ -4,6 +4,12 @@ parameters:
|
||||
- name: checkoutRepo
|
||||
type: string
|
||||
default: 'self'
|
||||
- name: sparseCheckout
|
||||
type: boolean
|
||||
default: false
|
||||
- name: sparseCheckoutDir
|
||||
type: string
|
||||
default: ''
|
||||
# submodule download behaviour
|
||||
# change to 'recursive' for repos with submodules
|
||||
- name: submoduleBehaviour
|
||||
@@ -15,3 +21,13 @@ steps:
|
||||
clean: true
|
||||
submodules: ${{ parameters.submoduleBehaviour }}
|
||||
retryCountOnTaskFailure: 3
|
||||
fetchFilter: blob:none
|
||||
${{ if eq(parameters.sparseCheckout, true) }}:
|
||||
sparseCheckoutDirectories: ${{ parameters.sparseCheckoutDir }}
|
||||
path: sparse
|
||||
- ${{ if eq(parameters.sparseCheckout, true) }}:
|
||||
- task: Bash@3
|
||||
displayName: Symlink sparse checkout
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: ln -s $(Agent.BuildDirectory)/sparse/${{ parameters.sparseCheckoutDir }} $(Agent.BuildDirectory)/s
|
||||
|
||||
@@ -106,6 +106,7 @@ parameters:
|
||||
type: object
|
||||
default:
|
||||
- gfx90a
|
||||
- gfx942
|
||||
|
||||
steps:
|
||||
# these steps should only be run if there was a failure or warning
|
||||
|
||||
@@ -34,6 +34,7 @@ Autocast
|
||||
BARs
|
||||
BLAS
|
||||
BMC
|
||||
BabelStream
|
||||
Blit
|
||||
Blockwise
|
||||
Bluefield
|
||||
@@ -138,6 +139,7 @@ GDR
|
||||
GDS
|
||||
GEMM
|
||||
GEMMs
|
||||
GFLOPS
|
||||
GFortran
|
||||
GFXIP
|
||||
Gemma
|
||||
@@ -641,6 +643,7 @@ hipSPARSELt
|
||||
hipTensor
|
||||
hipamd
|
||||
hipblas
|
||||
hipcc
|
||||
hipcub
|
||||
hipfft
|
||||
hipfort
|
||||
|
||||
@@ -14,17 +14,18 @@ JAX provides a NumPy-like API, which combines automatic differentiation and the
|
||||
Accelerated Linear Algebra (XLA) compiler to achieve high-performance machine
|
||||
learning at scale.
|
||||
|
||||
JAX uses composable transformations of Python and NumPy through just-in-time (JIT) compilation,
|
||||
automatic vectorization, and parallelization. To learn about JAX, including profiling and
|
||||
optimizations, see the official `JAX documentation
|
||||
JAX uses composable transformations of Python and NumPy through just-in-time
|
||||
(JIT) compilation, automatic vectorization, and parallelization. To learn about
|
||||
JAX, including profiling and optimizations, see the official `JAX documentation
|
||||
<https://jax.readthedocs.io/en/latest/notebooks/quickstart.html>`_.
|
||||
|
||||
ROCm support for JAX is upstreamed and users can build the official source code with ROCm
|
||||
support:
|
||||
ROCm support for JAX is upstreamed, and users can build the official source code
|
||||
with ROCm support:
|
||||
|
||||
- ROCm JAX release:
|
||||
|
||||
- Offers AMD-validated and community :ref:`Docker images <jax-docker-compat>` with ROCm and JAX pre-installed.
|
||||
- Offers AMD-validated and community :ref:`Docker images <jax-docker-compat>`
|
||||
with ROCm and JAX preinstalled.
|
||||
|
||||
- ROCm JAX repository: `ROCm/jax <https://github.com/ROCm/jax>`_
|
||||
|
||||
@@ -36,8 +37,8 @@ support:
|
||||
- Official JAX repository: `jax-ml/jax <https://github.com/jax-ml/jax>`_
|
||||
|
||||
- See the `AMD GPU (Linux) installation section
|
||||
<https://jax.readthedocs.io/en/latest/installation.html#amd-gpu-linux>`_ in the JAX
|
||||
documentation.
|
||||
<https://jax.readthedocs.io/en/latest/installation.html#amd-gpu-linux>`_ in
|
||||
the JAX documentation.
|
||||
|
||||
.. note::
|
||||
|
||||
@@ -46,6 +47,44 @@ support:
|
||||
`Community ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax-community>`_
|
||||
follow upstream JAX releases and use the latest available ROCm version.
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
* The `nanoGPT in JAX <https://rocm.blogs.amd.com/artificial-intelligence/nanoGPT-JAX/README.html>`_
|
||||
blog explores the implementation and training of a Generative Pre-trained
|
||||
Transformer (GPT) model in JAX, inspired by Andrej Karpathy’s JAX-based
|
||||
nanoGPT. Comparing how essential GPT components—such as self-attention
|
||||
mechanisms and optimizers—are realized in JAX and JAX, also highlights
|
||||
JAX’s unique features.
|
||||
|
||||
* The `Optimize GPT Training: Enabling Mixed Precision Training in JAX using
|
||||
ROCm on AMD GPUs <https://rocm.blogs.amd.com/artificial-intelligence/jax-mixed-precision/README.html>`_
|
||||
blog post provides a comprehensive guide on enhancing the training efficiency
|
||||
of GPT models by implementing mixed precision techniques in JAX, specifically
|
||||
tailored for AMD GPUs utilizing the ROCm platform.
|
||||
|
||||
* The `Supercharging JAX with Triton Kernels on AMD GPUs <https://rocm.blogs.amd.com/artificial-intelligence/jax-triton/README.html>`_
|
||||
blog demonstrates how to develop a custom fused dropout-activation kernel for
|
||||
matrices using Triton, integrate it with JAX, and benchmark its performance
|
||||
using ROCm.
|
||||
|
||||
* The `Distributed fine-tuning with JAX on AMD GPUs <https://rocm.blogs.amd.com/artificial-intelligence/distributed-sft-jax/README.html>`_
|
||||
outlines the process of fine-tuning a Bidirectional Encoder Representations
|
||||
from Transformers (BERT)-based large language model (LLM) using JAX for a text
|
||||
classification task. The blog post discuss techniques for parallelizing the
|
||||
fine-tuning across multiple AMD GPUs and assess the model's performance on a
|
||||
holdout dataset. During the fine-tuning, a BERT-base-cased transformer model
|
||||
and the General Language Understanding Evaluation (GLUE) benchmark dataset was
|
||||
used on a multi-GPU setup.
|
||||
|
||||
* The `MI300X workload optimization guide <https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html>`_
|
||||
provides detailed guidance on optimizing workloads for the AMD Instinct MI300X
|
||||
accelerator using ROCm. The page is aimed at helping users achieve optimal
|
||||
performance for deep learning and other high-performance computing tasks on
|
||||
the MI300X GPU.
|
||||
|
||||
For more use cases and recommendations, see `ROCm JAX blog posts <https://rocm.blogs.amd.com/blog/tag/jax.html>`_.
|
||||
|
||||
.. _jax-docker-compat:
|
||||
|
||||
Docker image compatibility
|
||||
@@ -57,7 +96,7 @@ Docker image compatibility
|
||||
|
||||
AMD validates and publishes ready-made `ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax>`_
|
||||
with ROCm backends on Docker Hub. The following Docker image tags and
|
||||
associated inventories are validated for
|
||||
associated inventories represent the latest JAX version from the official Docker Hub and are validated for
|
||||
`ROCm 6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`_. Click the |docker-icon|
|
||||
icon to view the image on Docker Hub.
|
||||
|
||||
@@ -121,13 +160,12 @@ associated inventories are tested for `ROCm 6.3.2 <https://repo.radeon.com/rocm/
|
||||
- Ubuntu 22.04
|
||||
- `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
|
||||
|
||||
Critical ROCm libraries for JAX
|
||||
Key ROCm libraries for JAX
|
||||
================================================================================
|
||||
|
||||
The functionality of JAX with ROCm is determined by its underlying library
|
||||
dependencies. These critical ROCm components affect the capabilities,
|
||||
performance, and feature set available to developers. The versions described
|
||||
are available in ROCm :version:`rocm_version`.
|
||||
JAX functionality on ROCm is determined by its underlying library
|
||||
dependencies. These ROCm components affect the capabilities, performance, and
|
||||
feature set available to developers.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
@@ -215,10 +253,10 @@ are available in ROCm :version:`rocm_version`.
|
||||
distributed training, which involves parallel reductions or
|
||||
operations like ``jax.numpy.cumsum`` can use rocThrust.
|
||||
|
||||
Supported and unsupported features
|
||||
Supported features
|
||||
===============================================================================
|
||||
|
||||
The following table maps GPU-accelerated JAX modules to their supported
|
||||
The following table maps the public JAX API modules to their supported
|
||||
ROCm and JAX versions.
|
||||
|
||||
.. list-table::
|
||||
@@ -226,8 +264,8 @@ ROCm and JAX versions.
|
||||
|
||||
* - Module
|
||||
- Description
|
||||
- Since JAX
|
||||
- Since ROCm
|
||||
- As of JAX
|
||||
- As of ROCm
|
||||
* - ``jax.numpy``
|
||||
- Implements the NumPy API, using the primitives in ``jax.lax``.
|
||||
- 0.1.56
|
||||
@@ -255,21 +293,11 @@ ROCm and JAX versions.
|
||||
devices.
|
||||
- 0.3.20
|
||||
- 5.1.0
|
||||
* - ``jax.dlpack``
|
||||
- For exchanging tensor data between JAX and other libraries that support the
|
||||
DLPack standard.
|
||||
- 0.1.57
|
||||
- 5.0.0
|
||||
* - ``jax.distributed``
|
||||
- Enables the scaling of computations across multiple devices on a single
|
||||
machine or across multiple machines.
|
||||
- 0.1.74
|
||||
- 5.0.0
|
||||
* - ``jax.dtypes``
|
||||
- Provides utilities for working with and managing data types in JAX
|
||||
arrays and computations.
|
||||
- 0.1.66
|
||||
- 5.0.0
|
||||
* - ``jax.image``
|
||||
- Contains image manipulation functions like resize, scale and translation.
|
||||
- 0.1.57
|
||||
@@ -283,27 +311,10 @@ ROCm and JAX versions.
|
||||
array.
|
||||
- 0.1.57
|
||||
- 5.0.0
|
||||
* - ``jax.profiler``
|
||||
- Contains JAX’s tracing and time profiling features.
|
||||
- 0.1.57
|
||||
- 5.0.0
|
||||
* - ``jax.stages``
|
||||
- Contains interfaces to stages of the compiled execution process.
|
||||
- 0.3.4
|
||||
- 5.0.0
|
||||
* - ``jax.tree``
|
||||
- Provides utilities for working with tree-like container data structures.
|
||||
- 0.4.26
|
||||
- 5.6.0
|
||||
* - ``jax.tree_util``
|
||||
- Provides utilities for working with nested data structures, or
|
||||
``pytrees``.
|
||||
- 0.1.65
|
||||
- 5.0.0
|
||||
* - ``jax.typing``
|
||||
- Provides JAX-specific static type annotations.
|
||||
- 0.3.18
|
||||
- 5.1.0
|
||||
* - ``jax.extend``
|
||||
- Provides modules for access to JAX internal machinery module. The
|
||||
``jax.extend`` module defines a library view of some of JAX’s internal
|
||||
@@ -339,8 +350,8 @@ A SciPy-like API for scientific computing.
|
||||
:header-rows: 1
|
||||
|
||||
* - Module
|
||||
- Since JAX
|
||||
- Since ROCm
|
||||
- As of JAX
|
||||
- As of ROCm
|
||||
* - ``jax.scipy.cluster``
|
||||
- 0.3.11
|
||||
- 5.1.0
|
||||
@@ -385,8 +396,8 @@ jax.scipy.stats module
|
||||
:header-rows: 1
|
||||
|
||||
* - Module
|
||||
- Since JAX
|
||||
- Since ROCm
|
||||
- As of JAX
|
||||
- As of ROCm
|
||||
* - ``jax.scipy.stats.bernouli``
|
||||
- 0.1.56
|
||||
- 5.0.0
|
||||
@@ -469,8 +480,8 @@ Modules for JAX extensions.
|
||||
:header-rows: 1
|
||||
|
||||
* - Module
|
||||
- Since JAX
|
||||
- Since ROCm
|
||||
- As of JAX
|
||||
- As of ROCm
|
||||
* - ``jax.extend.ffi``
|
||||
- 0.4.30
|
||||
- 6.0.0
|
||||
@@ -484,190 +495,25 @@ Modules for JAX extensions.
|
||||
- 0.4.15
|
||||
- 5.5.0
|
||||
|
||||
jax.experimental module
|
||||
-------------------------------------------------------------------------------
|
||||
|
||||
Experimental modules and APIs.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Module
|
||||
- Since JAX
|
||||
- Since ROCm
|
||||
* - ``jax.experimental.checkify``
|
||||
- 0.1.75
|
||||
- 5.0.0
|
||||
* - ``jax.experimental.compilation_cache.compilation_cache``
|
||||
- 0.1.68
|
||||
- 5.0.0
|
||||
* - ``jax.experimental.custom_partitioning``
|
||||
- 0.4.0
|
||||
- 5.3.0
|
||||
* - ``jax.experimental.jet``
|
||||
- 0.1.56
|
||||
- 5.0.0
|
||||
* - ``jax.experimental.key_reuse``
|
||||
- 0.4.26
|
||||
- 5.6.0
|
||||
* - ``jax.experimental.mesh_utils``
|
||||
- 0.1.76
|
||||
- 5.0.0
|
||||
* - ``jax.experimental.multihost_utils``
|
||||
- 0.3.2
|
||||
- 5.0.0
|
||||
* - ``jax.experimental.pallas``
|
||||
- 0.4.15
|
||||
- 5.5.0
|
||||
* - ``jax.experimental.pjit``
|
||||
- 0.1.61
|
||||
- 5.0.0
|
||||
* - ``jax.experimental.serialize_executable``
|
||||
- 0.4.0
|
||||
- 5.3.0
|
||||
* - ``jax.experimental.shard_map``
|
||||
- 0.4.3
|
||||
- 5.3.0
|
||||
* - ``jax.experimental.sparse``
|
||||
- 0.1.75
|
||||
- 5.0.0
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - API
|
||||
- Since JAX
|
||||
- Since ROCm
|
||||
* - ``jax.experimental.enable_x64``
|
||||
- 0.1.60
|
||||
- 5.0.0
|
||||
* - ``jax.experimental.disable_x64``
|
||||
- 0.1.60
|
||||
- 5.0.0
|
||||
|
||||
jax.experimental.pallas module
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Module for Pallas, a JAX extension for custom kernels.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Module
|
||||
- Since JAX
|
||||
- Since ROCm
|
||||
* - ``jax.experimental.pallas.mosaic_gpu``
|
||||
- 0.4.31
|
||||
- 6.1.3
|
||||
* - ``jax.experimental.pallas.tpu``
|
||||
- 0.4.15
|
||||
- 5.5.0
|
||||
* - ``jax.experimental.pallas.triton``
|
||||
- 0.4.32
|
||||
- 6.1.3
|
||||
|
||||
jax.experimental.sparse module
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Experimental support for sparse matrix operations.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Module
|
||||
- Since JAX
|
||||
- Since ROCm
|
||||
* - ``jax.experimental.sparse.linalg``
|
||||
- 0.3.15
|
||||
- 5.2.0
|
||||
* - ``jax.experimental.sparse.sparsify``
|
||||
- 0.3.25
|
||||
- ❌
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - ``sparse`` data structure API
|
||||
- Since JAX
|
||||
- Since ROCm
|
||||
* - ``jax.experimental.sparse.BCOO``
|
||||
- 0.1.72
|
||||
- 5.0.0
|
||||
* - ``jax.experimental.sparse.BCSR``
|
||||
- 0.3.20
|
||||
- 5.1.0
|
||||
* - ``jax.experimental.sparse.CSR``
|
||||
- 0.1.75
|
||||
- 5.0.0
|
||||
* - ``jax.experimental.sparse.NM``
|
||||
- 0.4.27
|
||||
- 5.6.0
|
||||
* - ``jax.experimental.sparse.COO``
|
||||
- 0.1.75
|
||||
- 5.0.0
|
||||
|
||||
Unsupported JAX features
|
||||
------------------------
|
||||
===============================================================================
|
||||
|
||||
The following are GPU-accelerated JAX features not currently supported by
|
||||
ROCm.
|
||||
The following GPU-accelerated JAX features are not supported by ROCm for
|
||||
the listed supported JAX versions.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Feature
|
||||
- Description
|
||||
- Since JAX
|
||||
|
||||
* - Mixed Precision with TF32
|
||||
- Mixed precision with TF32 is used for matrix multiplications,
|
||||
convolutions, and other linear algebra operations, particularly in
|
||||
deep learning workloads like CNNs and transformers.
|
||||
- 0.2.25
|
||||
* - RNN support
|
||||
- Currently only LSTM with double bias is supported with float32 input
|
||||
and weight.
|
||||
- 0.3.25
|
||||
|
||||
* - XLA int4 support
|
||||
- 4-bit integer (int4) precision in the XLA compiler.
|
||||
- 0.4.0
|
||||
* - ``jax.experimental.sparsify``
|
||||
- Converts a dense matrix to a sparse matrix representation.
|
||||
- Experimental
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
* The `nanoGPT in JAX <https://rocm.blogs.amd.com/artificial-intelligence/nanoGPT-JAX/README.html>`_
|
||||
blog explores the implementation and training of a Generative Pre-trained
|
||||
Transformer (GPT) model in JAX, inspired by Andrej Karpathy’s PyTorch-based
|
||||
nanoGPT. By comparing how essential GPT components—such as self-attention
|
||||
mechanisms and optimizers—are realized in PyTorch and JAX, also highlight
|
||||
JAX’s unique features.
|
||||
|
||||
* The `Optimize GPT Training: Enabling Mixed Precision Training in JAX using
|
||||
ROCm on AMD GPUs <https://rocm.blogs.amd.com/artificial-intelligence/jax-mixed-precision/README.html>`_
|
||||
blog post provides a comprehensive guide on enhancing the training efficiency
|
||||
of GPT models by implementing mixed precision techniques in JAX, specifically
|
||||
tailored for AMD GPUs utilizing the ROCm platform.
|
||||
|
||||
* The `Supercharging JAX with Triton Kernels on AMD GPUs <https://rocm.blogs.amd.com/artificial-intelligence/jax-triton/README.html>`_
|
||||
blog demonstrates how to develop a custom fused dropout-activation kernel for
|
||||
matrices using Triton, integrate it with JAX, and benchmark its performance
|
||||
using ROCm.
|
||||
|
||||
* The `Distributed fine-tuning with JAX on AMD GPUs <https://rocm.blogs.amd.com/artificial-intelligence/distributed-sft-jax/README.html>`_
|
||||
outlines the process of fine-tuning a Bidirectional Encoder Representations
|
||||
from Transformers (BERT)-based large language model (LLM) using JAX for a text
|
||||
classification task. The blog post discuss techniques for parallelizing the
|
||||
fine-tuning across multiple AMD GPUs and assess the model's performance on a
|
||||
holdout dataset. During the fine-tuning, a BERT-base-cased transformer model
|
||||
and the General Language Understanding Evaluation (GLUE) benchmark dataset was
|
||||
used on a multi-GPU setup.
|
||||
|
||||
* The `MI300X workload optimization guide <https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html>`_
|
||||
provides detailed guidance on optimizing workloads for the AMD Instinct MI300X
|
||||
accelerator using ROCm. The page is aimed at helping users achieve optimal
|
||||
performance for deep learning and other high-performance computing tasks on
|
||||
the MI300X GPU.
|
||||
|
||||
For more use cases and recommendations, see `ROCm JAX blog posts <https://rocm.blogs.amd.com/blog/tag/jax.html>`_.
|
||||
* - MOSAIC (GPU)
|
||||
- Mosaic is a library of kernel-building abstractions for JAX's Pallas system
|
||||
|
||||
@@ -51,6 +51,8 @@ article_pages = [
|
||||
{"file": "how-to/deep-learning-rocm", "os": ["linux"]},
|
||||
|
||||
{"file": "how-to/rocm-for-ai/index", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/install", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/system-health-check", "os": ["linux"]},
|
||||
|
||||
{"file": "how-to/rocm-for-ai/training/index", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/train-a-model", "os": ["linux"]},
|
||||
@@ -67,7 +69,6 @@ article_pages = [
|
||||
{"file": "how-to/rocm-for-ai/fine-tuning/multi-gpu-fine-tuning-and-inference", "os": ["linux"]},
|
||||
|
||||
{"file": "how-to/rocm-for-ai/inference/index", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/install", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/hugging-face-models", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/llm-inference-frameworks", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/vllm-benchmark", "os": ["linux"]},
|
||||
|
||||
@@ -62,47 +62,52 @@ PyTorch inference performance testing
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Getting started
|
||||
===============
|
||||
System validation
|
||||
=================
|
||||
|
||||
Use the following procedures to reproduce the benchmark results on an
|
||||
MI300X series accelerator with the prebuilt PyTorch Docker image.
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
|
||||
.. _pytorch-benchmark-get-started:
|
||||
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
|
||||
might hang until the periodic balancing is finalized. For more information,
|
||||
see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
|
||||
|
||||
1. Disable NUMA auto-balancing.
|
||||
.. code-block:: shell
|
||||
|
||||
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
|
||||
might hang until the periodic balancing is finalized. For more information,
|
||||
see :ref:`AMD Instinct MI300X system optimization <mi300x-disable-numa>`.
|
||||
# disable automatic NUMA balancing
|
||||
sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
|
||||
# check if NUMA balancing is disabled (returns 0 if disabled)
|
||||
cat /proc/sys/kernel/numa_balancing
|
||||
0
|
||||
|
||||
.. code-block:: shell
|
||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
# disable automatic NUMA balancing
|
||||
sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
|
||||
# check if NUMA balancing is disabled (returns 0 if disabled)
|
||||
cat /proc/sys/kernel/numa_balancing
|
||||
0
|
||||
Pull the Docker image
|
||||
=====================
|
||||
|
||||
.. container:: model-doc pyt_chai1_inference
|
||||
|
||||
2. Use the following command to pull the `ROCm PyTorch Docker image <https://hub.docker.com/layers/rocm/pytorch/rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0_triton_llvm_reg_issue/images/sha256-b736a4239ab38a9d0e448af6d4adca83b117debed00bfbe33846f99c4540f79b>`_ from Docker Hub.
|
||||
Use the following command to pull the `ROCm PyTorch Docker image <https://hub.docker.com/layers/rocm/pytorch/rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0_triton_llvm_reg_issue/images/sha256-b736a4239ab38a9d0e448af6d4adca83b117debed00bfbe33846f99c4540f79b>`_ from Docker Hub.
|
||||
|
||||
.. code-block:: shell
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0_triton_llvm_reg_issue
|
||||
docker pull rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0_triton_llvm_reg_issue
|
||||
|
||||
.. note::
|
||||
.. note::
|
||||
|
||||
The Chai-1 benchmark uses a specifically selected Docker image using ROCm 6.2.3 and PyTorch 2.3.0 to address an accuracy issue.
|
||||
The Chai-1 benchmark uses a specifically selected Docker image using ROCm 6.2.3 and PyTorch 2.3.0 to address an accuracy issue.
|
||||
|
||||
.. container:: model-doc pyt_clip_inference
|
||||
|
||||
2. Use the following command to pull the `ROCm PyTorch Docker image <https://hub.docker.com/layers/rocm/pytorch/latest/images/sha256-05b55983e5154f46e7441897d0908d79877370adca4d1fff4899d9539d6c4969>`_ from Docker Hub.
|
||||
Use the following command to pull the `ROCm PyTorch Docker image <https://hub.docker.com/layers/rocm/pytorch/latest/images/sha256-05b55983e5154f46e7441897d0908d79877370adca4d1fff4899d9539d6c4969>`_ from Docker Hub.
|
||||
|
||||
.. code-block:: shell
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull rocm/pytorch:latest
|
||||
docker pull rocm/pytorch:latest
|
||||
|
||||
.. _pytorch-benchmark-get-started:
|
||||
|
||||
Benchmarking
|
||||
============
|
||||
|
||||
@@ -111,35 +111,37 @@ vLLM inference performance testing
|
||||
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
|
||||
see the developer's guide at `<https://github.com/ROCm/vllm/blob/main/docs/dev-docker/README.md>`__.
|
||||
|
||||
Getting started
|
||||
===============
|
||||
System validation
|
||||
=================
|
||||
|
||||
Use the following procedures to reproduce the benchmark results on an
|
||||
MI300X accelerator with the prebuilt vLLM Docker image.
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
|
||||
.. _vllm-benchmark-get-started:
|
||||
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
|
||||
might hang until the periodic balancing is finalized. For more information,
|
||||
see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
|
||||
|
||||
1. Disable NUMA auto-balancing.
|
||||
.. code-block:: shell
|
||||
|
||||
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
|
||||
might hang until the periodic balancing is finalized. For more information,
|
||||
see :ref:`AMD Instinct MI300X system optimization <mi300x-disable-numa>`.
|
||||
# disable automatic NUMA balancing
|
||||
sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
|
||||
# check if NUMA balancing is disabled (returns 0 if disabled)
|
||||
cat /proc/sys/kernel/numa_balancing
|
||||
0
|
||||
|
||||
.. code-block:: shell
|
||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
# disable automatic NUMA balancing
|
||||
sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
|
||||
# check if NUMA balancing is disabled (returns 0 if disabled)
|
||||
cat /proc/sys/kernel/numa_balancing
|
||||
0
|
||||
Pull the Docker image
|
||||
=====================
|
||||
|
||||
2. Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
|
||||
Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
|
||||
Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
Use the following command to pull the Docker image from Docker Hub.
|
||||
.. code-block:: shell
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ unified_docker.pull_tag }}
|
||||
docker pull {{ unified_docker.pull_tag }}
|
||||
|
||||
Benchmarking
|
||||
============
|
||||
|
||||
@@ -30,7 +30,7 @@ ROCm supports multiple :doc:`installation methods <rocm-install-on-linux:install
|
||||
|
||||
* :doc:`Using the AMDGPU installer <rocm-install-on-linux:install/amdgpu-install>`
|
||||
|
||||
* :ref:`Multi-version installation <rocm-install-on-linux:installation-types>`.
|
||||
* :ref:`Multi-version installation <rocm-install-on-linux:installation-types>`
|
||||
|
||||
.. grid:: 1
|
||||
|
||||
@@ -59,4 +59,8 @@ images with the framework pre-installed.
|
||||
|
||||
* :doc:`JAX for ROCm <rocm-install-on-linux:install/3rd-party/jax-install>`
|
||||
|
||||
The sections that follow in :doc:`Training a model <../training/train-a-model>` are geared for a ROCm with PyTorch installation.
|
||||
Next steps
|
||||
==========
|
||||
|
||||
After installing ROCm and your desired ML libraries -- and before running AI workloads -- conduct system health benchmarks
|
||||
to test the optimal performance of your AMD hardware. See :doc:`system-health-check` to get started.
|
||||
104
docs/how-to/rocm-for-ai/system-health-check.rst
Normal file
104
docs/how-to/rocm-for-ai/system-health-check.rst
Normal file
@@ -0,0 +1,104 @@
|
||||
.. meta::
|
||||
:description: System health checks with RVS, RCCL tests, BabelStream, and TransferBench to validate AMD hardware performance running AI workloads.
|
||||
:keywords: gpu, accelerator, system, health, validation, bench, perf, performance, rvs, rccl, babel, mi300x, mi325x, flops, bandwidth, rbt, training, inference
|
||||
|
||||
.. _rocm-for-ai-system-health-bench:
|
||||
|
||||
************************
|
||||
System health benchmarks
|
||||
************************
|
||||
|
||||
Before running AI workloads, it is important to validate that your AMD hardware is configured correctly and is performing optimally. This topic outlines several system health benchmarks you can use to test key aspects like GPU compute capabilities (FLOPS), memory bandwidth, and interconnect performance. Many of these tests are part of the ROCm Validation Suite (RVS).
|
||||
|
||||
ROCm Validation Suite (RVS) tests
|
||||
=================================
|
||||
|
||||
RVS provides a collection of tests, benchmarks, and qualification tools, each
|
||||
targeting a specific subsystem of the system under test. It includes tests for
|
||||
GPU stress and memory bandwidth.
|
||||
|
||||
.. _healthcheck-install-rvs:
|
||||
|
||||
Install ROCm Validation Suite
|
||||
-----------------------------
|
||||
|
||||
To get started, install RVS. For example, on an Ubuntu system with ROCm already
|
||||
installed, run the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
sudo apt update
|
||||
sudo apt install rocm-validation-suite
|
||||
|
||||
See the `ROCm Validation Suite installation instructions <https://rocm.docs.amd.com/projects/ROCmValidationSuite/en/latest/install/installation.html>`_,
|
||||
and `System validation tests <https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/system-validation.html#system-validation-tests>`_
|
||||
in the Instinct documentation for more detailed instructions.
|
||||
|
||||
Benchmark, stress, and qualification tests
|
||||
------------------------------------------
|
||||
|
||||
The GPU stress test runs various GEMM computations as workloads to stress the GPU FLOPS performance and check whether it
|
||||
meets the configured target GFLOPS.
|
||||
|
||||
Run the benchmark, stress, and qualification tests included with RVS. See the `Benchmark, stress, qualification
|
||||
<https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/system-validation.html#benchmark-stress-qualification>`_
|
||||
section of the Instinct documentation for usage instructions.
|
||||
|
||||
BabelStream test
|
||||
----------------
|
||||
|
||||
BabelStream is a synthetic GPU benchmark based on the STREAM benchmark for
|
||||
CPUs, measuring memory transfer rates to and from global device memory.
|
||||
BabelStream tests are included with the RVS package as part of the `BABEL module
|
||||
<https://rocm.docs.amd.com/projects/ROCmValidationSuite/en/latest/conceptual/rvs-modules.html#babel-benchmark-test-babel-module>`_.
|
||||
|
||||
For more information, see `Performance benchmarking
|
||||
<https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/performance-bench.html#babelstream-benchmarking-results>`_
|
||||
in the Instinct documentation.
|
||||
|
||||
RCCL tests
|
||||
==========
|
||||
|
||||
The ROCm Communication Collectives Library (RCCL) enables efficient multi-GPU
|
||||
communication. The `<https://github.com/ROCm/rccl-tests>`__ suite benchmarks
|
||||
the performance and verifies the correctness of these collective operations.
|
||||
This helps ensure optimal scaling for multi-accelerator tasks.
|
||||
|
||||
1. To get started, build RCCL-tests using the official instructions in the README at
|
||||
`<https://github.com/ROCm/rccl-tests?tab=readme-ov-file#build>`__ or use the
|
||||
following commands:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/rccl-tests.git
|
||||
cd rccl-tests
|
||||
make
|
||||
|
||||
2. Run the suggested RCCL tests -- see `RCCL benchmarking
|
||||
<https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/performance-bench.html#rccl-benchmarking-results>`_
|
||||
in the Instinct performance benchmarking documentation for instructions.
|
||||
|
||||
TransferBench test
|
||||
==================
|
||||
|
||||
TransferBench is a standalone utility for benchmarking simultaneous data
|
||||
transfer performance between various devices in the system, including
|
||||
CPU-to-GPU and GPU-to-GPU (peer-to-peer). This helps identify potential
|
||||
bottlenecks in data movement between the host system and the GPUs, or between
|
||||
GPUs, which can impact end-to-end latency.
|
||||
|
||||
.. _healthcheck-install-transferbench:
|
||||
|
||||
1. To get started, use the instructions in the `TransferBench documentation
|
||||
<https://rocm.docs.amd.com/projects/TransferBench/en/latest/install/install.html#install-transferbench>`_
|
||||
or use the following commands:
|
||||
|
||||
.. code:: shell
|
||||
|
||||
git clone https://github.com/ROCm/TransferBench.git
|
||||
cd TransferBench
|
||||
CC=hipcc make
|
||||
|
||||
2. Run the suggested TransferBench tests -- see `TransferBench benchmarking
|
||||
<https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/performance-bench.html#transferbench-benchmarking-results>`_
|
||||
in the Instinct performance benchmarking documentation for instructions.
|
||||
@@ -79,11 +79,18 @@ across different input sequences. Support for packed input format is planned for
|
||||
System validation
|
||||
=================
|
||||
|
||||
If you have already validated your system settings, including NUMA
|
||||
auto-balancing, skip this step. Otherwise, complete the :ref:`system validation
|
||||
and optimization steps <train-a-model-system-validation>` to set up your system
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
|
||||
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||
before starting training.
|
||||
|
||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
Environment setup
|
||||
=================
|
||||
|
||||
@@ -175,8 +182,8 @@ with RDMA, skip ahead to :ref:`amd-maxtext-download-docker`.
|
||||
|
||||
.. _amd-maxtext-download-docker:
|
||||
|
||||
Download the Docker image
|
||||
-------------------------
|
||||
Pull the Docker image
|
||||
---------------------
|
||||
|
||||
1. Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
|
||||
@@ -103,11 +103,18 @@ popular AI models.
|
||||
System validation
|
||||
=================
|
||||
|
||||
If you have already validated your system settings, including NUMA
|
||||
auto-balancing, skip this step. Otherwise, complete the :ref:`system validation
|
||||
and optimization steps <train-a-model-system-validation>` to set up your system
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
|
||||
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||
before starting training.
|
||||
|
||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
.. _mi300x-amd-megatron-lm-training:
|
||||
|
||||
Environment setup
|
||||
|
||||
@@ -34,11 +34,18 @@ for MPT-30B with access to detailed logs and performance metrics.
|
||||
System validation
|
||||
=================
|
||||
|
||||
If you have already validated your system settings, including NUMA
|
||||
auto-balancing, skip this step. Otherwise, complete the :ref:`system validation
|
||||
and optimization steps <train-a-model-system-validation>` to set up your system
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
|
||||
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||
before starting training.
|
||||
|
||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
Getting started
|
||||
===============
|
||||
|
||||
|
||||
@@ -77,11 +77,18 @@ popular AI models.
|
||||
System validation
|
||||
=================
|
||||
|
||||
If you have already validated your system settings, including NUMA
|
||||
auto-balancing, skip this step. Otherwise, complete the :ref:`system validation
|
||||
and optimization steps <train-a-model-system-validation>` to set up your system
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
|
||||
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||
before starting training.
|
||||
|
||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
This Docker image is optimized for specific model configurations outlined
|
||||
below. Performance can vary for other training workloads, as AMD
|
||||
doesn’t validate configurations and run conditions outside those described.
|
||||
|
||||
@@ -21,8 +21,12 @@ In this guide, you'll learn about:
|
||||
|
||||
- Training a model
|
||||
|
||||
- :doc:`Train a model with Megatron-LM <benchmark-docker/megatron-lm>`
|
||||
- :doc:`With Megatron-LM <benchmark-docker/megatron-lm>`
|
||||
|
||||
- :doc:`Train a model with PyTorch <benchmark-docker/pytorch-training>`
|
||||
- :doc:`With PyTorch <benchmark-docker/pytorch-training>`
|
||||
|
||||
- :doc:`With JAX MaxText <benchmark-docker/jax-maxtext>`
|
||||
|
||||
- :doc:`With LLM Foundry <benchmark-docker/mpt-llm-foundry>`
|
||||
|
||||
- :doc:`Scaling model training <scale-model-training>`
|
||||
|
||||
@@ -5,12 +5,13 @@
|
||||
:keywords: ROCm, AI, LLM, train, megatron, Llama, tutorial, docker, torch, pytorch, jax
|
||||
|
||||
.. _train-a-model-system-validation:
|
||||
.. _rocm-for-ai-system-optimization:
|
||||
|
||||
**********************************************
|
||||
Prerequisite system validation before training
|
||||
**********************************************
|
||||
**********************************************************
|
||||
Prerequisite system validation before running AI workloads
|
||||
**********************************************************
|
||||
|
||||
Complete the following system validation and optimization steps to set up your system before starting training.
|
||||
Complete the following system validation and optimization steps to set up your system before starting training and inference.
|
||||
|
||||
Disable NUMA auto-balancing
|
||||
---------------------------
|
||||
@@ -26,7 +27,8 @@ the output is ``1``, run the following command to disable NUMA auto-balancing.
|
||||
|
||||
sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
|
||||
|
||||
See :ref:`mi300x-disable-numa` for more information.
|
||||
See `Disable NUMA auto-balancing <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#disable-numa-auto-balancing>`_
|
||||
in the Instinct documentation for more information.
|
||||
|
||||
Hardware verification with ROCm
|
||||
-------------------------------
|
||||
@@ -42,7 +44,8 @@ Run the command:
|
||||
|
||||
rocm-smi --setperfdeterminism 1900
|
||||
|
||||
See :ref:`mi300x-hardware-verification-with-rocm` for more information.
|
||||
See `Hardware verfication for ROCm <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#hardware-verification-with-rocm>`_
|
||||
in the Instinct documentation for more information.
|
||||
|
||||
RCCL Bandwidth Test for multi-node setups
|
||||
-----------------------------------------
|
||||
|
||||
@@ -1,68 +0,0 @@
|
||||
---
|
||||
myst:
|
||||
html_meta:
|
||||
"description": "Learn more about common system-level debugging measures for ROCm."
|
||||
"keywords": "env, var, sys, PCIe, troubleshooting, admin, error"
|
||||
---
|
||||
|
||||
# System debugging
|
||||
|
||||
## ROCm language and system-level debug, flags, and environment variables
|
||||
|
||||
Kernel options to avoid: the Ethernet port getting renamed every time you change graphics cards, `net.ifnames=0 biosdevname=0`
|
||||
|
||||
## ROCr error code
|
||||
|
||||
* 2 Invalid Dimension
|
||||
* 4 Invalid Group Memory
|
||||
* 8 Invalid (or Null) Code
|
||||
* 32 Invalid Format
|
||||
* 64 Group is too large
|
||||
* 128 Out of VGPRs
|
||||
* 0x80000000 Debug Options
|
||||
|
||||
## Command to dump firmware version and get Linux kernel version
|
||||
|
||||
`sudo cat /sys/kernel/debug/dri/1/amdgpu_firmware_info`
|
||||
|
||||
`uname -a`
|
||||
|
||||
## Debug flags
|
||||
|
||||
Debug messages when developing/debugging base ROCm driver. You could enable the printing from `libhsakmt.so` by setting an environment variable, `HSAKMT_DEBUG_LEVEL`. Available debug levels are 3-7. The higher level you set, the more messages will print.
|
||||
|
||||
* `export HSAKMT_DEBUG_LEVEL=3` : Only pr_err() prints.
|
||||
|
||||
* `export HSAKMT_DEBUG_LEVEL=4` : pr_err() and pr_warn() print.
|
||||
|
||||
* `export HSAKMT_DEBUG_LEVEL=5` : We currently do not implement “notice”. Setting to 5 is same as setting to 4.
|
||||
|
||||
* `export HSAKMT_DEBUG_LEVEL=6` : pr_err(), pr_warn(), and pr_info print.
|
||||
|
||||
* `export HSAKMT_DEBUG_LEVEL=7` : Everything including pr_debug prints.
|
||||
|
||||
## ROCr level environment variables for debug
|
||||
|
||||
`HSA_ENABLE_SDMA=0`
|
||||
|
||||
`HSA_ENABLE_INTERRUPT=0`
|
||||
|
||||
`HSA_SVM_GUARD_PAGES=0`
|
||||
|
||||
`HSA_DISABLE_CACHE=1`
|
||||
|
||||
## Turn off page retry on GFX9/Vega devices
|
||||
|
||||
`sudo -s`
|
||||
|
||||
`echo 1 > /sys/module/amdkfd/parameters/noretry`
|
||||
|
||||
## HIP environment variables 3.x
|
||||
|
||||
### OpenCL debug flags
|
||||
|
||||
`AMD_OCL_WAIT_COMMAND=1 (0 = OFF, 1 = On)`
|
||||
|
||||
## PCIe-debug
|
||||
|
||||
For information on how to debug and profile HIP applications, see {doc}`hip:how-to/debugging`
|
||||
@@ -42,7 +42,6 @@ ROCm documentation is organized into the following categories:
|
||||
* [Use ROCm for HPC](./how-to/rocm-for-hpc/index.rst)
|
||||
* [System optimization](./how-to/system-optimization/index.rst)
|
||||
* [AMD Instinct MI300X performance validation and tuning](./how-to/tuning-guides/mi300x/index.rst)
|
||||
* [System debugging](./how-to/system-debugging.md)
|
||||
* [Use advanced compiler features](./conceptual/compiler-topics.md)
|
||||
* [Set the number of CUs](./how-to/setting-cus)
|
||||
* [Troubleshoot BAR access limitation](./how-to/Bar-Memory.rst)
|
||||
|
||||
@@ -36,6 +36,10 @@ subtrees:
|
||||
title: Use ROCm for AI
|
||||
subtrees:
|
||||
- entries:
|
||||
- file: how-to/rocm-for-ai/install.rst
|
||||
title: Installation
|
||||
- file: how-to/rocm-for-ai/system-health-check.rst
|
||||
title: System health benchmarks
|
||||
- file: how-to/rocm-for-ai/training/index.rst
|
||||
title: Training
|
||||
subtrees:
|
||||
@@ -70,8 +74,6 @@ subtrees:
|
||||
title: Inference
|
||||
subtrees:
|
||||
- entries:
|
||||
- file: how-to/rocm-for-ai/inference/install.rst
|
||||
title: Installation
|
||||
- file: how-to/rocm-for-ai/inference/hugging-face-models.rst
|
||||
title: Run models from Hugging Face
|
||||
- file: how-to/rocm-for-ai/inference/llm-inference-frameworks.rst
|
||||
@@ -107,7 +109,6 @@ subtrees:
|
||||
title: System optimization
|
||||
- file: how-to/gpu-performance/mi300x.rst
|
||||
title: AMD Instinct MI300X performance guides
|
||||
- file: how-to/system-debugging.md
|
||||
- file: conceptual/compiler-topics.md
|
||||
title: Use advanced compiler features
|
||||
subtrees:
|
||||
@@ -121,7 +122,7 @@ subtrees:
|
||||
- file: how-to/setting-cus
|
||||
title: Set the number of CUs
|
||||
- file: how-to/Bar-Memory.rst
|
||||
title: Troubleshoot BAR access limitation
|
||||
title: Troubleshoot BAR access limitation
|
||||
- url: https://github.com/amd/rocm-examples
|
||||
title: ROCm examples
|
||||
|
||||
|
||||
Reference in New Issue
Block a user