mirror of
https://github.com/ROCm/ROCm.git
synced 2026-01-13 08:38:04 -05:00
Compare commits
1 Commits
cpattigi-p
...
jax_fix
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
12d7b43317 |
@@ -77,8 +77,7 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
|
||||
parameters:
|
||||
componentName: clr
|
||||
cmakeBuildDir: '$(Build.SourcesDirectory)/clr/build'
|
||||
cmakeSourceDir: '$(Build.SourcesDirectory)/clr'
|
||||
cmakeBuildDir: 'clr/build'
|
||||
extraBuildFlags: >-
|
||||
-DHIP_COMMON_DIR=$(Build.SourcesDirectory)/HIP
|
||||
-DHIP_PLATFORM=amd
|
||||
@@ -139,8 +138,7 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
|
||||
parameters:
|
||||
componentName: clr
|
||||
cmakeBuildDir: '$(Build.SourcesDirectory)/clr/build'
|
||||
cmakeSourceDir: '$(Build.SourcesDirectory)/clr'
|
||||
cmakeBuildDir: 'clr/build'
|
||||
extraBuildFlags: >-
|
||||
-DHIP_COMMON_DIR=$(Build.SourcesDirectory)/HIP
|
||||
-DHIP_PLATFORM=nvidia
|
||||
|
||||
@@ -73,7 +73,6 @@ jobs:
|
||||
parameters:
|
||||
componentName: upstream-llvm
|
||||
cmakeBuildDir: $(Pipeline.Workspace)/llvm-project/llvm/build
|
||||
cmakeSourceDir: $(Pipeline.Workspace)/llvm-project/llvm
|
||||
installDir: $(Pipeline.Workspace)/llvm
|
||||
extraBuildFlags: >-
|
||||
-DCMAKE_BUILD_TYPE=Release
|
||||
|
||||
@@ -118,7 +118,6 @@ jobs:
|
||||
parameters:
|
||||
componentName: extras
|
||||
cmakeBuildDir: '$(Build.SourcesDirectory)/aomp-extras/build'
|
||||
cmakeSourceDir: '$(Build.SourcesDirectory)/aomp-extras'
|
||||
installDir: '$(Build.BinariesDirectory)/llvm'
|
||||
extraBuildFlags: >-
|
||||
-DLLVM_DIR=$(Agent.BuildDirectory)/rocm/llvm
|
||||
@@ -130,7 +129,6 @@ jobs:
|
||||
parameters:
|
||||
componentName: openmp
|
||||
cmakeBuildDir: '$(Build.SourcesDirectory)/llvm-project/openmp/build'
|
||||
cmakeSourceDir: '$(Build.SourcesDirectory)/llvm-project/openmp'
|
||||
installDir: '$(Build.BinariesDirectory)/llvm'
|
||||
extraBuildFlags: >-
|
||||
-DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Build.BinariesDirectory)"
|
||||
@@ -157,7 +155,6 @@ jobs:
|
||||
parameters:
|
||||
componentName: offload
|
||||
cmakeBuildDir: '$(Build.SourcesDirectory)/llvm-project/offload/build'
|
||||
cmakeSourceDir: '$(Build.SourcesDirectory)/llvm-project/offload'
|
||||
installDir: '$(Build.BinariesDirectory)/llvm'
|
||||
extraBuildFlags: >-
|
||||
-DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Build.BinariesDirectory)"
|
||||
|
||||
@@ -92,8 +92,7 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
|
||||
parameters:
|
||||
componentName: external
|
||||
cmakeBuildDir: '$(Build.SourcesDirectory)/deps/build'
|
||||
cmakeSourceDir: '$(Build.SourcesDirectory)/deps'
|
||||
cmakeBuildDir: 'deps/build'
|
||||
installDir: '$(Pipeline.Workspace)/deps-install'
|
||||
extraBuildFlags: >-
|
||||
-DBUILD_BOOST=OFF
|
||||
|
||||
@@ -83,8 +83,7 @@ jobs:
|
||||
-DROCM_LLVM_BACKWARD_COMPAT_LINK=$(Build.BinariesDirectory)/llvm
|
||||
-DROCM_LLVM_BACKWARD_COMPAT_LINK_TARGET=./lib/llvm
|
||||
-GNinja
|
||||
cmakeBuildDir: '$(Build.SourcesDirectory)/llvm/build'
|
||||
cmakeSourceDir: '$(Build.SourcesDirectory)/llvm'
|
||||
cmakeBuildDir: 'llvm/build'
|
||||
installDir: '$(Build.BinariesDirectory)/llvm'
|
||||
# use llvm-lit to run unit tests for llvm, clang, and lld
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
|
||||
@@ -122,8 +121,7 @@ jobs:
|
||||
extraBuildFlags: >-
|
||||
-DCMAKE_PREFIX_PATH="$(Build.SourcesDirectory)/llvm/build"
|
||||
-DCMAKE_BUILD_TYPE=Release
|
||||
cmakeBuildDir: '$(Build.SourcesDirectory)/amd/device-libs/build'
|
||||
cmakeSourceDir: '$(Build.SourcesDirectory)/amd/device-libs'
|
||||
cmakeBuildDir: 'amd/device-libs/build'
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
|
||||
parameters:
|
||||
componentName: comgr
|
||||
@@ -131,8 +129,7 @@ jobs:
|
||||
-DCMAKE_PREFIX_PATH="$(Build.SourcesDirectory)/llvm/build;$(Build.SourcesDirectory)/amd/device-libs/build"
|
||||
-DCOMGR_DISABLE_SPIRV=1
|
||||
-DCMAKE_BUILD_TYPE=Release
|
||||
cmakeBuildDir: '$(Build.SourcesDirectory)/amd/comgr/build'
|
||||
cmakeSourceDir: '$(Build.SourcesDirectory)/amd/comgr'
|
||||
cmakeBuildDir: 'amd/comgr/build'
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
|
||||
parameters:
|
||||
componentName: comgr
|
||||
@@ -145,8 +142,7 @@ jobs:
|
||||
extraBuildFlags: >-
|
||||
-DCMAKE_BUILD_TYPE=Release
|
||||
-DHIPCC_BACKWARD_COMPATIBILITY=OFF
|
||||
cmakeBuildDir: '$(Build.SourcesDirectory)/amd/hipcc/build'
|
||||
cmakeSourceDir: '$(Build.SourcesDirectory)/amd/hipcc'
|
||||
cmakeBuildDir: 'amd/hipcc/build'
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
|
||||
|
||||
@@ -105,7 +105,6 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
|
||||
parameters:
|
||||
cmakeBuildDir: $(Build.SourcesDirectory)/grpc/build
|
||||
cmakeSourceDir: $(Build.SourcesDirectory)/grpc
|
||||
installDir: $(Build.SourcesDirectory)/bin
|
||||
extraBuildFlags: >-
|
||||
-DgRPC_INSTALL=ON
|
||||
|
||||
@@ -125,7 +125,6 @@ jobs:
|
||||
parameters:
|
||||
componentName: PyBind11
|
||||
cmakeBuildDir: '$(Build.SourcesDirectory)/pybind11/build'
|
||||
cmakeSourceDir: '$(Build.SourcesDirectory)/pybind11'
|
||||
customInstallPath: false
|
||||
installEnabled: false
|
||||
extraBuildFlags: >-
|
||||
@@ -142,7 +141,6 @@ jobs:
|
||||
parameters:
|
||||
componentName: RapidJSON
|
||||
cmakeBuildDir: '$(Build.SourcesDirectory)/rapidjson/build'
|
||||
cmakeSourceDir: '$(Build.SourcesDirectory)/rapidjson'
|
||||
customInstallPath: false
|
||||
installEnabled: false
|
||||
extraBuildFlags: >-
|
||||
@@ -202,6 +200,7 @@ jobs:
|
||||
value: $(Agent.BuildDirectory)/rocm/include/rocal
|
||||
pool:
|
||||
name: ${{ job.target }}_test_pool
|
||||
demands: firstRenderDeviceAccess
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
|
||||
@@ -108,6 +108,7 @@ jobs:
|
||||
value: $(Agent.BuildDirectory)/rocm
|
||||
pool:
|
||||
name: ${{ job.target }}_test_pool
|
||||
demands: firstRenderDeviceAccess
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
|
||||
@@ -114,6 +114,7 @@ jobs:
|
||||
value: $(Agent.BuildDirectory)/rocm
|
||||
pool:
|
||||
name: ${{ job.target }}_test_pool
|
||||
demands: firstRenderDeviceAccess
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
|
||||
@@ -5,12 +5,6 @@ parameters:
|
||||
- name: checkoutRef
|
||||
type: string
|
||||
default: ''
|
||||
- name: sparseCheckout
|
||||
type: boolean
|
||||
default: false
|
||||
- name: sparseCheckoutDir
|
||||
type: string
|
||||
default: ''
|
||||
# set to true if doing full build of ROCm stack
|
||||
# and dependencies are pulled from same pipeline
|
||||
- name: aggregatePipeline
|
||||
@@ -72,8 +66,6 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
|
||||
parameters:
|
||||
checkoutRepo: ${{ parameters.checkoutRepo }}
|
||||
sparseCheckout: ${{ parameters.sparseCheckout }}
|
||||
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
|
||||
parameters:
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
|
||||
@@ -168,6 +168,7 @@ jobs:
|
||||
value: $(Agent.BuildDirectory)/rocm
|
||||
pool:
|
||||
name: ${{ job.target }}_test_pool
|
||||
demands: firstRenderDeviceAccess
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
|
||||
@@ -105,7 +105,6 @@ jobs:
|
||||
-DLAPACKE=OFF
|
||||
-GNinja
|
||||
cmakeBuildDir: '$(Build.SourcesDirectory)/lapack/build'
|
||||
cmakeSourceDir: '$(Build.SourcesDirectory)/lapack'
|
||||
installDir: '$(Pipeline.Workspace)/deps-install'
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
|
||||
parameters:
|
||||
|
||||
@@ -167,6 +167,7 @@ jobs:
|
||||
value: $(Agent.BuildDirectory)/rocm
|
||||
pool:
|
||||
name: ${{ job.target }}_test_pool
|
||||
demands: firstRenderDeviceAccess
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
|
||||
@@ -38,7 +38,6 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
|
||||
parameters:
|
||||
cmakeBuildDir: $(Agent.BuildDirectory)/grpc/build
|
||||
cmakeSourceDir: $(Agent.BuildDirectory)/grpc
|
||||
extraBuildFlags: >-
|
||||
-DgRPC_INSTALL=ON
|
||||
-DgRPC_BUILD_TESTS=OFF
|
||||
|
||||
@@ -38,7 +38,6 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
|
||||
parameters:
|
||||
cmakeBuildDir: $(Agent.BuildDirectory)/googletest/build
|
||||
cmakeSourceDir: $(Agent.BuildDirectory)/googletest
|
||||
extraBuildFlags: >-
|
||||
-DGTEST_FORCE_SHARED_CRT=ON
|
||||
-DCMAKE_DEBUG_POSTFIX=d
|
||||
|
||||
@@ -10,10 +10,10 @@ parameters:
|
||||
default: ''
|
||||
- name: cmakeBuildDir
|
||||
type: string
|
||||
default: $(Agent.BuildDirectory)/s/build
|
||||
default: 'build'
|
||||
- name: cmakeSourceDir
|
||||
type: string
|
||||
default: $(Agent.BuildDirectory)/s
|
||||
default: '..'
|
||||
- name: customBuildTarget
|
||||
type: string
|
||||
default: ''
|
||||
@@ -46,7 +46,7 @@ steps:
|
||||
${{ if eq(parameters.customInstallPath, true) }}:
|
||||
cmakeArgs: -DCMAKE_INSTALL_PREFIX=${{ parameters.installDir }} ${{ parameters.extraBuildFlags }} ${{ parameters.cmakeSourceDir }}
|
||||
${{ else }}:
|
||||
cmakeArgs: ${{ parameters.extraBuildFlags }} ${{ parameters.cmakeSourceDir }}
|
||||
cmakeArgs: ${{ parameters.extraBuildFlags }} ..
|
||||
- ${{ if parameters.printDiskSpace }}:
|
||||
- script: df -h
|
||||
displayName: Disk space before build
|
||||
|
||||
@@ -4,12 +4,6 @@ parameters:
|
||||
- name: checkoutRepo
|
||||
type: string
|
||||
default: 'self'
|
||||
- name: sparseCheckout
|
||||
type: boolean
|
||||
default: false
|
||||
- name: sparseCheckoutDir
|
||||
type: string
|
||||
default: ''
|
||||
# submodule download behaviour
|
||||
# change to 'recursive' for repos with submodules
|
||||
- name: submoduleBehaviour
|
||||
@@ -21,13 +15,3 @@ steps:
|
||||
clean: true
|
||||
submodules: ${{ parameters.submoduleBehaviour }}
|
||||
retryCountOnTaskFailure: 3
|
||||
fetchFilter: blob:none
|
||||
${{ if eq(parameters.sparseCheckout, true) }}:
|
||||
sparseCheckoutDirectories: ${{ parameters.sparseCheckoutDir }}
|
||||
path: sparse
|
||||
- ${{ if eq(parameters.sparseCheckout, true) }}:
|
||||
- task: Bash@3
|
||||
displayName: Symlink sparse checkout
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: ln -s $(Agent.BuildDirectory)/sparse/${{ parameters.sparseCheckoutDir }} $(Agent.BuildDirectory)/s
|
||||
|
||||
@@ -106,7 +106,6 @@ parameters:
|
||||
type: object
|
||||
default:
|
||||
- gfx90a
|
||||
- gfx942
|
||||
|
||||
steps:
|
||||
# these steps should only be run if there was a failure or warning
|
||||
|
||||
@@ -34,7 +34,6 @@ Autocast
|
||||
BARs
|
||||
BLAS
|
||||
BMC
|
||||
BabelStream
|
||||
Blit
|
||||
Blockwise
|
||||
Bluefield
|
||||
@@ -139,7 +138,6 @@ GDR
|
||||
GDS
|
||||
GEMM
|
||||
GEMMs
|
||||
GFLOPS
|
||||
GFortran
|
||||
GFXIP
|
||||
Gemma
|
||||
@@ -643,7 +641,6 @@ hipSPARSELt
|
||||
hipTensor
|
||||
hipamd
|
||||
hipblas
|
||||
hipcc
|
||||
hipcub
|
||||
hipfft
|
||||
hipfort
|
||||
|
||||
@@ -127,7 +127,6 @@ bash install-prerequisites.sh
|
||||
export GPU_ARCHS="gfx942" # Example
|
||||
export GPU_ARCHS="gfx940;gfx941;gfx942" # Example
|
||||
|
||||
cd ~/WORKSPACE/
|
||||
# Pick and run build commands in the docker container:
|
||||
# Build rocm-dev packages
|
||||
make -f ROCm/tools/rocm-build/ROCm.mk -j ${NPROC:-$(nproc)} rocm-dev
|
||||
|
||||
@@ -496,7 +496,7 @@ Modules for JAX extensions.
|
||||
- 5.5.0
|
||||
|
||||
Unsupported JAX features
|
||||
===============================================================================
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
The following GPU-accelerated JAX features are not supported by ROCm for
|
||||
the listed supported JAX versions.
|
||||
|
||||
@@ -51,8 +51,6 @@ article_pages = [
|
||||
{"file": "how-to/deep-learning-rocm", "os": ["linux"]},
|
||||
|
||||
{"file": "how-to/rocm-for-ai/index", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/install", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/system-health-check", "os": ["linux"]},
|
||||
|
||||
{"file": "how-to/rocm-for-ai/training/index", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/train-a-model", "os": ["linux"]},
|
||||
@@ -69,6 +67,7 @@ article_pages = [
|
||||
{"file": "how-to/rocm-for-ai/fine-tuning/multi-gpu-fine-tuning-and-inference", "os": ["linux"]},
|
||||
|
||||
{"file": "how-to/rocm-for-ai/inference/index", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/install", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/hugging-face-models", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/llm-inference-frameworks", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/vllm-benchmark", "os": ["linux"]},
|
||||
|
||||
@@ -30,7 +30,7 @@ ROCm supports multiple :doc:`installation methods <rocm-install-on-linux:install
|
||||
|
||||
* :doc:`Using the AMDGPU installer <rocm-install-on-linux:install/amdgpu-install>`
|
||||
|
||||
* :ref:`Multi-version installation <rocm-install-on-linux:installation-types>`
|
||||
* :ref:`Multi-version installation <rocm-install-on-linux:installation-types>`.
|
||||
|
||||
.. grid:: 1
|
||||
|
||||
@@ -59,8 +59,4 @@ images with the framework pre-installed.
|
||||
|
||||
* :doc:`JAX for ROCm <rocm-install-on-linux:install/3rd-party/jax-install>`
|
||||
|
||||
Next steps
|
||||
==========
|
||||
|
||||
After installing ROCm and your desired ML libraries -- and before running AI workloads -- conduct system health benchmarks
|
||||
to test the optimal performance of your AMD hardware. See :doc:`system-health-check` to get started.
|
||||
The sections that follow in :doc:`Training a model <../training/train-a-model>` are geared for a ROCm with PyTorch installation.
|
||||
@@ -62,52 +62,47 @@ PyTorch inference performance testing
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
System validation
|
||||
=================
|
||||
Getting started
|
||||
===============
|
||||
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
Use the following procedures to reproduce the benchmark results on an
|
||||
MI300X series accelerator with the prebuilt PyTorch Docker image.
|
||||
|
||||
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
|
||||
might hang until the periodic balancing is finalized. For more information,
|
||||
see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
|
||||
.. _pytorch-benchmark-get-started:
|
||||
|
||||
.. code-block:: shell
|
||||
1. Disable NUMA auto-balancing.
|
||||
|
||||
# disable automatic NUMA balancing
|
||||
sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
|
||||
# check if NUMA balancing is disabled (returns 0 if disabled)
|
||||
cat /proc/sys/kernel/numa_balancing
|
||||
0
|
||||
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
|
||||
might hang until the periodic balancing is finalized. For more information,
|
||||
see :ref:`AMD Instinct MI300X system optimization <mi300x-disable-numa>`.
|
||||
|
||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
.. code-block:: shell
|
||||
|
||||
Pull the Docker image
|
||||
=====================
|
||||
# disable automatic NUMA balancing
|
||||
sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
|
||||
# check if NUMA balancing is disabled (returns 0 if disabled)
|
||||
cat /proc/sys/kernel/numa_balancing
|
||||
0
|
||||
|
||||
.. container:: model-doc pyt_chai1_inference
|
||||
|
||||
Use the following command to pull the `ROCm PyTorch Docker image <https://hub.docker.com/layers/rocm/pytorch/rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0_triton_llvm_reg_issue/images/sha256-b736a4239ab38a9d0e448af6d4adca83b117debed00bfbe33846f99c4540f79b>`_ from Docker Hub.
|
||||
2. Use the following command to pull the `ROCm PyTorch Docker image <https://hub.docker.com/layers/rocm/pytorch/rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0_triton_llvm_reg_issue/images/sha256-b736a4239ab38a9d0e448af6d4adca83b117debed00bfbe33846f99c4540f79b>`_ from Docker Hub.
|
||||
|
||||
.. code-block:: shell
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0_triton_llvm_reg_issue
|
||||
docker pull rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0_triton_llvm_reg_issue
|
||||
|
||||
.. note::
|
||||
.. note::
|
||||
|
||||
The Chai-1 benchmark uses a specifically selected Docker image using ROCm 6.2.3 and PyTorch 2.3.0 to address an accuracy issue.
|
||||
The Chai-1 benchmark uses a specifically selected Docker image using ROCm 6.2.3 and PyTorch 2.3.0 to address an accuracy issue.
|
||||
|
||||
.. container:: model-doc pyt_clip_inference
|
||||
|
||||
Use the following command to pull the `ROCm PyTorch Docker image <https://hub.docker.com/layers/rocm/pytorch/latest/images/sha256-05b55983e5154f46e7441897d0908d79877370adca4d1fff4899d9539d6c4969>`_ from Docker Hub.
|
||||
2. Use the following command to pull the `ROCm PyTorch Docker image <https://hub.docker.com/layers/rocm/pytorch/latest/images/sha256-05b55983e5154f46e7441897d0908d79877370adca4d1fff4899d9539d6c4969>`_ from Docker Hub.
|
||||
|
||||
.. code-block:: shell
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull rocm/pytorch:latest
|
||||
|
||||
.. _pytorch-benchmark-get-started:
|
||||
docker pull rocm/pytorch:latest
|
||||
|
||||
Benchmarking
|
||||
============
|
||||
|
||||
@@ -111,37 +111,35 @@ vLLM inference performance testing
|
||||
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
|
||||
see the developer's guide at `<https://github.com/ROCm/vllm/blob/main/docs/dev-docker/README.md>`__.
|
||||
|
||||
System validation
|
||||
=================
|
||||
Getting started
|
||||
===============
|
||||
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
Use the following procedures to reproduce the benchmark results on an
|
||||
MI300X accelerator with the prebuilt vLLM Docker image.
|
||||
|
||||
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
|
||||
might hang until the periodic balancing is finalized. For more information,
|
||||
see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
|
||||
.. _vllm-benchmark-get-started:
|
||||
|
||||
.. code-block:: shell
|
||||
1. Disable NUMA auto-balancing.
|
||||
|
||||
# disable automatic NUMA balancing
|
||||
sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
|
||||
# check if NUMA balancing is disabled (returns 0 if disabled)
|
||||
cat /proc/sys/kernel/numa_balancing
|
||||
0
|
||||
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
|
||||
might hang until the periodic balancing is finalized. For more information,
|
||||
see :ref:`AMD Instinct MI300X system optimization <mi300x-disable-numa>`.
|
||||
|
||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
.. code-block:: shell
|
||||
|
||||
Pull the Docker image
|
||||
=====================
|
||||
# disable automatic NUMA balancing
|
||||
sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
|
||||
# check if NUMA balancing is disabled (returns 0 if disabled)
|
||||
cat /proc/sys/kernel/numa_balancing
|
||||
0
|
||||
|
||||
Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
|
||||
Use the following command to pull the Docker image from Docker Hub.
|
||||
2. Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
|
||||
|
||||
.. code-block:: shell
|
||||
Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
docker pull {{ unified_docker.pull_tag }}
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ unified_docker.pull_tag }}
|
||||
|
||||
Benchmarking
|
||||
============
|
||||
|
||||
@@ -1,104 +0,0 @@
|
||||
.. meta::
|
||||
:description: System health checks with RVS, RCCL tests, BabelStream, and TransferBench to validate AMD hardware performance running AI workloads.
|
||||
:keywords: gpu, accelerator, system, health, validation, bench, perf, performance, rvs, rccl, babel, mi300x, mi325x, flops, bandwidth, rbt, training, inference
|
||||
|
||||
.. _rocm-for-ai-system-health-bench:
|
||||
|
||||
************************
|
||||
System health benchmarks
|
||||
************************
|
||||
|
||||
Before running AI workloads, it is important to validate that your AMD hardware is configured correctly and is performing optimally. This topic outlines several system health benchmarks you can use to test key aspects like GPU compute capabilities (FLOPS), memory bandwidth, and interconnect performance. Many of these tests are part of the ROCm Validation Suite (RVS).
|
||||
|
||||
ROCm Validation Suite (RVS) tests
|
||||
=================================
|
||||
|
||||
RVS provides a collection of tests, benchmarks, and qualification tools, each
|
||||
targeting a specific subsystem of the system under test. It includes tests for
|
||||
GPU stress and memory bandwidth.
|
||||
|
||||
.. _healthcheck-install-rvs:
|
||||
|
||||
Install ROCm Validation Suite
|
||||
-----------------------------
|
||||
|
||||
To get started, install RVS. For example, on an Ubuntu system with ROCm already
|
||||
installed, run the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
sudo apt update
|
||||
sudo apt install rocm-validation-suite
|
||||
|
||||
See the `ROCm Validation Suite installation instructions <https://rocm.docs.amd.com/projects/ROCmValidationSuite/en/latest/install/installation.html>`_,
|
||||
and `System validation tests <https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/system-validation.html#system-validation-tests>`_
|
||||
in the Instinct documentation for more detailed instructions.
|
||||
|
||||
Benchmark, stress, and qualification tests
|
||||
------------------------------------------
|
||||
|
||||
The GPU stress test runs various GEMM computations as workloads to stress the GPU FLOPS performance and check whether it
|
||||
meets the configured target GFLOPS.
|
||||
|
||||
Run the benchmark, stress, and qualification tests included with RVS. See the `Benchmark, stress, qualification
|
||||
<https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/system-validation.html#benchmark-stress-qualification>`_
|
||||
section of the Instinct documentation for usage instructions.
|
||||
|
||||
BabelStream test
|
||||
----------------
|
||||
|
||||
BabelStream is a synthetic GPU benchmark based on the STREAM benchmark for
|
||||
CPUs, measuring memory transfer rates to and from global device memory.
|
||||
BabelStream tests are included with the RVS package as part of the `BABEL module
|
||||
<https://rocm.docs.amd.com/projects/ROCmValidationSuite/en/latest/conceptual/rvs-modules.html#babel-benchmark-test-babel-module>`_.
|
||||
|
||||
For more information, see `Performance benchmarking
|
||||
<https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/performance-bench.html#babelstream-benchmarking-results>`_
|
||||
in the Instinct documentation.
|
||||
|
||||
RCCL tests
|
||||
==========
|
||||
|
||||
The ROCm Communication Collectives Library (RCCL) enables efficient multi-GPU
|
||||
communication. The `<https://github.com/ROCm/rccl-tests>`__ suite benchmarks
|
||||
the performance and verifies the correctness of these collective operations.
|
||||
This helps ensure optimal scaling for multi-accelerator tasks.
|
||||
|
||||
1. To get started, build RCCL-tests using the official instructions in the README at
|
||||
`<https://github.com/ROCm/rccl-tests?tab=readme-ov-file#build>`__ or use the
|
||||
following commands:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/rccl-tests.git
|
||||
cd rccl-tests
|
||||
make
|
||||
|
||||
2. Run the suggested RCCL tests -- see `RCCL benchmarking
|
||||
<https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/performance-bench.html#rccl-benchmarking-results>`_
|
||||
in the Instinct performance benchmarking documentation for instructions.
|
||||
|
||||
TransferBench test
|
||||
==================
|
||||
|
||||
TransferBench is a standalone utility for benchmarking simultaneous data
|
||||
transfer performance between various devices in the system, including
|
||||
CPU-to-GPU and GPU-to-GPU (peer-to-peer). This helps identify potential
|
||||
bottlenecks in data movement between the host system and the GPUs, or between
|
||||
GPUs, which can impact end-to-end latency.
|
||||
|
||||
.. _healthcheck-install-transferbench:
|
||||
|
||||
1. To get started, use the instructions in the `TransferBench documentation
|
||||
<https://rocm.docs.amd.com/projects/TransferBench/en/latest/install/install.html#install-transferbench>`_
|
||||
or use the following commands:
|
||||
|
||||
.. code:: shell
|
||||
|
||||
git clone https://github.com/ROCm/TransferBench.git
|
||||
cd TransferBench
|
||||
CC=hipcc make
|
||||
|
||||
2. Run the suggested TransferBench tests -- see `TransferBench benchmarking
|
||||
<https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/performance-bench.html#transferbench-benchmarking-results>`_
|
||||
in the Instinct performance benchmarking documentation for instructions.
|
||||
@@ -79,18 +79,11 @@ across different input sequences. Support for packed input format is planned for
|
||||
System validation
|
||||
=================
|
||||
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
|
||||
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||
If you have already validated your system settings, including NUMA
|
||||
auto-balancing, skip this step. Otherwise, complete the :ref:`system validation
|
||||
and optimization steps <train-a-model-system-validation>` to set up your system
|
||||
before starting training.
|
||||
|
||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
Environment setup
|
||||
=================
|
||||
|
||||
@@ -182,8 +175,8 @@ with RDMA, skip ahead to :ref:`amd-maxtext-download-docker`.
|
||||
|
||||
.. _amd-maxtext-download-docker:
|
||||
|
||||
Pull the Docker image
|
||||
---------------------
|
||||
Download the Docker image
|
||||
-------------------------
|
||||
|
||||
1. Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
|
||||
@@ -103,18 +103,11 @@ popular AI models.
|
||||
System validation
|
||||
=================
|
||||
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
|
||||
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||
If you have already validated your system settings, including NUMA
|
||||
auto-balancing, skip this step. Otherwise, complete the :ref:`system validation
|
||||
and optimization steps <train-a-model-system-validation>` to set up your system
|
||||
before starting training.
|
||||
|
||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
.. _mi300x-amd-megatron-lm-training:
|
||||
|
||||
Environment setup
|
||||
|
||||
@@ -34,18 +34,11 @@ for MPT-30B with access to detailed logs and performance metrics.
|
||||
System validation
|
||||
=================
|
||||
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
|
||||
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||
If you have already validated your system settings, including NUMA
|
||||
auto-balancing, skip this step. Otherwise, complete the :ref:`system validation
|
||||
and optimization steps <train-a-model-system-validation>` to set up your system
|
||||
before starting training.
|
||||
|
||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
Getting started
|
||||
===============
|
||||
|
||||
|
||||
@@ -77,18 +77,11 @@ popular AI models.
|
||||
System validation
|
||||
=================
|
||||
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
|
||||
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||
If you have already validated your system settings, including NUMA
|
||||
auto-balancing, skip this step. Otherwise, complete the :ref:`system validation
|
||||
and optimization steps <train-a-model-system-validation>` to set up your system
|
||||
before starting training.
|
||||
|
||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
This Docker image is optimized for specific model configurations outlined
|
||||
below. Performance can vary for other training workloads, as AMD
|
||||
doesn’t validate configurations and run conditions outside those described.
|
||||
|
||||
@@ -21,12 +21,8 @@ In this guide, you'll learn about:
|
||||
|
||||
- Training a model
|
||||
|
||||
- :doc:`With Megatron-LM <benchmark-docker/megatron-lm>`
|
||||
- :doc:`Train a model with Megatron-LM <benchmark-docker/megatron-lm>`
|
||||
|
||||
- :doc:`With PyTorch <benchmark-docker/pytorch-training>`
|
||||
|
||||
- :doc:`With JAX MaxText <benchmark-docker/jax-maxtext>`
|
||||
|
||||
- :doc:`With LLM Foundry <benchmark-docker/mpt-llm-foundry>`
|
||||
- :doc:`Train a model with PyTorch <benchmark-docker/pytorch-training>`
|
||||
|
||||
- :doc:`Scaling model training <scale-model-training>`
|
||||
|
||||
@@ -5,13 +5,12 @@
|
||||
:keywords: ROCm, AI, LLM, train, megatron, Llama, tutorial, docker, torch, pytorch, jax
|
||||
|
||||
.. _train-a-model-system-validation:
|
||||
.. _rocm-for-ai-system-optimization:
|
||||
|
||||
**********************************************************
|
||||
Prerequisite system validation before running AI workloads
|
||||
**********************************************************
|
||||
**********************************************
|
||||
Prerequisite system validation before training
|
||||
**********************************************
|
||||
|
||||
Complete the following system validation and optimization steps to set up your system before starting training and inference.
|
||||
Complete the following system validation and optimization steps to set up your system before starting training.
|
||||
|
||||
Disable NUMA auto-balancing
|
||||
---------------------------
|
||||
@@ -27,8 +26,7 @@ the output is ``1``, run the following command to disable NUMA auto-balancing.
|
||||
|
||||
sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
|
||||
|
||||
See `Disable NUMA auto-balancing <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#disable-numa-auto-balancing>`_
|
||||
in the Instinct documentation for more information.
|
||||
See :ref:`mi300x-disable-numa` for more information.
|
||||
|
||||
Hardware verification with ROCm
|
||||
-------------------------------
|
||||
@@ -44,8 +42,7 @@ Run the command:
|
||||
|
||||
rocm-smi --setperfdeterminism 1900
|
||||
|
||||
See `Hardware verfication for ROCm <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#hardware-verification-with-rocm>`_
|
||||
in the Instinct documentation for more information.
|
||||
See :ref:`mi300x-hardware-verification-with-rocm` for more information.
|
||||
|
||||
RCCL Bandwidth Test for multi-node setups
|
||||
-----------------------------------------
|
||||
|
||||
@@ -36,10 +36,6 @@ subtrees:
|
||||
title: Use ROCm for AI
|
||||
subtrees:
|
||||
- entries:
|
||||
- file: how-to/rocm-for-ai/install.rst
|
||||
title: Installation
|
||||
- file: how-to/rocm-for-ai/system-health-check.rst
|
||||
title: System health benchmarks
|
||||
- file: how-to/rocm-for-ai/training/index.rst
|
||||
title: Training
|
||||
subtrees:
|
||||
@@ -74,6 +70,8 @@ subtrees:
|
||||
title: Inference
|
||||
subtrees:
|
||||
- entries:
|
||||
- file: how-to/rocm-for-ai/inference/install.rst
|
||||
title: Installation
|
||||
- file: how-to/rocm-for-ai/inference/hugging-face-models.rst
|
||||
title: Run models from Hugging Face
|
||||
- file: how-to/rocm-for-ai/inference/llm-inference-frameworks.rst
|
||||
|
||||
@@ -115,7 +115,7 @@ $(call adddep,roctracer,${ASAN_DEP} rocr hip_on_rocclr)
|
||||
|
||||
|
||||
# rocm-dev points to all possible last finish components of Stage1 build.
|
||||
rocm-dev-components :=amd_smi_lib aqlprofile comgr dbgapi devicelibs hip_on_rocclr hipcc hipify_clang \
|
||||
rocm-dev-components :=amd_smi_lib aqlprofile aqlprofiletest comgr dbgapi devicelibs hip_on_rocclr hipcc hipify_clang \
|
||||
lightning rocprofiler-compute opencl_on_rocclr openmp_extras rocm_bandwidth_test rocm_smi_lib \
|
||||
rocm-cmake rocm-core rocm-gdb rocminfo rocprofiler-register rocprofiler-sdk rocprofiler-systems \
|
||||
rocprofiler rocr rocr_debug_agent rocrsamples roctracer
|
||||
|
||||
@@ -60,6 +60,7 @@ libfile-find-rule-perl
|
||||
libgflags-dev
|
||||
libglew-dev
|
||||
libgmp-dev
|
||||
libgoogle-glog-dev
|
||||
libgtk2.0-dev
|
||||
libhdf5-serial-dev
|
||||
libjpeg-dev
|
||||
@@ -89,6 +90,7 @@ libsuitesparse-dev
|
||||
libsystemd-dev
|
||||
libtinfo-dev
|
||||
libtool
|
||||
libunwind-dev
|
||||
liburi-encode-perl
|
||||
libva-dev
|
||||
libvirt-clients
|
||||
@@ -96,6 +98,7 @@ libvirt-daemon-system
|
||||
libyaml-cpp-dev
|
||||
libzstd-dev
|
||||
llvm
|
||||
llvm-6.0-dev
|
||||
llvm-dev
|
||||
llvm-runtime
|
||||
mesa-common-dev
|
||||
@@ -109,7 +112,8 @@ pigz
|
||||
pkg-config
|
||||
protobuf-compiler
|
||||
python-is-python3
|
||||
python3-pip-whl
|
||||
python-pip-whl
|
||||
python-yaml
|
||||
python3-dev
|
||||
python3-pip
|
||||
python3-venv
|
||||
|
||||
@@ -17,7 +17,7 @@ git --version
|
||||
|
||||
# venv for python to be able to run pip3 without --break-system-packages
|
||||
python3 -m venv /opt/venv
|
||||
source /opt/venv/bin/activate
|
||||
|
||||
pip3 install CppHeaderParser argparse lxml recommonmark jinja2==3.0.0 \
|
||||
websockets matplotlib numpy scipy minimal msgpack pytest sphinx joblib PyYAML rocm-docs-core cmake==3.25.2 pandas \
|
||||
myst-parser setuptools lit
|
||||
|
||||
@@ -217,7 +217,7 @@ export RCCL_ROOT=$WORK_ROOT/rccl
|
||||
export ROCM_DBGAPI_ROOT=$WORK_ROOT/ROCdbgapi
|
||||
export ROCM_GDB_ROOT=$WORK_ROOT/ROCgdb
|
||||
# export ROCclr_ROOT=$WORK_ROOT/vdi
|
||||
export HIP_ON_ROCclr_ROOT=$WORK_ROOT/hip
|
||||
export HIP_ON_ROCclr_ROOT=$WORK_ROOT/HIP
|
||||
export HIPAMD_ROOT=$WORK_ROOT/hipamd
|
||||
export HIP_CATCH_TESTS_ROOT=$WORK_ROOT/hip-tests
|
||||
# export OPENCL_ON_ROCclr_ROOT=$WORK_ROOT/opencl-on-vdi
|
||||
|
||||
Reference in New Issue
Block a user