mirror of
https://github.com/ROCm/ROCm.git
synced 2026-01-21 04:28:01 -05:00
Compare commits
54 Commits
deep-frame
...
deep-711
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2b83a962a0 | ||
|
|
54bf4c0319 | ||
|
|
4347a11bc4 | ||
|
|
2b7fde505f | ||
|
|
a98d6a5777 | ||
|
|
38b271df55 | ||
|
|
4184d1ee1f | ||
|
|
0786c328c1 | ||
|
|
88ea6072f5 | ||
|
|
b32dcc8570 | ||
|
|
0faa92e922 | ||
|
|
26ae989602 | ||
|
|
4402dc4147 | ||
|
|
5eda438e0a | ||
|
|
049784e1a7 | ||
|
|
f12169c5b7 | ||
|
|
b35d1a0627 | ||
|
|
912618cb08 | ||
|
|
7d2feaa8b1 | ||
|
|
7d0d114994 | ||
|
|
2a65394e32 | ||
|
|
268c1332c9 | ||
|
|
374e0944dc | ||
|
|
512e311041 | ||
|
|
ad4f486635 | ||
|
|
485886712b | ||
|
|
1cd6a14a22 | ||
|
|
a17f04a3b5 | ||
|
|
94de66ef3f | ||
|
|
e5cebe7b4e | ||
|
|
7047cfa19c | ||
|
|
de71bf5fa7 | ||
|
|
0d17c96f7f | ||
|
|
2f8c99f7f0 | ||
|
|
982927e866 | ||
|
|
8f45b791fe | ||
|
|
f7c7587b10 | ||
|
|
96b3c0d4f3 | ||
|
|
d6d4d2ef92 | ||
|
|
8647ebcf76 | ||
|
|
48ca38b0dc | ||
|
|
acbd671e99 | ||
|
|
133a97ec18 | ||
|
|
2d40066f29 | ||
|
|
5d7fdace0e | ||
|
|
7dbcdc5deb | ||
|
|
a966db29ca | ||
|
|
9ea8a48b3a | ||
|
|
9956d72614 | ||
|
|
305d24f486 | ||
|
|
26f6b6b3e1 | ||
|
|
d4cdbd79a3 | ||
|
|
26d1ab7d27 | ||
|
|
272c9f6be3 |
@@ -34,7 +34,6 @@ parameters:
|
|||||||
default:
|
default:
|
||||||
- cmake
|
- cmake
|
||||||
- libnuma-dev
|
- libnuma-dev
|
||||||
- libsimde-dev
|
|
||||||
- mesa-common-dev
|
- mesa-common-dev
|
||||||
- ninja-build
|
- ninja-build
|
||||||
- ocl-icd-libopencl1
|
- ocl-icd-libopencl1
|
||||||
|
|||||||
@@ -39,7 +39,6 @@ parameters:
|
|||||||
- python3
|
- python3
|
||||||
- python3-dev
|
- python3-dev
|
||||||
- python3-pip
|
- python3-pip
|
||||||
- python3-venv
|
|
||||||
- libgtest-dev
|
- libgtest-dev
|
||||||
- libboost-filesystem-dev
|
- libboost-filesystem-dev
|
||||||
- libboost-program-options-dev
|
- libboost-program-options-dev
|
||||||
@@ -47,8 +46,6 @@ parameters:
|
|||||||
type: object
|
type: object
|
||||||
default:
|
default:
|
||||||
- nanobind>=2.0.0
|
- nanobind>=2.0.0
|
||||||
- pytest
|
|
||||||
- pytest-cov
|
|
||||||
- name: rocmDependencies
|
- name: rocmDependencies
|
||||||
type: object
|
type: object
|
||||||
default:
|
default:
|
||||||
@@ -75,10 +72,8 @@ parameters:
|
|||||||
- { os: ubuntu2204, packageManager: apt }
|
- { os: ubuntu2204, packageManager: apt }
|
||||||
- { os: almalinux8, packageManager: dnf }
|
- { os: almalinux8, packageManager: dnf }
|
||||||
testJobs:
|
testJobs:
|
||||||
|
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
|
||||||
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
|
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
|
||||||
# - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
|
|
||||||
# - { os: ubuntu2204, packageManager: apt, target: gfx1151 }
|
|
||||||
# - { os: ubuntu2204, packageManager: apt, target: gfx1201 }
|
|
||||||
- name: downstreamComponentMatrix
|
- name: downstreamComponentMatrix
|
||||||
type: object
|
type: object
|
||||||
default:
|
default:
|
||||||
@@ -121,11 +116,6 @@ jobs:
|
|||||||
parameters:
|
parameters:
|
||||||
dependencyList:
|
dependencyList:
|
||||||
- gtest
|
- gtest
|
||||||
- ${{ if ne(job.os, 'almalinux8') }}:
|
|
||||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
|
|
||||||
parameters:
|
|
||||||
dependencyList:
|
|
||||||
- catch2
|
|
||||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
|
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
|
||||||
parameters:
|
parameters:
|
||||||
checkoutRepo: ${{ parameters.checkoutRepo }}
|
checkoutRepo: ${{ parameters.checkoutRepo }}
|
||||||
@@ -147,7 +137,6 @@ jobs:
|
|||||||
-DORIGAMI_BUILD_SHARED_LIBS=ON
|
-DORIGAMI_BUILD_SHARED_LIBS=ON
|
||||||
-DORIGAMI_ENABLE_PYTHON=ON
|
-DORIGAMI_ENABLE_PYTHON=ON
|
||||||
-DORIGAMI_BUILD_TESTING=ON
|
-DORIGAMI_BUILD_TESTING=ON
|
||||||
-DORIGAMI_ENABLE_FETCH=ON
|
|
||||||
-GNinja
|
-GNinja
|
||||||
- ${{ if ne(job.os, 'almalinux8') }}:
|
- ${{ if ne(job.os, 'almalinux8') }}:
|
||||||
- task: PublishPipelineArtifact@1
|
- task: PublishPipelineArtifact@1
|
||||||
@@ -180,6 +169,7 @@ jobs:
|
|||||||
dependsOn: origami_build_${{ job.os }}
|
dependsOn: origami_build_${{ job.os }}
|
||||||
condition:
|
condition:
|
||||||
and(succeeded(),
|
and(succeeded(),
|
||||||
|
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
|
||||||
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
|
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
|
||||||
eq(${{ parameters.aggregatePipeline }}, False)
|
eq(${{ parameters.aggregatePipeline }}, False)
|
||||||
)
|
)
|
||||||
@@ -190,30 +180,30 @@ jobs:
|
|||||||
workspace:
|
workspace:
|
||||||
clean: all
|
clean: all
|
||||||
steps:
|
steps:
|
||||||
|
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
|
||||||
|
parameters:
|
||||||
|
checkoutRepo: ${{ parameters.checkoutRepo }}
|
||||||
|
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
|
||||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
||||||
parameters:
|
parameters:
|
||||||
aptPackages: ${{ parameters.aptPackages }}
|
aptPackages: ${{ parameters.aptPackages }}
|
||||||
pipModules: ${{ parameters.pipModules }}
|
pipModules: ${{ parameters.pipModules }}
|
||||||
packageManager: ${{ job.packageManager }}
|
packageManager: ${{ job.packageManager }}
|
||||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
|
|
||||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
|
|
||||||
parameters:
|
|
||||||
checkoutRepo: ${{ parameters.checkoutRepo }}
|
|
||||||
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
|
|
||||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
|
|
||||||
parameters:
|
|
||||||
dependencyList:
|
|
||||||
- gtest
|
|
||||||
- ${{ if ne(job.os, 'almalinux8') }}:
|
|
||||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
|
|
||||||
parameters:
|
|
||||||
dependencyList:
|
|
||||||
- catch2
|
|
||||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
|
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
|
||||||
parameters:
|
parameters:
|
||||||
preTargetFilter: ${{ parameters.componentName }}
|
preTargetFilter: ${{ parameters.componentName }}
|
||||||
os: ${{ job.os }}
|
os: ${{ job.os }}
|
||||||
|
- task: DownloadPipelineArtifact@2
|
||||||
|
displayName: 'Download Build Directory Artifact'
|
||||||
|
inputs:
|
||||||
|
artifact: '${{ parameters.componentName }}_${{ job.os }}_build_dir'
|
||||||
|
path: '$(Agent.BuildDirectory)/s/build'
|
||||||
|
- task: DownloadPipelineArtifact@2
|
||||||
|
displayName: 'Download Python Source Artifact'
|
||||||
|
inputs:
|
||||||
|
artifact: '${{ parameters.componentName }}_${{ job.os }}_python_src'
|
||||||
|
path: '$(Agent.BuildDirectory)/s/python'
|
||||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
|
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
|
||||||
parameters:
|
parameters:
|
||||||
checkoutRef: ${{ parameters.checkoutRef }}
|
checkoutRef: ${{ parameters.checkoutRef }}
|
||||||
@@ -222,72 +212,25 @@ jobs:
|
|||||||
gpuTarget: ${{ job.target }}
|
gpuTarget: ${{ job.target }}
|
||||||
${{ if parameters.triggerDownstreamJobs }}:
|
${{ if parameters.triggerDownstreamJobs }}:
|
||||||
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
|
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
|
||||||
- task: CMake@1
|
|
||||||
displayName: 'Origami Test CMake Configuration'
|
|
||||||
inputs:
|
|
||||||
cmakeArgs: >-
|
|
||||||
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor
|
|
||||||
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
|
|
||||||
-DORIGAMI_BUILD_SHARED_LIBS=ON
|
|
||||||
-DORIGAMI_ENABLE_PYTHON=ON
|
|
||||||
-DORIGAMI_BUILD_TESTING=ON
|
|
||||||
-GNinja
|
|
||||||
$(Agent.BuildDirectory)/s
|
|
||||||
- task: Bash@3
|
|
||||||
displayName: 'Build Origami Tests and Python Bindings'
|
|
||||||
inputs:
|
|
||||||
targetType: inline
|
|
||||||
workingDirectory: build
|
|
||||||
script: |
|
|
||||||
cmake --build . --target origami-tests origami_python -- -j$(nproc)
|
|
||||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
|
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
|
||||||
# Run tests using CTest (discovers and runs both C++ and Python tests)
|
|
||||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
|
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
|
||||||
parameters:
|
parameters:
|
||||||
componentName: ${{ parameters.componentName }}
|
componentName: ${{ parameters.componentName }}
|
||||||
os: ${{ job.os }}
|
os: ${{ job.os }}
|
||||||
testDir: 'build'
|
testDir: '$(Agent.BuildDirectory)/rocm/bin'
|
||||||
testParameters: '--output-on-failure --force-new-ctest-process --output-junit test_output.xml'
|
testExecutable: './origami-tests'
|
||||||
# Test pip install workflow
|
testParameters: '--yaml origami-tests.yaml --gtest_output=xml:./test_output.xml --gtest_color=yes'
|
||||||
# - task: Bash@3
|
- script: |
|
||||||
# displayName: 'Test Pip Install'
|
set -e
|
||||||
# inputs:
|
export PYTHONPATH=$(Agent.BuildDirectory)/s/build/python:$PYTHONPATH
|
||||||
# targetType: inline
|
|
||||||
# script: |
|
|
||||||
# set -e
|
|
||||||
|
|
||||||
# echo "==================================================================="
|
echo "--- Running origami_test.py ---"
|
||||||
# echo "Testing pip install workflow (pip install -e .)"
|
python3 $(Agent.BuildDirectory)/s/python/origami_test.py
|
||||||
# echo "==================================================================="
|
|
||||||
|
|
||||||
# # Set environment variables for pip install CMake build
|
echo "--- Running origami_grid_test.py ---"
|
||||||
# export ROCM_PATH=$(Agent.BuildDirectory)/rocm
|
python3 $(Agent.BuildDirectory)/s/python/origami_grid_test.py
|
||||||
# export CMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm:$(Agent.BuildDirectory)/vendor
|
displayName: 'Run Python Binding Tests'
|
||||||
# export CMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
|
condition: succeeded()
|
||||||
|
|
||||||
# echo "ROCM_PATH: $ROCM_PATH"
|
|
||||||
# echo "CMAKE_PREFIX_PATH: $CMAKE_PREFIX_PATH"
|
|
||||||
# echo "CMAKE_CXX_COMPILER: $CMAKE_CXX_COMPILER"
|
|
||||||
# echo ""
|
|
||||||
|
|
||||||
# # Install from source directory
|
|
||||||
# cd "$(Agent.BuildDirectory)/s/python"
|
|
||||||
# pip install -e .
|
|
||||||
|
|
||||||
# # Verify import works
|
|
||||||
# echo ""
|
|
||||||
# echo "Verifying origami can be imported..."
|
|
||||||
# python3 -c "import origami; print('✓ Successfully imported origami')"
|
|
||||||
|
|
||||||
# # Run pytest on installed package
|
|
||||||
# echo ""
|
|
||||||
# echo "Running pytest tests..."
|
|
||||||
# python3 -m pytest tests/ -v -m "not slow" --tb=short
|
|
||||||
|
|
||||||
# echo ""
|
|
||||||
# echo "==================================================================="
|
|
||||||
# echo "Pip install test completed successfully"
|
|
||||||
# echo "==================================================================="
|
|
||||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||||
parameters:
|
parameters:
|
||||||
aptPackages: ${{ parameters.aptPackages }}
|
aptPackages: ${{ parameters.aptPackages }}
|
||||||
|
|||||||
@@ -30,7 +30,6 @@ parameters:
|
|||||||
- python3-pip
|
- python3-pip
|
||||||
- protobuf-compiler
|
- protobuf-compiler
|
||||||
- libprotoc-dev
|
- libprotoc-dev
|
||||||
- libopencv-dev
|
|
||||||
- name: pipModules
|
- name: pipModules
|
||||||
type: object
|
type: object
|
||||||
default:
|
default:
|
||||||
@@ -65,7 +64,6 @@ parameters:
|
|||||||
- MIVisionX
|
- MIVisionX
|
||||||
- rocm_smi_lib
|
- rocm_smi_lib
|
||||||
- rccl
|
- rccl
|
||||||
- rocAL
|
|
||||||
- rocALUTION
|
- rocALUTION
|
||||||
- rocBLAS
|
- rocBLAS
|
||||||
- rocDecode
|
- rocDecode
|
||||||
@@ -105,7 +103,6 @@ parameters:
|
|||||||
- MIVisionX
|
- MIVisionX
|
||||||
- rocm_smi_lib
|
- rocm_smi_lib
|
||||||
- rccl
|
- rccl
|
||||||
- rocAL
|
|
||||||
- rocALUTION
|
- rocALUTION
|
||||||
- rocBLAS
|
- rocBLAS
|
||||||
- rocDecode
|
- rocDecode
|
||||||
|
|||||||
@@ -36,6 +36,7 @@ Andrej
|
|||||||
Arb
|
Arb
|
||||||
Autocast
|
Autocast
|
||||||
autograd
|
autograd
|
||||||
|
Backported
|
||||||
BARs
|
BARs
|
||||||
BatchNorm
|
BatchNorm
|
||||||
BLAS
|
BLAS
|
||||||
@@ -203,9 +204,11 @@ GenAI
|
|||||||
GenZ
|
GenZ
|
||||||
GitHub
|
GitHub
|
||||||
Gitpod
|
Gitpod
|
||||||
|
hardcoded
|
||||||
HBM
|
HBM
|
||||||
HCA
|
HCA
|
||||||
HGX
|
HGX
|
||||||
|
HLO
|
||||||
HIPCC
|
HIPCC
|
||||||
hipDataType
|
hipDataType
|
||||||
HIPExtension
|
HIPExtension
|
||||||
@@ -333,6 +336,7 @@ MoEs
|
|||||||
Mooncake
|
Mooncake
|
||||||
Mpops
|
Mpops
|
||||||
Multicore
|
Multicore
|
||||||
|
multihost
|
||||||
Multithreaded
|
Multithreaded
|
||||||
mx
|
mx
|
||||||
MXFP
|
MXFP
|
||||||
@@ -1027,6 +1031,7 @@ uncacheable
|
|||||||
uncorrectable
|
uncorrectable
|
||||||
underoptimized
|
underoptimized
|
||||||
unhandled
|
unhandled
|
||||||
|
unfused
|
||||||
uninstallation
|
uninstallation
|
||||||
unmapped
|
unmapped
|
||||||
unsqueeze
|
unsqueeze
|
||||||
|
|||||||
22
RELEASE.md
22
RELEASE.md
@@ -270,26 +270,26 @@ The [ROCm examples repository](https://github.com/ROCm/rocm-examples) has been e
|
|||||||
:margin: auto 0 auto auto
|
:margin: auto 0 auto auto
|
||||||
:::{grid}
|
:::{grid}
|
||||||
:margin: auto 0 auto auto
|
:margin: auto 0 auto auto
|
||||||
* [hipBLASLt](https://rocm.docs.amd.com/projects/hipBLASLt/en/latest/)
|
* [hipBLASLt](https://github.com/ROCm/rocm-examples/tree/amd-staging/Libraries/hipBLASLt)
|
||||||
* [hipSPARSE](https://rocm.docs.amd.com/projects/hipSPARSE/en/latest/)
|
* [hipSPARSE](https://github.com/ROCm/rocm-examples/tree/amd-staging/Libraries/hipSPARSE)
|
||||||
* [hipSPARSELt](https://rocm.docs.amd.com/projects/hipSPARSELt/en/latest/)
|
* [hipSPARSELt](https://github.com/ROCm/rocm-examples/tree/amd-staging/Libraries/hipSPARSELt)
|
||||||
* [hipTensor](https://rocm.docs.amd.com/projects/hipTensor/en/latest/)
|
* [hipTensor](https://github.com/ROCm/rocm-examples/tree/amd-staging/Libraries/hipTensor)
|
||||||
:::
|
:::
|
||||||
:::{grid}
|
:::{grid}
|
||||||
:margin: auto 0 auto auto
|
:margin: auto 0 auto auto
|
||||||
* [rocALUTION](https://rocm.docs.amd.com/projects/rocALUTION/en/latest/)
|
* [rocALUTION](https://github.com/ROCm/rocm-examples/tree/amd-staging/Libraries/rocALUTION)
|
||||||
* [ROCprofiler-SDK](https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/)
|
* [ROCprofiler-SDK](https://github.com/ROCm/rocm-examples/tree/amd-staging/Libraries/rocProfiler-SDK)
|
||||||
* [rocWMMA](https://rocm.docs.amd.com/projects/rocWMMA/en/latest/)
|
* [rocWMMA](https://github.com/ROCm/rocm-examples/tree/amd-staging/Libraries/rocWMMA)
|
||||||
:::
|
:::
|
||||||
::::
|
::::
|
||||||
|
|
||||||
Usage examples are now available for the following performance analysis tools:
|
Usage examples are now available for the following performance analysis tools:
|
||||||
|
|
||||||
* [ROCm Compute Profiler](https://rocm.docs.amd.com/projects/rocprofiler-compute/en/latest/index.html)
|
* [ROCm Compute Profiler](https://github.com/ROCm/rocm-examples/tree/amd-staging/Tools/rocprof-compute)
|
||||||
* [ROCm Systems Profiler](https://rocm.docs.amd.com/projects/rocprofiler-systems/en/latest/index.html)
|
* [ROCm Systems Profiler](https://github.com/ROCm/rocm-examples/tree/amd-staging/Tools/rocprof-systems)
|
||||||
* [rocprofv3](https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/how-to/using-rocprofv3.html)
|
* [rocprofv3](https://github.com/ROCm/rocm-examples/tree/amd-staging/Tools/rocprofv3)
|
||||||
|
|
||||||
The complete source code for the [HIP Graph Tutorial](https://rocm.docs.amd.com/projects/HIP/en/latest/tutorial/graph_api.html) is also available as part of the ROCm examples.
|
The complete source code for the [HIP Graph Tutorial](https://github.com/ROCm/rocm-examples/tree/amd-staging/HIP-Doc/Tutorials/graph_api) is also available as part of the ROCm examples.
|
||||||
|
|
||||||
### ROCm documentation updates
|
### ROCm documentation updates
|
||||||
|
|
||||||
|
|||||||
@@ -269,6 +269,33 @@ For a complete and up-to-date list of JAX public modules (for example, ``jax.num
|
|||||||
JAX API modules are maintained by the JAX project and are subject to change.
|
JAX API modules are maintained by the JAX project and are subject to change.
|
||||||
Refer to the official JAX documentation for the most up-to-date information.
|
Refer to the official JAX documentation for the most up-to-date information.
|
||||||
|
|
||||||
|
Key features and enhancements for ROCm 7.1
|
||||||
|
===============================================================================
|
||||||
|
|
||||||
|
- Enabled compilation of multihost HLO runner Python bindings.
|
||||||
|
|
||||||
|
- Backported multihost HLO runner bindings and some related changes to
|
||||||
|
:code:`FunctionalHloRunner`.
|
||||||
|
|
||||||
|
- Added :code:`requirements_lock_3_12` to enable building for Python 3.12.
|
||||||
|
|
||||||
|
- Removed hardcoded NHWC convolution layout for ``fp16`` precision to address the performance drops for ``fp16`` precision on gfx12xx GPUs.
|
||||||
|
|
||||||
|
|
||||||
|
- ROCprofiler-SDK integration:
|
||||||
|
|
||||||
|
- Integrated ROCprofiler-SDK (v3) into XLA to improve profiling of GPU events,
|
||||||
|
supporting both time-based and step-based profiling.
|
||||||
|
|
||||||
|
- Added unit tests for :code:`rocm_collector` and :code:`rocm_tracer`.
|
||||||
|
|
||||||
|
- Added Triton unsupported conversion from ``f8E4M3FNUZ`` to ``fp16`` with
|
||||||
|
rounding mode.
|
||||||
|
|
||||||
|
- Introduced :code:`CudnnFusedConvDecomposer` to revert fused convolutions
|
||||||
|
when :code:`ConvAlgorithmPicker` fails to find a fused algorithm, and removed
|
||||||
|
unfused fallback paths from :code:`RocmFusedConvRunner`.
|
||||||
|
|
||||||
Key features and enhancements for ROCm 7.0
|
Key features and enhancements for ROCm 7.0
|
||||||
===============================================================================
|
===============================================================================
|
||||||
|
|
||||||
|
|||||||
@@ -268,6 +268,3 @@ html_context = {
|
|||||||
"granularity_type" : [('Coarse-grained', 'coarse-grained'), ('Fine-grained', 'fine-grained')],
|
"granularity_type" : [('Coarse-grained', 'coarse-grained'), ('Fine-grained', 'fine-grained')],
|
||||||
"scope_type" : [('Device', 'device'), ('System', 'system')]
|
"scope_type" : [('Device', 'device'), ('System', 'system')]
|
||||||
}
|
}
|
||||||
|
|
||||||
# Disable figure and table numbering
|
|
||||||
numfig = False
|
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ The table below summarizes information about ROCm-enabled deep learning framewor
|
|||||||
- GitHub
|
- GitHub
|
||||||
|
|
||||||
* - :doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`
|
* - :doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`
|
||||||
- :doc:`Pytorch install <rocm-install-on-linux:install/3rd-party/pytorch-install>`
|
- :doc:`link <rocm-install-on-linux:install/3rd-party/pytorch-install>`
|
||||||
-
|
-
|
||||||
- Docker image
|
- Docker image
|
||||||
- Wheels package
|
- Wheels package
|
||||||
@@ -35,7 +35,7 @@ The table below summarizes information about ROCm-enabled deep learning framewor
|
|||||||
<a href="https://github.com/ROCm/pytorch"><i class="fab fa-github fa-lg"></i></a>
|
<a href="https://github.com/ROCm/pytorch"><i class="fab fa-github fa-lg"></i></a>
|
||||||
|
|
||||||
* - :doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`
|
* - :doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`
|
||||||
- :doc:`TensorFlow install <rocm-install-on-linux:install/3rd-party/tensorflow-install>`
|
- :doc:`link <rocm-install-on-linux:install/3rd-party/tensorflow-install>`
|
||||||
-
|
-
|
||||||
- Docker image
|
- Docker image
|
||||||
- Wheels package
|
- Wheels package
|
||||||
@@ -45,7 +45,7 @@ The table below summarizes information about ROCm-enabled deep learning framewor
|
|||||||
<a href="https://github.com/ROCm/tensorflow-upstream"><i class="fab fa-github fa-lg"></i></a>
|
<a href="https://github.com/ROCm/tensorflow-upstream"><i class="fab fa-github fa-lg"></i></a>
|
||||||
|
|
||||||
* - :doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`
|
* - :doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`
|
||||||
- :doc:`JAX install <rocm-install-on-linux:install/3rd-party/jax-install>`
|
- :doc:`link <rocm-install-on-linux:install/3rd-party/jax-install>`
|
||||||
-
|
-
|
||||||
- Docker image
|
- Docker image
|
||||||
- .. raw:: html
|
- .. raw:: html
|
||||||
@@ -53,7 +53,7 @@ The table below summarizes information about ROCm-enabled deep learning framewor
|
|||||||
<a href="https://github.com/ROCm/jax"><i class="fab fa-github fa-lg"></i></a>
|
<a href="https://github.com/ROCm/jax"><i class="fab fa-github fa-lg"></i></a>
|
||||||
|
|
||||||
* - :doc:`verl <../compatibility/ml-compatibility/verl-compatibility>`
|
* - :doc:`verl <../compatibility/ml-compatibility/verl-compatibility>`
|
||||||
- :doc:`verl install <rocm-install-on-linux:install/3rd-party/verl-install>`
|
- :doc:`link <rocm-install-on-linux:install/3rd-party/verl-install>`
|
||||||
-
|
-
|
||||||
- Docker image
|
- Docker image
|
||||||
- .. raw:: html
|
- .. raw:: html
|
||||||
@@ -61,7 +61,7 @@ The table below summarizes information about ROCm-enabled deep learning framewor
|
|||||||
<a href="https://github.com/ROCm/verl"><i class="fab fa-github fa-lg"></i></a>
|
<a href="https://github.com/ROCm/verl"><i class="fab fa-github fa-lg"></i></a>
|
||||||
|
|
||||||
* - :doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>`
|
* - :doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>`
|
||||||
- :doc:`Stanford Megatron-LM install <rocm-install-on-linux:install/3rd-party/stanford-megatron-lm-install>`
|
- :doc:`link <rocm-install-on-linux:install/3rd-party/stanford-megatron-lm-install>`
|
||||||
-
|
-
|
||||||
- Docker image
|
- Docker image
|
||||||
- .. raw:: html
|
- .. raw:: html
|
||||||
@@ -69,7 +69,7 @@ The table below summarizes information about ROCm-enabled deep learning framewor
|
|||||||
<a href="https://github.com/ROCm/Stanford-Megatron-LM"><i class="fab fa-github fa-lg"></i></a>
|
<a href="https://github.com/ROCm/Stanford-Megatron-LM"><i class="fab fa-github fa-lg"></i></a>
|
||||||
|
|
||||||
* - :doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>`
|
* - :doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>`
|
||||||
- :doc:`DGL install <rocm-install-on-linux:install/3rd-party/dgl-install>`
|
- :doc:`link <rocm-install-on-linux:install/3rd-party/dgl-install>`
|
||||||
-
|
-
|
||||||
- Docker image
|
- Docker image
|
||||||
- .. raw:: html
|
- .. raw:: html
|
||||||
@@ -77,24 +77,15 @@ The table below summarizes information about ROCm-enabled deep learning framewor
|
|||||||
<a href="https://github.com/ROCm/dgl"><i class="fab fa-github fa-lg"></i></a>
|
<a href="https://github.com/ROCm/dgl"><i class="fab fa-github fa-lg"></i></a>
|
||||||
|
|
||||||
* - :doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>`
|
* - :doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>`
|
||||||
- :doc:`Megablocks install <rocm-install-on-linux:install/3rd-party/megablocks-install>`
|
- :doc:`link <rocm-install-on-linux:install/3rd-party/megablocks-install>`
|
||||||
-
|
-
|
||||||
- Docker image
|
- Docker image
|
||||||
- .. raw:: html
|
- .. raw:: html
|
||||||
|
|
||||||
<a href="https://github.com/ROCm/megablocks"><i class="fab fa-github fa-lg"></i></a>
|
<a href="https://github.com/ROCm/megablocks"><i class="fab fa-github fa-lg"></i></a>
|
||||||
|
|
||||||
* - :doc:`Taichi <../compatibility/ml-compatibility/taichi-compatibility>`
|
|
||||||
- `Taichi install <https://rocm.docs.amd.com/projects/taichi/en/latest/install/taichi-install.html>`__
|
|
||||||
-
|
|
||||||
- Docker image
|
|
||||||
- Wheels package
|
|
||||||
- .. raw:: html
|
|
||||||
|
|
||||||
<a href="https://github.com/ROCm/taichi"><i class="fab fa-github fa-lg"></i></a>
|
|
||||||
|
|
||||||
* - :doc:`Ray <../compatibility/ml-compatibility/ray-compatibility>`
|
* - :doc:`Ray <../compatibility/ml-compatibility/ray-compatibility>`
|
||||||
- :doc:`Ray install <rocm-install-on-linux:install/3rd-party/ray-install>`
|
- :doc:`link <rocm-install-on-linux:install/3rd-party/ray-install>`
|
||||||
-
|
-
|
||||||
- Docker image
|
- Docker image
|
||||||
- Wheels package
|
- Wheels package
|
||||||
@@ -104,7 +95,7 @@ The table below summarizes information about ROCm-enabled deep learning framewor
|
|||||||
<a href="https://github.com/ROCm/ray"><i class="fab fa-github fa-lg"></i></a>
|
<a href="https://github.com/ROCm/ray"><i class="fab fa-github fa-lg"></i></a>
|
||||||
|
|
||||||
* - :doc:`llama.cpp <../compatibility/ml-compatibility/llama-cpp-compatibility>`
|
* - :doc:`llama.cpp <../compatibility/ml-compatibility/llama-cpp-compatibility>`
|
||||||
- :doc:`llama.cpp install <rocm-install-on-linux:install/3rd-party/llama-cpp-install>`
|
- :doc:`link <rocm-install-on-linux:install/3rd-party/llama-cpp-install>`
|
||||||
-
|
-
|
||||||
- Docker image
|
- Docker image
|
||||||
- ROCm Base Docker image
|
- ROCm Base Docker image
|
||||||
@@ -113,7 +104,7 @@ The table below summarizes information about ROCm-enabled deep learning framewor
|
|||||||
<a href="https://github.com/ROCm/llama.cpp"><i class="fab fa-github fa-lg"></i></a>
|
<a href="https://github.com/ROCm/llama.cpp"><i class="fab fa-github fa-lg"></i></a>
|
||||||
|
|
||||||
* - :doc:`FlashInfer <../compatibility/ml-compatibility/flashinfer-compatibility>`
|
* - :doc:`FlashInfer <../compatibility/ml-compatibility/flashinfer-compatibility>`
|
||||||
- :doc:`FlashInfer install <rocm-install-on-linux:install/3rd-party/flashinfer-install>`
|
- :doc:`link <rocm-install-on-linux:install/3rd-party/flashinfer-install>`
|
||||||
-
|
-
|
||||||
- Docker image
|
- Docker image
|
||||||
- ROCm Base Docker image
|
- ROCm Base Docker image
|
||||||
|
|||||||
@@ -44,7 +44,7 @@ Setting up the base implementation environment
|
|||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
rocm-smi --showproductname
|
amd-smi static --board
|
||||||
|
|
||||||
#. Check that your GPUs are available to PyTorch.
|
#. Check that your GPUs are available to PyTorch.
|
||||||
|
|
||||||
@@ -65,8 +65,8 @@ Setting up the base implementation environment
|
|||||||
|
|
||||||
.. tip::
|
.. tip::
|
||||||
|
|
||||||
During training and inference, you can check the memory usage by running the ``rocm-smi`` command in your terminal.
|
During training and inference, you can check the memory usage by running the ``amd-smi`` command in your terminal.
|
||||||
This tool helps you see shows which GPUs are involved.
|
This tool helps you see which GPUs are involved.
|
||||||
|
|
||||||
|
|
||||||
.. _fine-tuning-llms-multi-gpu-hugging-face-accelerate:
|
.. _fine-tuning-llms-multi-gpu-hugging-face-accelerate:
|
||||||
@@ -130,7 +130,7 @@ After loading the model in this way, the model is fully ready to use the resourc
|
|||||||
torchtune for fine-tuning and inference
|
torchtune for fine-tuning and inference
|
||||||
=============================================
|
=============================================
|
||||||
|
|
||||||
`torchtune <https://meta-pytorch.org/torchtune/main/>`_ is a PyTorch-native library for easy single and multi-GPU
|
`torchtune <https://pytorch.org/torchtune/main/>`_ is a PyTorch-native library for easy single and multi-GPU
|
||||||
model fine-tuning and inference with LLMs.
|
model fine-tuning and inference with LLMs.
|
||||||
|
|
||||||
#. Install torchtune using pip.
|
#. Install torchtune using pip.
|
||||||
|
|||||||
@@ -44,20 +44,19 @@ Setting up the base implementation environment
|
|||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
rocm-smi --showproductname
|
amd-smi static --board
|
||||||
|
|
||||||
Your output should look like this:
|
Your output should look like this:
|
||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
============================ ROCm System Management Interface ============================
|
GPU: 0
|
||||||
====================================== Product Info ======================================
|
BOARD:
|
||||||
GPU[0] : Card Series: AMD Instinct MI300X OAM
|
MODEL_NUMBER: 102-G39203-0B
|
||||||
GPU[0] : Card model: 0x74a1
|
PRODUCT_SERIAL: PCB079220-1150
|
||||||
GPU[0] : Card vendor: Advanced Micro Devices, Inc. [AMD/ATI]
|
FRU_ID: 113-AMDG392030B04-100-300000097H
|
||||||
GPU[0] : Card SKU: MI3SRIOV
|
PRODUCT_NAME: AMD Instinct MI325 OAM
|
||||||
==========================================================================================
|
MANUFACTURER_NAME: AMD
|
||||||
================================== End of ROCm SMI Log ===================================
|
|
||||||
|
|
||||||
#. Check that your GPUs are available to PyTorch.
|
#. Check that your GPUs are available to PyTorch.
|
||||||
|
|
||||||
|
|||||||
@@ -31,16 +31,16 @@ in the Instinct documentation for more information.
|
|||||||
Hardware verification with ROCm
|
Hardware verification with ROCm
|
||||||
-------------------------------
|
-------------------------------
|
||||||
|
|
||||||
Use the command ``rocm-smi --setperfdeterminism 1900`` to set the max clock speed up to 1900 MHz
|
Use the command ``amd-smi set --perf-determinism 1900`` to set the max clock speed up to 1900 MHz
|
||||||
instead of the default 2100 MHz. This can reduce the chance of a PCC event lowering the attainable
|
instead of the default 2100 MHz. This can reduce the chance of a PCC event lowering the attainable
|
||||||
GPU clocks. This setting will not be required for new IFWI releases with the production PRC feature.
|
GPU clocks. This setting will not be required for new IFWI releases with the production PRC feature.
|
||||||
You can restore this setting to its default value with the ``rocm-smi -r`` command.
|
You can restore this setting to its default value with the ``amd-smi reset --clocks`` command.
|
||||||
|
|
||||||
Run the command:
|
Run the command:
|
||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
rocm-smi --setperfdeterminism 1900
|
amd-smi set --perf-determinism 1900
|
||||||
|
|
||||||
See `Hardware verification with ROCm <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#hardware-verification-with-rocm>`_
|
See `Hardware verification with ROCm <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#hardware-verification-with-rocm>`_
|
||||||
in the Instinct documentation for more information.
|
in the Instinct documentation for more information.
|
||||||
|
|||||||
@@ -108,16 +108,16 @@ for more information.
|
|||||||
Hardware verification with ROCm
|
Hardware verification with ROCm
|
||||||
-------------------------------
|
-------------------------------
|
||||||
|
|
||||||
Use the command ``rocm-smi --setperfdeterminism 1900`` to set the max clock speed up to 1900 MHz
|
Use the command ``amd-smi set --perf-determinism 1900`` to set the max clock speed up to 1900 MHz
|
||||||
instead of the default 2100 MHz. This can reduce the chance of a PCC event lowering the attainable
|
instead of the default 2100 MHz. This can reduce the chance of a PCC event lowering the attainable
|
||||||
GPU clocks. This setting will not be required for new IFWI releases with the production PRC feature.
|
GPU clocks. This setting will not be required for new IFWI releases with the production PRC feature.
|
||||||
You can restore this setting to its default value with the ``rocm-smi -r`` command.
|
You can restore this setting to its default value with the ``amd-smi reset --clocks`` command.
|
||||||
|
|
||||||
Run the command:
|
Run the command:
|
||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
rocm-smi --setperfdeterminism 1900
|
amd-smi set --perf-determinism 1900
|
||||||
|
|
||||||
See `Hardware verification with ROCm <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#hardware-verification-with-rocm>`_ for more information.
|
See `Hardware verification with ROCm <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#hardware-verification-with-rocm>`_ for more information.
|
||||||
|
|
||||||
|
|||||||
@@ -5,7 +5,7 @@
|
|||||||
GPU hardware specifications
|
GPU hardware specifications
|
||||||
===========================================
|
===========================================
|
||||||
|
|
||||||
The following tables provide an overview of the hardware specifications for AMD Instinct™ GPUs, and AMD Radeon™ PRO and Radeon™ GPUs.
|
The following tables provide an overview of the hardware specifications for AMD Instinct™ GPUs, AMD Radeon™ PRO and Radeon™ GPUs, and AMD Ryzen™ APUs.
|
||||||
|
|
||||||
For more information about ROCm hardware compatibility, see the ROCm `Compatibility matrix <https://rocm.docs.amd.com/en/latest/compatibility/compatibility-matrix.html>`_.
|
For more information about ROCm hardware compatibility, see the ROCm `Compatibility matrix <https://rocm.docs.amd.com/en/latest/compatibility/compatibility-matrix.html>`_.
|
||||||
|
|
||||||
@@ -18,7 +18,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
|||||||
:name: instinct-arch-spec-table
|
:name: instinct-arch-spec-table
|
||||||
|
|
||||||
*
|
*
|
||||||
- Model
|
- Name
|
||||||
- Architecture
|
- Architecture
|
||||||
- LLVM target name
|
- LLVM target name
|
||||||
- VRAM (GiB)
|
- VRAM (GiB)
|
||||||
@@ -297,7 +297,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
|||||||
:name: radeon-pro-arch-spec-table
|
:name: radeon-pro-arch-spec-table
|
||||||
|
|
||||||
*
|
*
|
||||||
- Model
|
- Name
|
||||||
- Architecture
|
- Architecture
|
||||||
- LLVM target name
|
- LLVM target name
|
||||||
|
|
||||||
@@ -539,7 +539,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
|||||||
:name: radeon-arch-spec-table
|
:name: radeon-arch-spec-table
|
||||||
|
|
||||||
*
|
*
|
||||||
- Model
|
- Name
|
||||||
- Architecture
|
- Architecture
|
||||||
- LLVM target name
|
- LLVM target name
|
||||||
- VRAM (GiB)
|
- VRAM (GiB)
|
||||||
@@ -953,6 +953,127 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
|||||||
- 9
|
- 9
|
||||||
- 0
|
- 0
|
||||||
|
|
||||||
|
.. tab-item:: AMD Ryzen APUs
|
||||||
|
|
||||||
|
.. list-table::
|
||||||
|
:header-rows: 1
|
||||||
|
:name: ryzen-arch-spec-table
|
||||||
|
|
||||||
|
*
|
||||||
|
- Name
|
||||||
|
- Graphics model
|
||||||
|
- Architecture
|
||||||
|
- LLVM target name
|
||||||
|
- VRAM (GiB)
|
||||||
|
- Compute Units
|
||||||
|
- Wavefront Size
|
||||||
|
- LDS (KiB)
|
||||||
|
- Infinity Cache (MiB)
|
||||||
|
- L2 Cache (MiB)
|
||||||
|
- Graphics L1 Cache (KiB)
|
||||||
|
- L0 Vector Cache (KiB)
|
||||||
|
- L0 Scalar Cache (KiB)
|
||||||
|
- L0 Instruction Cache (KiB)
|
||||||
|
- VGPR File (KiB)
|
||||||
|
- SGPR File (KiB)
|
||||||
|
- GFXIP Major version
|
||||||
|
- GFXIP Minor version
|
||||||
|
*
|
||||||
|
- AMD Ryzen 7 7840U
|
||||||
|
- Radeon 780M
|
||||||
|
- RDNA3
|
||||||
|
- gfx1103
|
||||||
|
- Dynamic + carveout
|
||||||
|
- 12
|
||||||
|
- 32 or 64
|
||||||
|
- 128
|
||||||
|
- N/A
|
||||||
|
- 2
|
||||||
|
- 256
|
||||||
|
- 32
|
||||||
|
- 16
|
||||||
|
- 32
|
||||||
|
- 512
|
||||||
|
- 32
|
||||||
|
- 11
|
||||||
|
- 0
|
||||||
|
*
|
||||||
|
- AMD Ryzen 9 270
|
||||||
|
- Radeon 780M
|
||||||
|
- RDNA3
|
||||||
|
- gfx1103
|
||||||
|
- Dynamic + carveout
|
||||||
|
- 12
|
||||||
|
- 32 or 64
|
||||||
|
- 128
|
||||||
|
- N/A
|
||||||
|
- 2
|
||||||
|
- 256
|
||||||
|
- 32
|
||||||
|
- 16
|
||||||
|
- 32
|
||||||
|
- 512
|
||||||
|
- 32
|
||||||
|
- 11
|
||||||
|
- 0
|
||||||
|
*
|
||||||
|
- AMD Ryzen AI 9 HX 375
|
||||||
|
- Radeon 890M
|
||||||
|
- RDNA3.5
|
||||||
|
- gfx1150
|
||||||
|
- Dynamic + carveout
|
||||||
|
- 16
|
||||||
|
- 32 or 64
|
||||||
|
- 128
|
||||||
|
- N/A
|
||||||
|
- 2
|
||||||
|
- 256
|
||||||
|
- 32
|
||||||
|
- 16
|
||||||
|
- 32
|
||||||
|
- 512
|
||||||
|
- 32
|
||||||
|
- 11
|
||||||
|
- 5
|
||||||
|
*
|
||||||
|
- AMD Ryzen AI Max+ PRO 395
|
||||||
|
- Radeon 8060S
|
||||||
|
- RDNA3.5
|
||||||
|
- gfx1151
|
||||||
|
- Dynamic + carveout
|
||||||
|
- 40
|
||||||
|
- 32 or 64
|
||||||
|
- 128
|
||||||
|
- 32
|
||||||
|
- 2
|
||||||
|
- 256
|
||||||
|
- 32
|
||||||
|
- 16
|
||||||
|
- 32
|
||||||
|
- 768
|
||||||
|
- 32
|
||||||
|
- 11
|
||||||
|
- 5
|
||||||
|
*
|
||||||
|
- AMD Ryzen AI 7 350
|
||||||
|
- Radeon 860M
|
||||||
|
- RDNA3.5
|
||||||
|
- gfx1152
|
||||||
|
- Dynamic + carveout
|
||||||
|
- 8
|
||||||
|
- 32 or 64
|
||||||
|
- 128
|
||||||
|
- N/A
|
||||||
|
- 1
|
||||||
|
- 256
|
||||||
|
- 32
|
||||||
|
- 16
|
||||||
|
- 32
|
||||||
|
- 512
|
||||||
|
- 32
|
||||||
|
- 11
|
||||||
|
- 5
|
||||||
|
|
||||||
Glossary
|
Glossary
|
||||||
========
|
========
|
||||||
|
|
||||||
|
|||||||
@@ -29,27 +29,25 @@ subtrees:
|
|||||||
title: Deep learning frameworks
|
title: Deep learning frameworks
|
||||||
subtrees:
|
subtrees:
|
||||||
- entries:
|
- entries:
|
||||||
- file: compatibility/ml-compatibility/pytorch-compatibility
|
- file: compatibility/ml-compatibility/pytorch-compatibility.rst
|
||||||
title: PyTorch compatibility
|
title: PyTorch compatibility
|
||||||
- file: compatibility/ml-compatibility/tensorflow-compatibility
|
- file: compatibility/ml-compatibility/tensorflow-compatibility.rst
|
||||||
title: TensorFlow compatibility
|
title: TensorFlow compatibility
|
||||||
- file: compatibility/ml-compatibility/jax-compatibility
|
- file: compatibility/ml-compatibility/jax-compatibility.rst
|
||||||
title: JAX compatibility
|
title: JAX compatibility
|
||||||
- file: compatibility/ml-compatibility/verl-compatibility
|
- file: compatibility/ml-compatibility/verl-compatibility.rst
|
||||||
title: verl compatibility
|
title: verl compatibility
|
||||||
- file: compatibility/ml-compatibility/stanford-megatron-lm-compatibility
|
- file: compatibility/ml-compatibility/stanford-megatron-lm-compatibility.rst
|
||||||
title: Stanford Megatron-LM compatibility
|
title: Stanford Megatron-LM compatibility
|
||||||
- file: compatibility/ml-compatibility/dgl-compatibility
|
- file: compatibility/ml-compatibility/dgl-compatibility.rst
|
||||||
title: DGL compatibility
|
title: DGL compatibility
|
||||||
- file: compatibility/ml-compatibility/megablocks-compatibility
|
- file: compatibility/ml-compatibility/megablocks-compatibility.rst
|
||||||
title: Megablocks compatibility
|
title: Megablocks compatibility
|
||||||
- file: compatibility/ml-compatibility/taichi-compatibility
|
- file: compatibility/ml-compatibility/ray-compatibility.rst
|
||||||
title: Taichi compatibility
|
|
||||||
- file: compatibility/ml-compatibility/ray-compatibility
|
|
||||||
title: Ray compatibility
|
title: Ray compatibility
|
||||||
- file: compatibility/ml-compatibility/llama-cpp-compatibility
|
- file: compatibility/ml-compatibility/llama-cpp-compatibility.rst
|
||||||
title: llama.cpp compatibility
|
title: llama.cpp compatibility
|
||||||
- file: compatibility/ml-compatibility/flashinfer-compatibility
|
- file: compatibility/ml-compatibility/flashinfer-compatibility.rst
|
||||||
title: FlashInfer compatibility
|
title: FlashInfer compatibility
|
||||||
- file: how-to/build-rocm.rst
|
- file: how-to/build-rocm.rst
|
||||||
title: Build ROCm from source
|
title: Build ROCm from source
|
||||||
@@ -77,8 +75,14 @@ subtrees:
|
|||||||
- entries:
|
- entries:
|
||||||
- file: how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst
|
- file: how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst
|
||||||
title: Train a model with Primus and Megatron-LM
|
title: Train a model with Primus and Megatron-LM
|
||||||
|
entries:
|
||||||
|
- file: how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
|
||||||
|
title: Train a model with Megatron-LM
|
||||||
- file: how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch.rst
|
- file: how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch.rst
|
||||||
title: Train a model with Primus and PyTorch
|
title: Train a model with Primus and PyTorch
|
||||||
|
entries:
|
||||||
|
- file: how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
|
||||||
|
title: Train a model with PyTorch
|
||||||
- file: how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
|
- file: how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
|
||||||
title: Train a model with JAX MaxText
|
title: Train a model with JAX MaxText
|
||||||
- file: how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry
|
- file: how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry
|
||||||
@@ -117,6 +121,8 @@ subtrees:
|
|||||||
title: SGLang inference performance testing
|
title: SGLang inference performance testing
|
||||||
- file: how-to/rocm-for-ai/inference/benchmark-docker/sglang-distributed.rst
|
- file: how-to/rocm-for-ai/inference/benchmark-docker/sglang-distributed.rst
|
||||||
title: SGLang distributed inference with Mooncake
|
title: SGLang distributed inference with Mooncake
|
||||||
|
- file: how-to/rocm-for-ai/inference/xdit-diffusion-inference.rst
|
||||||
|
title: xDiT diffusion inference
|
||||||
- file: how-to/rocm-for-ai/inference/deploy-your-model.rst
|
- file: how-to/rocm-for-ai/inference/deploy-your-model.rst
|
||||||
title: Deploy your model
|
title: Deploy your model
|
||||||
|
|
||||||
@@ -134,6 +140,8 @@ subtrees:
|
|||||||
title: Profile and debug
|
title: Profile and debug
|
||||||
- file: how-to/rocm-for-ai/inference-optimization/workload.rst
|
- file: how-to/rocm-for-ai/inference-optimization/workload.rst
|
||||||
title: Workload optimization
|
title: Workload optimization
|
||||||
|
- file: how-to/rocm-for-ai/inference-optimization/vllm-optimization.rst
|
||||||
|
title: vLLM V1 performance optimization
|
||||||
|
|
||||||
- url: https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/
|
- url: https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/
|
||||||
title: AI tutorials
|
title: AI tutorials
|
||||||
@@ -180,7 +188,7 @@ subtrees:
|
|||||||
- file: conceptual/gpu-arch/mi300-mi200-performance-counters.rst
|
- file: conceptual/gpu-arch/mi300-mi200-performance-counters.rst
|
||||||
title: MI300 and MI200 performance counters
|
title: MI300 and MI200 performance counters
|
||||||
- file: conceptual/gpu-arch/mi350-performance-counters.rst
|
- file: conceptual/gpu-arch/mi350-performance-counters.rst
|
||||||
title: MI350 series performance counters
|
title: MI350 Series performance counters
|
||||||
- file: conceptual/gpu-arch/mi250.md
|
- file: conceptual/gpu-arch/mi250.md
|
||||||
title: MI250 microarchitecture
|
title: MI250 microarchitecture
|
||||||
subtrees:
|
subtrees:
|
||||||
@@ -214,6 +222,8 @@ subtrees:
|
|||||||
title: ROCm tools, compilers, and runtimes
|
title: ROCm tools, compilers, and runtimes
|
||||||
- file: reference/gpu-arch-specs.rst
|
- file: reference/gpu-arch-specs.rst
|
||||||
- file: reference/gpu-atomics-operation.rst
|
- file: reference/gpu-atomics-operation.rst
|
||||||
|
- file: reference/env-variables.rst
|
||||||
|
title: Environment variables
|
||||||
- file: reference/precision-support.rst
|
- file: reference/precision-support.rst
|
||||||
title: Data types and precision support
|
title: Data types and precision support
|
||||||
- file: reference/graph-safe-support.rst
|
- file: reference/graph-safe-support.rst
|
||||||
|
|||||||
@@ -123,7 +123,8 @@ Performance
|
|||||||
|
|
||||||
.. note::
|
.. note::
|
||||||
|
|
||||||
`ROCprof Compute Viewer <https://rocm.docs.amd.com/projects/rocprof-compute-viewer/en/amd-mainline/>`_ is a tool for visualizing and analyzing GPU thread trace data collected using :doc:`rocprofv3 <rocprofiler-sdk:index>`. Note that `ROCprof Compute Viewer <https://rocm.docs.amd.com/projects/rocprof-compute-viewer/en/amd-mainline/>`_ is in an early access state. Running production workloads is not recommended.
|
`ROCprof Compute Viewer <https://rocm.docs.amd.com/projects/rocprof-compute-viewer/en/amd-mainline/>`_ is a tool for visualizing and analyzing GPU thread trace data collected using :doc:`rocprofv3 <rocprofiler-sdk:index>`.
|
||||||
|
Note that `ROCprof Compute Viewer <https://rocm.docs.amd.com/projects/rocprof-compute-viewer/en/amd-mainline/>`_ is in an early access state. Running production workloads is not recommended.
|
||||||
|
|
||||||
Development
|
Development
|
||||||
^^^^^^^^^^^
|
^^^^^^^^^^^
|
||||||
|
|||||||
Reference in New Issue
Block a user