Mirror of https://github.com/ROCm/ROCm.git (synced 2026-01-21 04:28:01 -05:00)

**Compare commits:** 48 commits in the range `deep-711...deep-frame`. Commit SHA1s (author and date columns were empty in this view):
8d183c2e95, 3a7cfd3958, 833fdf4c95, 28f028d304, a745e45dcb, 8beac1891f,
773f5de407, b297ced032, 2dc22ca890, 85102079ed, ba95e0e689, 1691d369e9,
172b0f7c08, c67fac78bd, e0b8ec4dfb, 38f2d043dc, 3a43bacdda, 48d8fe139b,
7455fe57b8, 52c0a47e84, cbab9a465d, 459283da3c, 1b4f25733d, b287372be5,
78e8baf147, 3e0c8b47e3, c3f0b99cc0, c9d1679486, fdbef17d7b, 6592a41a7f,
65a936023b, 2a64949081, 0a17434517, 2be7e5ac1e, ae80c4a31c, dd89a692e1,
bf74351e5a, f2067767e0, effd4174fb, 453751a86f, fb644412d5, e8fdc34b71,
b4031ef23c, d0bd4e6f03, 0056b9453e, 3d1ad79766, 8683bed11b, 847cd7c423
```diff
@@ -34,6 +34,7 @@ parameters:
   default:
     - cmake
     - libnuma-dev
+    - libsimde-dev
     - mesa-common-dev
     - ninja-build
     - ocl-icd-libopencl1
@@ -39,6 +39,7 @@ parameters:
     - python3
     - python3-dev
     - python3-pip
+    - python3-venv
     - libgtest-dev
     - libboost-filesystem-dev
     - libboost-program-options-dev
@@ -46,6 +47,8 @@ parameters:
   type: object
   default:
     - nanobind>=2.0.0
+    - pytest
+    - pytest-cov
 - name: rocmDependencies
   type: object
   default:
@@ -72,8 +75,10 @@ parameters:
     - { os: ubuntu2204, packageManager: apt }
     - { os: almalinux8, packageManager: dnf }
   testJobs:
-    - { os: ubuntu2204, packageManager: apt, target: gfx942 }
     - { os: ubuntu2204, packageManager: apt, target: gfx90a }
+    # - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
+    # - { os: ubuntu2204, packageManager: apt, target: gfx1151 }
+    # - { os: ubuntu2204, packageManager: apt, target: gfx1201 }
 - name: downstreamComponentMatrix
   type: object
   default:
@@ -116,6 +121,11 @@ jobs:
       parameters:
         dependencyList:
           - gtest
+  - ${{ if ne(job.os, 'almalinux8') }}:
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
+      parameters:
+        dependencyList:
+          - catch2
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
     parameters:
       checkoutRepo: ${{ parameters.checkoutRepo }}
@@ -137,6 +147,7 @@ jobs:
         -DORIGAMI_BUILD_SHARED_LIBS=ON
         -DORIGAMI_ENABLE_PYTHON=ON
         -DORIGAMI_BUILD_TESTING=ON
+        -DORIGAMI_ENABLE_FETCH=ON
         -GNinja
   - ${{ if ne(job.os, 'almalinux8') }}:
     - task: PublishPipelineArtifact@1
@@ -169,7 +180,6 @@ jobs:
   dependsOn: origami_build_${{ job.os }}
   condition:
     and(succeeded(),
-      eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
      not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
      eq(${{ parameters.aggregatePipeline }}, False)
    )
```
```diff
@@ -180,30 +190,30 @@ jobs:
   workspace:
     clean: all
   steps:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-    parameters:
-      checkoutRepo: ${{ parameters.checkoutRepo }}
-      sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
     parameters:
       aptPackages: ${{ parameters.aptPackages }}
      pipModules: ${{ parameters.pipModules }}
      packageManager: ${{ job.packageManager }}
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+    parameters:
+      checkoutRepo: ${{ parameters.checkoutRepo }}
+      sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
+    parameters:
+      dependencyList:
+        - gtest
+  - ${{ if ne(job.os, 'almalinux8') }}:
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
+      parameters:
+        dependencyList:
+          - catch2
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
     parameters:
       preTargetFilter: ${{ parameters.componentName }}
       os: ${{ job.os }}
-  - task: DownloadPipelineArtifact@2
-    displayName: 'Download Build Directory Artifact'
-    inputs:
-      artifact: '${{ parameters.componentName }}_${{ job.os }}_build_dir'
-      path: '$(Agent.BuildDirectory)/s/build'
-  - task: DownloadPipelineArtifact@2
-    displayName: 'Download Python Source Artifact'
-    inputs:
-      artifact: '${{ parameters.componentName }}_${{ job.os }}_python_src'
-      path: '$(Agent.BuildDirectory)/s/python'
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
     parameters:
       checkoutRef: ${{ parameters.checkoutRef }}
```
```diff
@@ -212,25 +222,72 @@ jobs:
         gpuTarget: ${{ job.target }}
         ${{ if parameters.triggerDownstreamJobs }}:
           downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
+  - task: CMake@1
+    displayName: 'Origami Test CMake Configuration'
+    inputs:
+      cmakeArgs: >-
+        -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor
+        -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
+        -DORIGAMI_BUILD_SHARED_LIBS=ON
+        -DORIGAMI_ENABLE_PYTHON=ON
+        -DORIGAMI_BUILD_TESTING=ON
+        -GNinja
+        $(Agent.BuildDirectory)/s
+  - task: Bash@3
+    displayName: 'Build Origami Tests and Python Bindings'
+    inputs:
+      targetType: inline
+      workingDirectory: build
+      script: |
+        cmake --build . --target origami-tests origami_python -- -j$(nproc)
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+  # Run tests using CTest (discovers and runs both C++ and Python tests)
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
     parameters:
       componentName: ${{ parameters.componentName }}
       os: ${{ job.os }}
-      testDir: '$(Agent.BuildDirectory)/rocm/bin'
-      testExecutable: './origami-tests'
-      testParameters: '--yaml origami-tests.yaml --gtest_output=xml:./test_output.xml --gtest_color=yes'
-  - script: |
-      set -e
-      export PYTHONPATH=$(Agent.BuildDirectory)/s/build/python:$PYTHONPATH
-
-      echo "--- Running origami_test.py ---"
-      python3 $(Agent.BuildDirectory)/s/python/origami_test.py
-
-      echo "--- Running origami_grid_test.py ---"
-      python3 $(Agent.BuildDirectory)/s/python/origami_grid_test.py
-    displayName: 'Run Python Binding Tests'
-    condition: succeeded()
+      testDir: 'build'
+      testParameters: '--output-on-failure --force-new-ctest-process --output-junit test_output.xml'
+  # Test pip install workflow
+  # - task: Bash@3
+  #   displayName: 'Test Pip Install'
+  #   inputs:
+  #     targetType: inline
+  #     script: |
+  #       set -e
+
+  #       echo "==================================================================="
+  #       echo "Testing pip install workflow (pip install -e .)"
+  #       echo "==================================================================="
+
+  #       # Set environment variables for pip install CMake build
+  #       export ROCM_PATH=$(Agent.BuildDirectory)/rocm
+  #       export CMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm:$(Agent.BuildDirectory)/vendor
+  #       export CMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
+
+  #       echo "ROCM_PATH: $ROCM_PATH"
+  #       echo "CMAKE_PREFIX_PATH: $CMAKE_PREFIX_PATH"
+  #       echo "CMAKE_CXX_COMPILER: $CMAKE_CXX_COMPILER"
+  #       echo ""
+
+  #       # Install from source directory
+  #       cd "$(Agent.BuildDirectory)/s/python"
+  #       pip install -e .
+
+  #       # Verify import works
+  #       echo ""
+  #       echo "Verifying origami can be imported..."
+  #       python3 -c "import origami; print('✓ Successfully imported origami')"
+
+  #       # Run pytest on installed package
+  #       echo ""
+  #       echo "Running pytest tests..."
+  #       python3 -m pytest tests/ -v -m "not slow" --tb=short
+
+  #       echo ""
+  #       echo "==================================================================="
+  #       echo "Pip install test completed successfully"
+  #       echo "==================================================================="
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
     parameters:
       aptPackages: ${{ parameters.aptPackages }}
```
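For reference, the commented-out verification step above boils down to an import smoke test followed by a pytest run. A minimal standalone sketch (the `origami` module name comes from the pipeline itself; the file name and the PYTHONPATH assumption are illustrative, not part of this diff):

```python
# test_origami_smoke.py -- minimal pytest smoke test mirroring the commented
# pipeline step: confirm the built bindings import before running the suite.
def test_import():
    # Assumes build/python (or an editable install) is on PYTHONPATH,
    # as the removed script step arranged with its export.
    import origami
    assert origami is not None
```

Run with `python3 -m pytest test_origami_smoke.py -v`; the pipeline's CTest invocation then covers both the C++ and Python tests.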
```diff
@@ -30,6 +30,7 @@ parameters:
     - python3-pip
     - protobuf-compiler
     - libprotoc-dev
+    - libopencv-dev
 - name: pipModules
   type: object
   default:
@@ -64,6 +65,7 @@ parameters:
     - MIVisionX
     - rocm_smi_lib
     - rccl
+    - rocAL
     - rocALUTION
     - rocBLAS
     - rocDecode
@@ -103,6 +105,7 @@ parameters:
     - MIVisionX
     - rocm_smi_lib
     - rccl
+    - rocAL
     - rocALUTION
     - rocBLAS
     - rocDecode
```
```diff
@@ -36,7 +36,6 @@ Andrej
 Arb
 Autocast
 autograd
-Backported
 BARs
 BatchNorm
 BLAS
@@ -204,11 +203,9 @@ GenAI
 GenZ
 GitHub
 Gitpod
-hardcoded
 HBM
 HCA
 HGX
-HLO
 HIPCC
 hipDataType
 HIPExtension
@@ -336,7 +333,6 @@ MoEs
 Mooncake
 Mpops
 Multicore
-multihost
 Multithreaded
 mx
 MXFP
@@ -1031,7 +1027,6 @@ uncacheable
 uncorrectable
 underoptimized
 unhandled
-unfused
 uninstallation
 unmapped
 unsqueeze
```
**RELEASE.md** (22 changed lines)
```diff
@@ -270,26 +270,26 @@ The [ROCm examples repository](https://github.com/ROCm/rocm-examples) has been e
 :margin: auto 0 auto auto
 :::{grid}
 :margin: auto 0 auto auto
-* [hipBLASLt](https://github.com/ROCm/rocm-examples/tree/amd-staging/Libraries/hipBLASLt)
-* [hipSPARSE](https://github.com/ROCm/rocm-examples/tree/amd-staging/Libraries/hipSPARSE)
-* [hipSPARSELt](https://github.com/ROCm/rocm-examples/tree/amd-staging/Libraries/hipSPARSELt)
-* [hipTensor](https://github.com/ROCm/rocm-examples/tree/amd-staging/Libraries/hipTensor)
+* [hipBLASLt](https://rocm.docs.amd.com/projects/hipBLASLt/en/latest/)
+* [hipSPARSE](https://rocm.docs.amd.com/projects/hipSPARSE/en/latest/)
+* [hipSPARSELt](https://rocm.docs.amd.com/projects/hipSPARSELt/en/latest/)
+* [hipTensor](https://rocm.docs.amd.com/projects/hipTensor/en/latest/)
 :::
 :::{grid}
 :margin: auto 0 auto auto
-* [rocALUTION](https://github.com/ROCm/rocm-examples/tree/amd-staging/Libraries/rocALUTION)
-* [ROCprofiler-SDK](https://github.com/ROCm/rocm-examples/tree/amd-staging/Libraries/rocProfiler-SDK)
-* [rocWMMA](https://github.com/ROCm/rocm-examples/tree/amd-staging/Libraries/rocWMMA)
+* [rocALUTION](https://rocm.docs.amd.com/projects/rocALUTION/en/latest/)
+* [ROCprofiler-SDK](https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/)
+* [rocWMMA](https://rocm.docs.amd.com/projects/rocWMMA/en/latest/)
 :::
 ::::

 Usage examples are now available for the following performance analysis tools:

-* [ROCm Compute Profiler](https://github.com/ROCm/rocm-examples/tree/amd-staging/Tools/rocprof-compute)
-* [ROCm Systems Profiler](https://github.com/ROCm/rocm-examples/tree/amd-staging/Tools/rocprof-systems)
-* [rocprofv3](https://github.com/ROCm/rocm-examples/tree/amd-staging/Tools/rocprofv3)
+* [ROCm Compute Profiler](https://rocm.docs.amd.com/projects/rocprofiler-compute/en/latest/index.html)
+* [ROCm Systems Profiler](https://rocm.docs.amd.com/projects/rocprofiler-systems/en/latest/index.html)
+* [rocprofv3](https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/how-to/using-rocprofv3.html)

-The complete source code for the [HIP Graph Tutorial](https://github.com/ROCm/rocm-examples/tree/amd-staging/HIP-Doc/Tutorials/graph_api) is also available as part of the ROCm examples.
+The complete source code for the [HIP Graph Tutorial](https://rocm.docs.amd.com/projects/HIP/en/latest/tutorial/graph_api.html) is also available as part of the ROCm examples.

 ### ROCm documentation updates

```
```diff
@@ -269,33 +269,6 @@ For a complete and up-to-date list of JAX public modules (for example, ``jax.num
 JAX API modules are maintained by the JAX project and is subject to change.
 Refer to the official Jax documentation for the most up-to-date information.

-Key features and enhancements for ROCm 7.1
-===============================================================================
-
-- Enabled compilation of multihost HLO runner Python bindings.
-
-- Backported multihost HLO runner bindings and some related changes to
-  :code:`FunctionalHloRunner`.
-
-- Added :code:`requirements_lock_3_12` to enable building for Python 3.12.
-
-- Removed hardcoded NHWC convolution layout for ``fp16`` precision to address the performance drops for ``fp16`` precision on gfx12xx GPUs.
-
-- ROCprofiler-SDK integration:
-
-  - Integrated ROCprofiler-SDK (v3) to XLA to improve profiling of GPU events,
-    support both time-based and step-based profiling.
-
-  - Added unit tests for :code:`rocm_collector` and :code:`rocm_tracer`.
-
-- Added Triton unsupported conversion from ``f8E4M3FNUZ`` to ``fp16`` with
-  rounding mode.
-
-- Introduced :code:`CudnnFusedConvDecomposer` to revert fused convolutions
-  when :code:`ConvAlgorithmPicker` fails to find a fused algorithm, and removed
-  unfused fallback paths from :code:`RocmFusedConvRunner`.
-
 Key features and enhancements for ROCm 7.0
 ===============================================================================

```
```diff
@@ -268,3 +268,6 @@ html_context = {
     "granularity_type" : [('Coarse-grained', 'coarse-grained'), ('Fine-grained', 'fine-grained')],
     "scope_type" : [('Device', 'device'), ('System', 'system')]
 }
+
+# Disable figure and table numbering
+numfig = False
```
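`numfig` is a standard Sphinx configuration value, so the added lines simply switch global figure and table numbering off. For contrast, the enabled form looks like the sketch below (the `numfig_format` values are illustrative, not from this change):

```python
# conf.py -- Sphinx numbering options (sketch)
numfig = False  # what this change sets: no automatic figure/table numbers

# If numbering were desired instead, Sphinx also honors a format map:
# numfig = True
# numfig_format = {"figure": "Fig. %s", "table": "Table %s"}
```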
```diff
@@ -24,7 +24,7 @@ The table below summarizes information about ROCm-enabled deep learning framewor
     - GitHub

   * - :doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`
-    - :doc:`link <rocm-install-on-linux:install/3rd-party/pytorch-install>`
+    - :doc:`Pytorch install <rocm-install-on-linux:install/3rd-party/pytorch-install>`
     -
     - Docker image
     - Wheels package
@@ -35,7 +35,7 @@ The table below summarizes information about ROCm-enabled deep learning framewor
        <a href="https://github.com/ROCm/pytorch"><i class="fab fa-github fa-lg"></i></a>

   * - :doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`
-    - :doc:`link <rocm-install-on-linux:install/3rd-party/tensorflow-install>`
+    - :doc:`TensorFlow install <rocm-install-on-linux:install/3rd-party/tensorflow-install>`
     -
     - Docker image
     - Wheels package
@@ -45,7 +45,7 @@ The table below summarizes information about ROCm-enabled deep learning framewor
        <a href="https://github.com/ROCm/tensorflow-upstream"><i class="fab fa-github fa-lg"></i></a>

   * - :doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`
-    - :doc:`link <rocm-install-on-linux:install/3rd-party/jax-install>`
+    - :doc:`JAX install <rocm-install-on-linux:install/3rd-party/jax-install>`
     -
     - Docker image
     - .. raw:: html
@@ -53,7 +53,7 @@ The table below summarizes information about ROCm-enabled deep learning framewor
        <a href="https://github.com/ROCm/jax"><i class="fab fa-github fa-lg"></i></a>

   * - :doc:`verl <../compatibility/ml-compatibility/verl-compatibility>`
-    - :doc:`link <rocm-install-on-linux:install/3rd-party/verl-install>`
+    - :doc:`verl install <rocm-install-on-linux:install/3rd-party/verl-install>`
     -
     - Docker image
     - .. raw:: html
@@ -61,7 +61,7 @@ The table below summarizes information about ROCm-enabled deep learning framewor
        <a href="https://github.com/ROCm/verl"><i class="fab fa-github fa-lg"></i></a>

   * - :doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>`
-    - :doc:`link <rocm-install-on-linux:install/3rd-party/stanford-megatron-lm-install>`
+    - :doc:`Stanford Megatron-LM install <rocm-install-on-linux:install/3rd-party/stanford-megatron-lm-install>`
     -
     - Docker image
     - .. raw:: html
@@ -69,7 +69,7 @@ The table below summarizes information about ROCm-enabled deep learning framewor
        <a href="https://github.com/ROCm/Stanford-Megatron-LM"><i class="fab fa-github fa-lg"></i></a>

   * - :doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>`
-    - :doc:`link <rocm-install-on-linux:install/3rd-party/dgl-install>`
+    - :doc:`DGL install <rocm-install-on-linux:install/3rd-party/dgl-install>`
     -
     - Docker image
     - .. raw:: html
@@ -77,15 +77,24 @@ The table below summarizes information about ROCm-enabled deep learning framewor
        <a href="https://github.com/ROCm/dgl"><i class="fab fa-github fa-lg"></i></a>

   * - :doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>`
-    - :doc:`link <rocm-install-on-linux:install/3rd-party/megablocks-install>`
+    - :doc:`Megablocks install <rocm-install-on-linux:install/3rd-party/megablocks-install>`
     -
     - Docker image
     - .. raw:: html

        <a href="https://github.com/ROCm/megablocks"><i class="fab fa-github fa-lg"></i></a>

+  * - :doc:`Taichi <../compatibility/ml-compatibility/taichi-compatibility>`
+    - `Taichi install <https://rocm.docs.amd.com/projects/taichi/en/latest/install/taichi-install.html>`__
+    -
+    - Docker image
+    - Wheels package
+    - .. raw:: html
+
+       <a href="https://github.com/ROCm/taichi"><i class="fab fa-github fa-lg"></i></a>
+
   * - :doc:`Ray <../compatibility/ml-compatibility/ray-compatibility>`
-    - :doc:`link <rocm-install-on-linux:install/3rd-party/ray-install>`
+    - :doc:`Ray install <rocm-install-on-linux:install/3rd-party/ray-install>`
     -
     - Docker image
     - Wheels package
@@ -95,7 +104,7 @@ The table below summarizes information about ROCm-enabled deep learning framewor
        <a href="https://github.com/ROCm/ray"><i class="fab fa-github fa-lg"></i></a>

   * - :doc:`llama.cpp <../compatibility/ml-compatibility/llama-cpp-compatibility>`
-    - :doc:`link <rocm-install-on-linux:install/3rd-party/llama-cpp-install>`
+    - :doc:`llama.cpp install <rocm-install-on-linux:install/3rd-party/llama-cpp-install>`
     -
     - Docker image
     - ROCm Base Docker image
@@ -104,7 +113,7 @@ The table below summarizes information about ROCm-enabled deep learning framewor
        <a href="https://github.com/ROCm/llama.cpp"><i class="fab fa-github fa-lg"></i></a>

   * - :doc:`FlashInfer <../compatibility/ml-compatibility/flashinfer-compatibility>`
-    - :doc:`link <rocm-install-on-linux:install/3rd-party/flashinfer-install>`
+    - :doc:`FlashInfer install <rocm-install-on-linux:install/3rd-party/flashinfer-install>`
     -
     - Docker image
     - ROCm Base Docker image
```
```diff
@@ -44,7 +44,7 @@ Setting up the base implementation environment

    .. code-block:: shell

-      amd-smi static --board
+      rocm-smi --showproductname

 #. Check that your GPUs are available to PyTorch.

@@ -65,8 +65,8 @@ Setting up the base implementation environment

    .. tip::

-      During training and inference, you can check the memory usage by running the ``amd-smi`` command in your terminal.
-      This tool helps you see which GPUs are involved.
+      During training and inference, you can check the memory usage by running the ``rocm-smi`` command in your terminal.
+      This tool shows which GPUs are involved.


 .. _fine-tuning-llms-multi-gpu-hugging-face-accelerate:
@@ -91,10 +91,10 @@ Now, it's important to adjust how you load the model. Add the ``device_map`` par

    ...
    base_model_name = "meta-llama/Llama-2-7b-chat-hf"

    # Load base model to GPU memory
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        device_map = "auto",
        trust_remote_code = True)
    ...
@@ -130,7 +130,7 @@ After loading the model in this way, the model is fully ready to use the resourc
 torchtune for fine-tuning and inference
 =============================================

-`torchtune <https://pytorch.org/torchtune/main/>`_ is a PyTorch-native library for easy single and multi-GPU
+`torchtune <https://meta-pytorch.org/torchtune/main/>`_ is a PyTorch-native library for easy single and multi-GPU
 model fine-tuning and inference with LLMs.

 #. Install torchtune using pip.
@@ -139,7 +139,7 @@ model fine-tuning and inference with LLMs.

    # Install torchtune with PyTorch release 2.2.2+
    pip install torchtune

    # To confirm that the package is installed correctly
    tune --help

@@ -148,12 +148,12 @@ model fine-tuning and inference with LLMs.
    .. code-block:: shell

       usage: tune [-h] {download,ls,cp,run,validate} ...

       Welcome to the TorchTune CLI!

       options:
         -h, --help show this help message and exit

       subcommands:
         {download,ls,cp,run,validate}

@@ -194,11 +194,11 @@ model fine-tuning and inference with LLMs.
    apply_lora_to_output: False
    lora_rank: 8
    lora_alpha: 16

    tokenizer:
      _component_: torchtune.models.llama2.llama2_tokenizer
      path: /tmp/Llama-2-7b-hf/tokenizer.model

    # Dataset and sampler
    dataset:
      _component_: torchtune.datasets.alpaca_cleaned_dataset
```
```diff
@@ -44,19 +44,20 @@ Setting up the base implementation environment

    .. code-block:: shell

-      amd-smi static --board
+      rocm-smi --showproductname

    Your output should look like this:

    .. code-block:: shell

-      GPU: 0
-        BOARD:
-          MODEL_NUMBER: 102-G39203-0B
-          PRODUCT_SERIAL: PCB079220-1150
-          FRU_ID: 113-AMDG392030B04-100-300000097H
-          PRODUCT_NAME: AMD Instinct MI325 OAM
-          MANUFACTURER_NAME: AMD
+      ============================ ROCm System Management Interface ============================
+      ====================================== Product Info ======================================
+      GPU[0] : Card Series: AMD Instinct MI300X OAM
+      GPU[0] : Card model: 0x74a1
+      GPU[0] : Card vendor: Advanced Micro Devices, Inc. [AMD/ATI]
+      GPU[0] : Card SKU: MI3SRIOV
+      ==========================================================================================
+      ================================== End of ROCm SMI Log ===================================

 #. Check that your GPUs are available to PyTorch.

```
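The hunk above ends at the step "Check that your GPUs are available to PyTorch." For reference, that check is typically the following (a minimal sketch using stock PyTorch calls; it is not part of this diff):

```python
import torch

# Confirm that ROCm-enabled PyTorch can see the GPUs
# (torch.cuda maps to HIP on ROCm builds).
print(torch.cuda.is_available())      # True when at least one GPU is usable
print(torch.cuda.device_count())      # number of visible GPUs
print(torch.cuda.get_device_name(0))  # e.g. an AMD Instinct accelerator
```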
```diff
@@ -93,13 +94,13 @@ Setting up the base implementation environment
    pip install -r requirements-dev.txt
    cmake -DBNB_ROCM_ARCH="gfx942" -DCOMPUTE_BACKEND=hip -S .
    python setup.py install

    # To leverage the SFTTrainer in TRL for model fine-tuning.
    pip install trl

    # To leverage PEFT for efficiently adapting pre-trained language models .
    pip install peft

    # Install the other dependencies.
    pip install transformers datasets huggingface-hub scipy

@@ -131,7 +132,7 @@ Download the base model and fine-tuning dataset

    .. note::

       You can also use the `NousResearch Llama-2-7b-chat-hf <https://huggingface.co/NousResearch/Llama-2-7b-chat-hf>`_
       as a substitute. It has the same model weights as the original.

 #. Run the following code to load the base model and tokenizer.
@@ -140,14 +141,14 @@ Download the base model and fine-tuning dataset

    # Base model and tokenizer names.
    base_model_name = "meta-llama/Llama-2-7b-chat-hf"

    # Load base model to GPU memory.
    device = "cuda:0"
    base_model = AutoModelForCausalLM.from_pretrained(base_model_name, trust_remote_code = True).to(device)

    # Load tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(
        base_model_name,
        trust_remote_code = True)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
@@ -161,10 +162,10 @@ Download the base model and fine-tuning dataset
    # Dataset for fine-tuning.
    training_dataset_name = "mlabonne/guanaco-llama2-1k"
    training_dataset = load_dataset(training_dataset_name, split = "train")

    # Check the data.
    print(training_dataset)

    # Dataset 11 is a QA sample in English.
    print(training_dataset[11])

```
```diff
@@ -251,8 +252,8 @@ Compare the number of trainable parameters and training time under the two diffe
        dataset_text_field = "text",
        tokenizer = tokenizer,
        args = training_arguments
    )

    # Run the trainer.
    sft_trainer.train()

```
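For readability, here is the full trainer construction these context fragments come from, reconstructed as a sketch. The `model`, `train_dataset`, and `peft_config` arguments are assumptions based on the surrounding tutorial and TRL's `SFTTrainer` API at the time; they are not shown in this hunk:

```python
from trl import SFTTrainer

# Sketch of the LoRA fine-tuning setup the hunk's fragments belong to.
sft_trainer = SFTTrainer(
    model = base_model,                # assumption: model loaded earlier in the doc
    train_dataset = training_dataset,  # assumption: the guanaco-llama2-1k split
    peft_config = peft_parameters,     # assumption: a peft LoraConfig
    dataset_text_field = "text",
    tokenizer = tokenizer,
    args = training_arguments,
)

# Run the trainer.
sft_trainer.train()
```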
```diff
@@ -285,7 +286,7 @@ Compare the number of trainable parameters and training time under the two diffe
        if param.requires_grad:
            trainable_params += param.numel()
    print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}")

    sft_trainer.peft_config = None
    print_trainable_parameters(sft_trainer.model)

```
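The hunk only shows the tail of `print_trainable_parameters`; the complete helper, reconstructed as a sketch from the visible fragments (the loop header and initializations are assumptions; the counting and print lines appear in the hunk), is:

```python
def print_trainable_parameters(model):
    # Count trainable vs. total parameters of a transformers model.
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}")
```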
```diff
@@ -308,8 +309,8 @@ Compare the number of trainable parameters and training time under the two diffe
        dataset_text_field = "text",
        tokenizer = tokenizer,
        args = training_arguments
    )

    # Training.
    trainer_full.train()

@@ -348,7 +349,7 @@ store, and load.

    # PEFT adapter name.
    adapter_name = "llama-2-7b-enhanced-adapter"

    # Save PEFT adapter.
    sft_trainer.model.save_pretrained(adapter_name)

@@ -358,21 +359,21 @@ store, and load.

    # Access adapter directory.
    cd llama-2-7b-enhanced-adapter

    # List all adapter files.
    README.md adapter_config.json adapter_model.safetensors

 .. tab-item:: Saving a fully fine-tuned model
    :sync: without

    If you're not using LoRA and PEFT so there is no PEFT LoRA configuration used for training, use the following code
    to save your fine-tuned model to your system.

    .. code-block:: python

       # Fully fine-tuned model name.
       new_model_name = "llama-2-7b-enhanced"

       # Save the fully fine-tuned model.
       full_trainer.model.save_pretrained(new_model_name)

@@ -382,7 +383,7 @@ store, and load.

    # Access new model directory.
    cd llama-2-7b-enhanced

    # List all model files.
    config.json model-00002-of-00006.safetensors model-00005-of-00006.safetensors
    generation_config.json model-00003-of-00006.safetensors model-00006-of-00006.safetensors
```
```diff
@@ -411,26 +412,26 @@ Let's look at achieving model inference using these types of models.

 .. tab-item:: Inference using PEFT adapters

    To use PEFT adapters like a normal transformer model, you can run the generation by loading a base model along with PEFT
    adapters as follows.

    .. code-block:: python

       from peft import PeftModel
       from transformers import AutoModelForCausalLM

       # Set the path of the model or the name on Hugging face hub
       base_model_name = "meta-llama/Llama-2-7b-chat-hf"

       # Set the path of the adapter
       adapter_name = "Llama-2-7b-enhanced-adpater"

       # Load base model
       base_model = AutoModelForCausalLM.from_pretrained(base_model_name)

       # Adapt the base model with the adapter
       new_model = PeftModel.from_pretrained(base_model, adapter_name)

       # Then, run generation as the same with a normal model outlined in 2.1

 The PEFT library provides a ``merge_and_unload`` method, which merges the adapter layers into the base model. This is
```
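The final comment in that snippet defers generation to an earlier section; spelled out, it looks like the following sketch using standard transformers and PEFT calls (the tokenizer setup and prompt are illustrative assumptions, not part of this hunk):

```python
from transformers import AutoTokenizer

# Generation with the PEFT-adapted model works the same as with a plain model.
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
inputs = tokenizer.encode("What is a large language model?", return_tensors="pt")
outputs = new_model.generate(inputs)
print(tokenizer.decode(outputs[0]))
```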
```diff
@@ -438,13 +439,13 @@ Let's look at achieving model inference using these types of models.

    .. code-block:: python

       # Load base model
       base_model = AutoModelForCausalLM.from_pretrained(base_model_name)

       # Adapt the base model with the adapter
       new_model = PeftModel.from_pretrained(base_model, adapter_name)

       # Merge adapter
       model = model.merge_and_unload()

       # Save the merged model into local
```
```diff
@@ -460,25 +461,25 @@ Let's look at achieving model inference using these types of models.

    # Import relevant class for loading model and tokenizer
    from transformers import AutoTokenizer, AutoModelForCausalLM

    # Set the pre-trained model name on Hugging face hub
    model_name = "meta-llama/Llama-2-7b-chat-hf"

    # Set device type
    device = "cuda:0"

    # Load model and tokenizer
    model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Input prompt encoding
    query = "What is a large language model?"
    inputs = tokenizer.encode(query, return_tensors="pt").to(device)

    # Token generation
    outputs = model.generate(inputs)

    # Outputs decoding
    print(tokenizer.decode(outputs[0]))

 In addition, pipelines from Transformers offer simple APIs to use pre-trained models for different tasks, including
@@ -489,14 +490,14 @@ Let's look at achieving model inference using these types of models.

    # Import relevant class for loading model and tokenizer
    from transformers import pipeline

    # Set the path of your model or the name on Hugging face hub
    model_name_or_path = "meta-llama/Llama-2-7b-chat-hf"

    # Set pipeline
    # A positive device value will run the model on associated CUDA device id
    pipe = pipeline("text-generation", model=model_name_or_path, device=0)

    # Token generation
    print(pipe("What is a large language model?")[0]["generated_text"])

```
```diff
@@ -31,16 +31,16 @@ in the Instinct documentation for more information.
 Hardware verification with ROCm
 -------------------------------

-Use the command ``amd-smi set --perf-determinism 1900`` to set the max clock speed up to 1900 MHz
+Use the command ``rocm-smi --setperfdeterminism 1900`` to set the max clock speed up to 1900 MHz
 instead of the default 2100 MHz. This can reduce the chance of a PCC event lowering the attainable
 GPU clocks. This setting will not be required for new IFWI releases with the production PRC feature.
-You can restore this setting to its default value with the ``amd-smi reset --clocks`` command.
+You can restore this setting to its default value with the ``rocm-smi -r`` command.

 Run the command:

 .. code-block:: shell

-   amd-smi set --perf-determinism 1900
+   rocm-smi --setperfdeterminism 1900

 See `Hardware verfication for ROCm <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#hardware-verification-with-rocm>`_
 in the Instinct documentation for more information.
```
```diff
@@ -108,16 +108,16 @@ for more information.
 Hardware verification with ROCm
 -------------------------------

-Use the command ``amd-smi set --perf-determinism 1900`` to set the max clock speed up to 1900 MHz
+Use the command ``rocm-smi --setperfdeterminism 1900`` to set the max clock speed up to 1900 MHz
 instead of the default 2100 MHz. This can reduce the chance of a PCC event lowering the attainable
 GPU clocks. This setting will not be required for new IFWI releases with the production PRC feature.
-You can restore this setting to its default value with the ``amd-smi reset --clocks`` command.
+You can restore this setting to its default value with the ``rocm-smi -r`` command.

 Run the command:

 .. code-block:: shell

-   amd-smi set --perf-determinism 1900
+   rocm-smi --setperfdeterminism 1900

 See `Hardware verification with ROCm <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#hardware-verification-with-rocm>`_ for more information.

@@ -248,7 +248,7 @@ Download the Docker image and required packages
 Checking out this specific commit is recommended for a stable and reproducible environment.

 .. code-block:: shell

    git checkout bb93ccbfeae6363c67b361a97a27c74ab86e7e92

 Prepare training datasets
```
```diff
@@ -5,7 +5,7 @@
 GPU hardware specifications
 ===========================================

-The following tables provide an overview of the hardware specifications for AMD Instinct™ GPUs, AMD Radeon™ PRO and Radeon™ GPUs, and AMD Ryzen™ APUs.
+The following tables provide an overview of the hardware specifications for AMD Instinct™ GPUs, and AMD Radeon™ PRO and Radeon™ GPUs.

 For more information about ROCm hardware compatibility, see the ROCm `Compatibility matrix <https://rocm.docs.amd.com/en/latest/compatibility/compatibility-matrix.html>`_.

@@ -18,7 +18,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
    :name: instinct-arch-spec-table

    *
-     - Name
+     - Model
      - Architecture
      - LLVM target name
      - VRAM (GiB)
@@ -297,7 +297,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
    :name: radeon-pro-arch-spec-table

    *
-     - Name
+     - Model
      - Architecture
      - LLVM target name

@@ -539,7 +539,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
    :name: radeon-arch-spec-table

    *
-     - Name
+     - Model
      - Architecture
      - LLVM target name
      - VRAM (GiB)
@@ -953,127 +953,6 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
      - 9
      - 0

-   .. tab-item:: AMD Ryzen APUs
-
-      .. list-table::
-         :header-rows: 1
-         :name: ryzen-arch-spec-table
-
-         *
-           - Name
-           - Graphics model
-           - Architecture
-           - LLVM target name
-           - VRAM (GiB)
-           - Compute Units
-           - Wavefront Size
-           - LDS (KiB)
-           - Infinity Cache (MiB)
-           - L2 Cache (MiB)
-           - Graphics L1 Cache (KiB)
-           - L0 Vector Cache (KiB)
-           - L0 Scalar Cache (KiB)
-           - L0 Instruction Cache (KiB)
-           - VGPR File (KiB)
-           - SGPR File (KiB)
-           - GFXIP Major version
-           - GFXIP Minor version
-         *
-           - AMD Ryzen 7 7840U
-           - Radeon 780M
-           - RDNA3
-           - gfx1103
-           - Dynamic + carveout
-           - 12
-           - 32 or 64
-           - 128
-           - N/A
-           - 2
-           - 256
-           - 32
-           - 16
-           - 32
-           - 512
-           - 32
-           - 11
-           - 0
-         *
-           - AMD Ryzen 9 270
-           - Radeon 780M
-           - RDNA3
-           - gfx1103
-           - Dynamic + carveout
-           - 12
-           - 32 or 64
-           - 128
-           - N/A
-           - 2
-           - 256
-           - 32
-           - 16
-           - 32
-           - 512
-           - 32
-           - 11
-           - 0
-         *
-           - AMD Ryzen AI 9 HX 375
-           - Radeon 890M
-           - RDNA3.5
-           - gfx1150
-           - Dynamic + carveout
-           - 16
-           - 32 or 64
-           - 128
-           - N/A
-           - 2
-           - 256
-           - 32
-           - 16
-           - 32
-           - 512
-           - 32
-           - 11
-           - 5
-         *
-           - AMD Ryzen AI Max+ PRO 395
-           - Radeon 8060S
-           - RDNA3.5
-           - gfx1151
-           - Dynamic + carveout
-           - 40
-           - 32 or 64
-           - 128
-           - 32
-           - 2
-           - 256
-           - 32
-           - 16
-           - 32
-           - 768
-           - 32
-           - 11
-           - 5
-         *
-           - AMD Ryzen Al 7 350
-           - Radeon 860M
-           - RDNA3.5
-           - gfx1152
-           - Dynamic + carveout
-           - 8
-           - 32 or 64
-           - 128
-           - N/A
-           - 1
-           - 256
-           - 32
-           - 16
-           - 32
-           - 512
-           - 32
-           - 11
-           - 5

 Glossary
 ========

```
```diff
@@ -29,25 +29,27 @@ subtrees:
   title: Deep learning frameworks
   subtrees:
   - entries:
-    - file: compatibility/ml-compatibility/pytorch-compatibility.rst
+    - file: compatibility/ml-compatibility/pytorch-compatibility
      title: PyTorch compatibility
-    - file: compatibility/ml-compatibility/tensorflow-compatibility.rst
+    - file: compatibility/ml-compatibility/tensorflow-compatibility
      title: TensorFlow compatibility
-    - file: compatibility/ml-compatibility/jax-compatibility.rst
+    - file: compatibility/ml-compatibility/jax-compatibility
      title: JAX compatibility
-    - file: compatibility/ml-compatibility/verl-compatibility.rst
+    - file: compatibility/ml-compatibility/verl-compatibility
      title: verl compatibility
-    - file: compatibility/ml-compatibility/stanford-megatron-lm-compatibility.rst
+    - file: compatibility/ml-compatibility/stanford-megatron-lm-compatibility
      title: Stanford Megatron-LM compatibility
-    - file: compatibility/ml-compatibility/dgl-compatibility.rst
+    - file: compatibility/ml-compatibility/dgl-compatibility
      title: DGL compatibility
-    - file: compatibility/ml-compatibility/megablocks-compatibility.rst
+    - file: compatibility/ml-compatibility/megablocks-compatibility
      title: Megablocks compatibility
-    - file: compatibility/ml-compatibility/ray-compatibility.rst
+    - file: compatibility/ml-compatibility/taichi-compatibility
+      title: Taichi compatibility
+    - file: compatibility/ml-compatibility/ray-compatibility
      title: Ray compatibility
-    - file: compatibility/ml-compatibility/llama-cpp-compatibility.rst
+    - file: compatibility/ml-compatibility/llama-cpp-compatibility
      title: llama.cpp compatibility
-    - file: compatibility/ml-compatibility/flashinfer-compatibility.rst
+    - file: compatibility/ml-compatibility/flashinfer-compatibility
      title: FlashInfer compatibility
    - file: how-to/build-rocm.rst
      title: Build ROCm from source
@@ -75,14 +77,8 @@ subtrees:
   - entries:
     - file: how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst
       title: Train a model with Primus and Megatron-LM
-      entries:
-      - file: how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
-        title: Train a model with Megatron-LM
     - file: how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch.rst
       title: Train a model with Primus and PyTorch
-      entries:
-      - file: how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
-        title: Train a model with PyTorch
     - file: how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
       title: Train a model with JAX MaxText
     - file: how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry
@@ -121,8 +117,6 @@ subtrees:
       title: SGLang inference performance testing
     - file: how-to/rocm-for-ai/inference/benchmark-docker/sglang-distributed.rst
       title: SGLang distributed inference with Mooncake
-    - file: how-to/rocm-for-ai/inference/xdit-diffusion-inference.rst
-      title: xDiT diffusion inference
     - file: how-to/rocm-for-ai/inference/deploy-your-model.rst
       title: Deploy your model

@@ -140,8 +134,6 @@ subtrees:
       title: Profile and debug
     - file: how-to/rocm-for-ai/inference-optimization/workload.rst
       title: Workload optimization
-    - file: how-to/rocm-for-ai/inference-optimization/vllm-optimization.rst
-      title: vLLM V1 performance optimization

     - url: https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/
       title: AI tutorials
@@ -188,7 +180,7 @@ subtrees:
     - file: conceptual/gpu-arch/mi300-mi200-performance-counters.rst
       title: MI300 and MI200 performance counters
     - file: conceptual/gpu-arch/mi350-performance-counters.rst
-      title: MI350 Series performance counters
+      title: MI350 series performance counters
     - file: conceptual/gpu-arch/mi250.md
       title: MI250 microarchitecture
       subtrees:
@@ -222,8 +214,6 @@ subtrees:
       title: ROCm tools, compilers, and runtimes
     - file: reference/gpu-arch-specs.rst
     - file: reference/gpu-atomics-operation.rst
-    - file: reference/env-variables.rst
-      title: Environment variables
     - file: reference/precision-support.rst
       title: Data types and precision support
     - file: reference/graph-safe-support.rst
```
```diff
@@ -123,8 +123,7 @@ Performance

 .. note::

-   `ROCprof Compute Viewer <https://rocm.docs.amd.com/projects/rocprof-compute-viewer/en/amd-mainline/>`_ is a tool for visualizing and analyzing GPU thread trace data collected using :doc:`rocprofv3 <rocprofiler-sdk:index>`.
-   Note that `ROCprof Compute Viewer <https://rocm.docs.amd.com/projects/rocprof-compute-viewer/en/amd-mainline/>`_ is in an early access state. Running production workloads is not recommended.
+   `ROCprof Compute Viewer <https://rocm.docs.amd.com/projects/rocprof-compute-viewer/en/amd-mainline/>`_ is a tool for visualizing and analyzing GPU thread trace data collected using :doc:`rocprofv3 <rocprofiler-sdk:index>`. Note that `ROCprof Compute Viewer <https://rocm.docs.amd.com/projects/rocprof-compute-viewer/en/amd-mainline/>`_ is in an early access state. Running production workloads is not recommended.

 Development
 ^^^^^^^^^^^
```