Mirror of https://github.com/ROCm/ROCm.git

Compare commits: docs/7.0-a, 5 commits
(600852d743, f16cfd81c7, fbca8fd5e8, 75216b8fcc, a94a616f42)
@@ -1,4 +1,9 @@
+SGLang
+IPC
+SPIR
+VFS
+builtins
+crosslane
+frontend
+Datacenter
+GST
+IET
@@ -27,8 +27,8 @@ project = "ROCm Documentation"
 project_path = os.path.abspath(".").replace("\\", "/")
 author = "Advanced Micro Devices, Inc."
 copyright = "Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved."
-version = "7.0 Alpha"
-release = "7.0 Alpha"
+version = "7.0 Alpha 2"
+release = "7.0 Alpha 2"
 setting_all_article_info = True
 all_article_info_os = ["linux", "windows"]
 all_article_info_author = ""
@@ -73,7 +73,7 @@ html_static_path = ["sphinx/static/css", "sphinx/static/js"]
 html_css_files = ["rocm_custom.css", "rocm_rn.css"]
 html_js_files = ["preview-version-list.js"]

-html_title = "ROCm 7.0 Alpha documentation"
+html_title = "ROCm 7.0 Alpha 2 documentation"

 html_theme_options = {"link_main_doc": False}
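A version-string change like this can be previewed locally by building the Sphinx site from the ``docs`` directory. A minimal sketch, assuming the usual rocm-docs-core layout with requirements pinned at ``sphinx/requirements.txt`` (both the path and the build invocation are assumptions, not taken from this diff):

.. code-block:: shell

   # From the docs/ directory of the checkout, install the documentation toolchain
   pip install -r sphinx/requirements.txt

   # Build HTML; the rendered title should now read "ROCm 7.0 Alpha 2 documentation"
   python3 -m sphinx -b html . _build/html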
@@ -39,9 +39,3 @@ configurations.
    * :doc:`pre-training-megatron-lm-llama-3-8b`

    * :doc:`pre-training-torchtitan-llama-3-70b`
-
-   .. grid-item-card:: Inference
-
-      * :doc:`inference-sglang-deepseek-r1-fp4`
-
-      * :doc:`inference-vllm-llama-3.1-405b-fp4`
@@ -1,113 +0,0 @@
-**************************************************
-Benchmarking DeepSeek R1 FP4 inference with SGLang
-**************************************************
-
-.. note::
-
-   For the latest iteration of AI training and inference performance for ROCm
-   7.0, see `Infinity Hub
-   <https://www.amd.com/en/developer/resources/infinity-hub.html#q=ROCm%207>`__
-   and the `ROCm 7.0 AI training and inference performance
-   <https://rocm.docs.amd.com/en/docs-7.0-docker/benchmark-docker/index.html>`__
-   documentation.
-
-This section provides instructions to test the inference performance of DeepSeek R1
-with FP4 precision via the SGLang serving framework.
-The accompanying Docker image integrates the ROCm 7.0 Alpha with SGLang, and is
-tailored for AMD Instinct MI355X and MI350X accelerators. This
-benchmark does not support other accelerators.
-
-Follow these steps to pull the required image, spin up the container with the
-appropriate options, download the model, and run the throughput test.
-
-1. Pull the `Docker image <https://hub.docker.com/layers/rocm/7.0-preview/rocm7.0_preview_ubuntu_22.04_sglang_0.4.6.post4_mi35X_alpha/images/sha256-3095b0179c31bb892799c3b53e73f202abbd66409903cb990f48d0fdd3b1a1fe>`__.
-
-   .. code-block:: shell
-
-      docker pull rocm/7.0-preview:rocm7.0_preview_ubuntu_22.04_sglang_0.4.6.post4_mi35X_alpha
-
-2. Download the model.
-
-   .. note::
-
-      This model uses microscaling 4-bit floating point (MXFP4) quantization
-      via `AMD Quark <https://quark.docs.amd.com/latest/>`_ for efficient
-      inference on AMD accelerators. See model card on Hugging Face at
-      `DeepSeek-R1-MXFP4-Preview
-      <https://huggingface.co/amd/DeepSeek-R1-MXFP4-Preview>`__.
-
-   .. code-block:: shell
-
-      pip install huggingface_hub[cli] hf_transfer hf_xet
-      HF_HUB_ENABLE_HF_TRANSFER=1 \
-      HF_HOME=/data/huggingface-cache \
-      HF_TOKEN="<HF_TOKEN>" \
-      huggingface-cli download amd/DeepSeek-R1-MXFP4-Preview --exclude "original/*"
-
-3. Run the inference benchmark.
-
-   Start the container using the following command.
-
-   .. code-block:: shell
-
-      docker run -it --rm --ipc=host --network host --security-opt seccomp=unconfined \
-        --device=/dev/kfd --device=/dev/dri \
-        -v /data:/data \
-        -e HF_HOME=/data/huggingface-cache \
-        -e HF_HUB_OFFLINE=1 \
-        -e NCCL_MIN_NCHANNELS=112 \
-        -e SGLANG_JPVILLAM_UPCAST_LINEAR=0 \
-        -e TRITON_ALLOW_NON_CONSTEXPR_GLOBALS=1 \
-        -e AMDGCN_USE_BUFFER_OPS=1 \
-        -e TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1 \
-        -e TRITON_HIP_ASYNC_FAST_SWIZZLE=1 \
-        -e TRITON_HIP_USE_ASYNC_COPY=1 \
-        -e TRITON_HIP_USE_BLOCK_PINGPONG=0 \
-        -e SGLANG_MXFP4_WEIGHT=0 \
-        -e SGLANG_AITER_MOE=1 \
-        -e SGLANG_AITER_NORM=1 \
-        -e AITER_GEMM=1 \
-        -e AITER_MLA_DECODE=1 \
-        -e AITER_PREFILL=1 \
-        -e AITER_ROPE=1 \
-        rocm/7.0-preview:rocm7.0_preview_ubuntu_22.04_sglang_0.4.6.post4_mi35X_alpha
-
-   Start the server.
-
-   .. code-block:: shell
-
-      python3 -m sglang.launch_server \
-        --model-path amd/DeepSeek-R1-MXFP4-Preview \
-        --host localhost \
-        --port 8000 \
-        --log-requests \
-        --tensor-parallel-size 8 \
-        --trust-remote-code \
-        --chunked-prefill-size 131072 \
-        --mem-fraction-static 0.95 \
-        --disable-radix-cache \
-        --n-share-experts-fusion 8 \
-        --num-continuous-decode-steps 4 \
-        --enable-torch-compile \
-        --torch-compile-max-bs 64
-
-   Run the benchmark with the following options.
-
-   .. code-block:: shell
-
-      input_tokens=3200
-      output_tokens=800
-      max_concurrency=1
-      num_prompts=$((max_concurrency*8))
-
-      python3 -m sglang.bench_serving \
-        --host localhost \
-        --port 8000 \
-        --model amd/DeepSeek-R1-MXFP4-Preview \
-        --dataset-name random \
-        --random-input ${input_tokens} \
-        --random-output ${output_tokens} \
-        --random-range-ratio 1.0 \
-        --max-concurrency ${max_concurrency} \
-        --num-prompt ${num_prompts}
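For anyone still running this benchmark from the Alpha image, it can help to confirm the server launched above is actually serving before starting ``bench_serving``. A minimal sketch, assuming SGLang's OpenAI-compatible HTTP interface on the host and port used above (the ``/health`` route is an assumption about the SGLang build in this image):

.. code-block:: shell

   # Liveness probe (route assumed); an empty 200 response means the server is up
   curl -s http://localhost:8000/health

   # One short completion through the OpenAI-compatible API to confirm end-to-end generation
   curl -s http://localhost:8000/v1/completions \
     -H "Content-Type: application/json" \
     -d '{"model": "amd/DeepSeek-R1-MXFP4-Preview", "prompt": "Hello", "max_tokens": 8}'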
@@ -1,103 +0,0 @@
-***************************************************
-Benchmarking Llama 3.1 405B FP4 inference with vLLM
-***************************************************
-
-.. note::
-
-   For the latest iteration of AI training and inference performance for ROCm
-   7.0, see `Infinity Hub
-   <https://www.amd.com/en/developer/resources/infinity-hub.html#q=ROCm%207>`__
-   and the `ROCm 7.0 AI training and inference performance
-   <https://rocm.docs.amd.com/en/docs-7.0-docker/benchmark-docker/index.html>`__
-   documentation.
-
-This section provides instructions to test the inference performance of Llama
-3.1 405B on the vLLM inference engine. The accompanying Docker image integrates
-the ROCm 7.0 Alpha with vLLM, and is tailored for AMD Instinct
-MI355X and MI350X accelerators. This benchmark does not support other
-accelerators.
-
-Follow these steps to pull the required image, spin up the container with the
-appropriate options, download the model, and run the throughput test.
-
-1. Pull the `Docker image <https://hub.docker.com/layers/rocm/7.0-preview/rocm7.0_preview_ubuntu_22.04_vllm_0.9.1_mi35x_alpha/images/sha256-3ab87887724b75e5d1d2306a04afae853849ec3aabf8f9ee6335d766b3d0eaa0>`__.
-
-   .. code-block:: shell
-
-      docker pull rocm/7.0-preview:rocm7.0_preview_ubuntu_22.04_vllm_0.9.1_mi35x_alpha
-
-2. Download the model.
-
-   .. code-block:: shell
-
-      pip install huggingface_hub[cli] hf_transfer hf_xet
-      HF_HUB_ENABLE_HF_TRANSFER=1 \
-      HF_HOME=/data/huggingface-cache \
-      HF_TOKEN="<HF_TOKEN>" \
-      huggingface-cli download amd/Llama-3.1-405B-Instruct-MXFP4-Preview --exclude "original/*"
-
-   .. note::
-
-      This model uses microscaling 4-bit floating point (MXFP4) quantization
-      via `AMD Quark <https://quark.docs.amd.com/latest/>`_ for efficient
-      inference on AMD accelerators. See the model card on Hugging Face at
-      `amd/Llama-3.1-405B-Instruct-MXFP4-Preview
-      <https://huggingface.co/amd/Llama-3.1-405B-Instruct-MXFP4-Preview>`__.
-
-3. Run the inference benchmark.
-
-   Start the container using the following command.
-
-   .. code-block:: shell
-
-      docker run -it \
-        --ipc=host \
-        --network=host \
-        --privileged \
-        --cap-add=CAP_SYS_ADMIN \
-        --cap-add=SYS_PTRACE \
-        --security-opt seccomp=unconfined \
-        -e USE_FASTSAFETENSOR=1 \
-        -e SAFETENSORS_FAST_GPU=1 \
-        -e VLLM_TRITON_FP4_GEMM_USE_ASM=1 \
-        -e VLLM_USE_AITER_TRITON_ROPE=1 \
-        -e VLLM_USE_AITER_TRITON_SILU_MUL=1 \
-        -e TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1 \
-        -e AMDGCN_USE_BUFFER_OPS=1 \
-        -e TRITON_HIP_USE_ASYNC_COPY=1 \
-        -e TRITON_HIP_USE_BLOCK_PINGPONG=1 \
-        -e TRITON_HIP_ASYNC_FAST_SWIZZLE=1 \
-        -e TRITON_HIP_PRESHUFFLE_SCALES=1 \
-        -e VLLM_ROCM_USE_AITER=1 \
-        -e VLLM_ROCM_USE_AITER_PAGED_ATTN=1 \
-        -e VLLM_ROCM_USE_AITER_RMSNORM=1 \
-        -e VLLM_USE_V1=0 \
-        rocm/7.0-preview:rocm7.0_preview_ubuntu_22.04_vllm_0.9.1_mi35x_alpha
-
-   Run the ``benchmark_throughput.py`` script.
-
-   .. code-block:: shell
-
-      input_tokens=128
-      output_tokens=128
-      num_prompts=16384
-      max_num_seqs=1024
-      max_num_batched_tokens=16384
-      max_model_len=8192
-
-      python3 /app/vllm/benchmarks/benchmark_throughput.py \
-        --model amd/Llama-3.1-405B-Instruct-MXFP4-Preview \
-        --input-len ${input_tokens} \
-        --output-len ${output_tokens} \
-        --tensor-parallel-size 1 \
-        --num-prompts ${num_prompts} \
-        --dtype auto \
-        --gpu-memory-utilization 0.98 \
-        --max-model-len ${max_model_len} \
-        --distributed-executor-backend mp \
-        --max-num-batched-tokens ${max_num_batched_tokens} \
-        --no-enable-prefix-caching \
-        --max-num-seqs ${max_num_seqs} \
-        --disable-detokenize \
-        --kv-cache-dtype fp8 \
-        --num-scheduler-steps 128
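Because the deleted page runs with ``--tensor-parallel-size 1``, the MXFP4 405B checkpoint is served from a single accelerator, so a specific device can be pinned inside the container. A minimal sketch (the device index and the use of ``HIP_VISIBLE_DEVICES`` as the selector are assumptions, not taken from this diff):

.. code-block:: shell

   # Expose only GPU 0 to the benchmark; remaining accelerators stay free for other runs
   HIP_VISIBLE_DEVICES=0 python3 /app/vllm/benchmarks/benchmark_throughput.py \
     --model amd/Llama-3.1-405B-Instruct-MXFP4-Preview \
     --tensor-parallel-size 1 \
     --num-prompts 64 --input-len 128 --output-len 128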
@@ -1,30 +0,0 @@
-****************************
-Benchmarking model inference
-****************************
-
-.. note::
-
-   For the latest iteration of AI training and inference performance for ROCm
-   7.0, see `Infinity Hub
-   <https://www.amd.com/en/developer/resources/infinity-hub.html#q=ROCm%207>`__
-   and the `ROCm 7.0 AI training and inference performance
-   <https://rocm.docs.amd.com/en/docs-7.0-docker/benchmark-docker/index.html>`__
-   documentation.
-
-AI inference is a process of deploying a trained machine learning model to make
-predictions or classifications on new data. By leveraging the ROCm platform's
-capabilities, you can harness the power of high-performance computing and
-efficient resource management to run inference workloads, leading to faster
-predictions and classifications on real-time data.
-
-AMD provides prebuilt, optimized environments for validating the inference
-performance of popular models on AMD Instinct™ MI355X and MI350X accelerators.
-See the following sections for instructions.
-
-.. grid::
-
-   .. grid-item-card:: Inference benchmarking
-
-      * :doc:`inference-sglang-deepseek-r1-fp4`
-
-      * :doc:`inference-vllm-llama-3.1-405b-fp4`
@@ -21,7 +21,7 @@ benchmark does not support other accelerators.
 Follow these steps to pull the required image, spin up the container with the
 appropriate options, download the model, and run the throughput test.

-1. Pull the `Docker image <https://hub.docker.com/layers/rocm/7.0-preview/rocm7.0_preview_pytorch_training_mi35X_alpha/images/sha256-734c76d4d68ab23b47f4bf012863793df11f83714c35683fb5c15bc48d0a6dd2>`__.
+1. Pull the Docker image.

    .. code-block:: shell
@@ -69,16 +69,15 @@ appropriate options, download the model, and run the throughput test.

    .. code-block:: shell

-      bash examples/llama/train_llama3.sh \
+      bash examples/llama/train_llama3.sh
       TEE_OUTPUT=1 \
       MBS=4 \
-      BS=512 \
+      BS=256 \
       TP=1 \
       TE_FP8=0 \
       SEQ_LENGTH=8192 \
       MODEL_SIZE=8 \
-      TOTAL_ITERS=10 \
-      GEMM_TUNING=1
+      TOTAL_ITERS=10

    .. note::
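The ``BS=512`` to ``BS=256`` change halves the global batch. Under Megatron-LM's usual batching conventions (an assumption about how ``train_llama3.sh`` maps these variables), the gradient-accumulation step count follows from the micro batch, the global batch, and the data-parallel width:

.. code-block:: shell

   # Hypothetical check of the batch arithmetic on an 8-GPU node with TP=1
   MBS=4 BS=256 TP=1 NGPUS=8
   DP=$((NGPUS / TP))          # data-parallel ranks: 8
   ACC=$((BS / (MBS * DP)))    # accumulation steps: 256 / (4 * 8) = 8
   echo "DP=${DP} accumulation_steps=${ACC}"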
@@ -23,7 +23,7 @@ benchmark does not support other accelerators.
 Follow these steps to pull the required image, spin up the container with the
 appropriate options, download the model, and run the throughput test.

-1. Pull the `Docker image <https://hub.docker.com/layers/rocm/7.0-preview/rocm7.0_preview_pytorch_training_mi35X_alpha/images/sha256-734c76d4d68ab23b47f4bf012863793df11f83714c35683fb5c15bc48d0a6dd2>`__.
+1. Pull the Docker image.

    .. code-block:: shell
@@ -22,11 +22,6 @@ the powerful parallel processing capabilities and efficient compute resource
 management, significantly improving training time and overall performance in
 machine learning applications.

-AMD provides ready-to-use Docker images for MI355X and MI350X series
-accelerators containing essential software components and optimizations to
-accelerate and benchmark training workloads for popular models.
-See the following sections for instructions.
-
 .. grid:: 1

    .. grid-item-card:: Training benchmarking
@@ -1,19 +1,19 @@
 ---
 myst:
   html_meta:
-    "description": "AMD ROCm 7.0 Alpha documentation"
+    "description": "AMD ROCm 7.0 Alpha 2 documentation"
     "keywords": "Radeon, open, compute, platform, install, how, conceptual, reference, home, docs"
 ---

-# AMD ROCm 7.0 Alpha documentation
+# AMD ROCm 7.0 Alpha 2 documentation

 AMD ROCm is an open-source software platform optimized to extract HPC and AI
 workload performance from AMD Instinct™ accelerators while maintaining
 compatibility with industry software frameworks.

-This documentation is intended to provide early access information about the ROCm
-software Alpha release. The preview release provides early access to new
-features under development for testing for users to provide feedback.
+This documentation provides early access information about ROCm 7.0
+Alpha 2. This preview release provides access to new
+features under development for testing so users can provide feedback.
 It is not recommended for production use.

 ```{note}
@@ -23,5 +23,5 @@ For a complete list of ROCm 7.0 preview releases, see the [ROCm 7.0 preview rele

 The documentation includes:

-- [ROCm 7.0 Alpha release notes](release.rst) with feature details and support matrix
-- [Installation instructions](install/index.rst) for the ROCm 7.0 Alpha and the Instinct Driver
+- [ROCm 7.0 Alpha 2 release notes](release.rst) with feature details and support matrix
+- [Installation instructions](install/index.rst) for the ROCm 7.0 Alpha 2 and the Instinct Driver
@@ -10,18 +10,18 @@ ROCm 7.0 Alpha installation instructions
 The ROCm 7.0 Alpha must be installed using your Linux distribution's native
 package manager. This release supports specific hardware and software
 configurations -- before installing, see the :ref:`supported OSes and hardware
-<alpha-system-requirements>` outlined in the Alpha release notes.
+<alpha-2-system-requirements>` outlined in the Alpha 2 release notes.

 .. important::

    Upgrades and downgrades are not supported. You must uninstall any existing
-   ROCm installation before installing the Alpha build.
+   ROCm installation before installing the Alpha 2 build.

 .. grid:: 2

    .. grid-item-card:: Install ROCm

-      See :doc:`Install the ROCm 7.0 Alpha via package manager <rocm>`.
+      See :doc:`Install the ROCm 7.0 Alpha 2 via package manager <rocm>`.

    .. grid-item-card:: Install Instinct Driver
@@ -81,7 +81,7 @@ Register ROCm repositories

    .. code-block:: shell

-      echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/amdgpu/30.10_alpha/ubuntu jammy main" \
+      echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/amdgpu/30.10_alpha2/ubuntu jammy main" \
          | sudo tee /etc/apt/sources.list.d/amdgpu.list
       sudo apt update

@@ -104,7 +104,7 @@ Register ROCm repositories

    .. code-block:: shell

-      echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/amdgpu/30.10_alpha/ubuntu noble main" \
+      echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/amdgpu/30.10_alpha2/ubuntu noble main" \
          | sudo tee /etc/apt/sources.list.d/amdgpu.list
       sudo apt update

@@ -116,7 +116,7 @@ Register ROCm repositories

       sudo tee /etc/yum.repos.d/amdgpu.repo <<EOF
       [amdgpu]
       name=amdgpu
-      baseurl=https://repo.radeon.com/amdgpu/30.10_alpha/rhel/9.6/main/x86_64/
+      baseurl=https://repo.radeon.com/amdgpu/30.10_alpha2/rhel/9.6/main/x86_64/
       enabled=1
       priority=50
       gpgcheck=1
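After the ``amdgpu`` repository is registered, the driver itself still has to be installed and the node rebooted. A minimal sketch for the Ubuntu path (the ``amdgpu-dkms`` package name follows the usual Instinct Driver packaging and is an assumption, not taken from this diff):

.. code-block:: shell

   # Install the kernel driver from the newly registered 30.10_alpha2 repository
   sudo apt install amdgpu-dkms
   # Reboot so the updated kernel module is loaded
   sudo reboot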
@@ -1,8 +1,8 @@
-**********************************************
-Install the ROCm 7.0 Alpha via package manager
-**********************************************
+************************************************
+Install the ROCm 7.0 Alpha 2 via package manager
+************************************************

-This page describes how to install the ROCm 7.0 Alpha build using ``apt`` on
+This page describes how to install the ROCm 7.0 Alpha 2 build using ``apt`` on
 Ubuntu 22.04 or 24.04, or ``dnf`` on Red Hat Enterprise Linux 9.6.

 .. important::
@@ -115,10 +115,10 @@ Register ROCm repositories

    .. code-block:: shell

-      echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/7.0_alpha jammy main" \
+      echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/7.0_alpha2 jammy main" \
          | sudo tee /etc/apt/sources.list.d/rocm.list

-      echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/graphics/7.0_alpha/ubuntu jammy main" \
+      echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/graphics/7.0_alpha2/ubuntu jammy main" \
          | sudo tee /etc/apt/sources.list.d/rocm-graphics.list

       echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' \
@@ -144,10 +144,10 @@ Register ROCm repositories

    .. code-block:: shell

-      echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/7.0_alpha noble main" \
+      echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/7.0_alpha2 noble main" \
          | sudo tee /etc/apt/sources.list.d/rocm.list

-      echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/graphics/7.0_alpha/ubuntu noble main" \
+      echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/graphics/7.0_alpha2/ubuntu noble main" \
          | sudo tee /etc/apt/sources.list.d/rocm-graphics.list

       echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' \
@@ -162,7 +162,7 @@ Register ROCm repositories
       sudo tee /etc/yum.repos.d/rocm.repo <<EOF
       [ROCm-7.0.0]
       name=ROCm7.0.0
-      baseurl=https://repo.radeon.com/rocm/el9/7.0_alpha/main
+      baseurl=https://repo.radeon.com/rocm/el9/7.0_alpha2/main
       enabled=1
       priority=50
       gpgcheck=1
@@ -172,7 +172,7 @@ Register ROCm repositories
       sudo tee /etc/yum.repos.d/rocm-graphics.repo <<EOF
       [ROCm-7.0.0-Graphics]
       name=ROCm7.0.0-Graphics
-      baseurl=https://repo.radeon.com/graphics/7.0_alpha/rhel/9/main/x86_64/
+      baseurl=https://repo.radeon.com/graphics/7.0_alpha2/rhel/9/main/x86_64/
       enabled=1
       priority=50
       gpgcheck=1
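With the repositories registered and pinned, the stack installs through the package manager. A minimal sketch (the ``rocm`` meta-package name and the ``rocminfo`` check follow the usual ROCm conventions and are assumptions, not taken from this diff):

.. code-block:: shell

   # Ubuntu: install the full ROCm meta-package from the 7.0_alpha2 repository
   sudo apt update && sudo apt install rocm

   # Confirm the runtime can enumerate the accelerators
   rocminfo | grep -E "gfx|Marketing"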
@@ -1,106 +1,56 @@
-****************************
-ROCm 7.0 Alpha release notes
-****************************
+******************************
+ROCm 7.0 Alpha 2 release notes
+******************************

-The ROCm 7.0 Alpha is an early look into the upcoming ROCm 7.0 major release,
-which introduces functional support for AMD Instinct™ MI355X and MI350X
-on bare metal, single node systems. It also includes new features for
+The ROCm 7.0 Alpha 2 is a preview of the upcoming ROCm 7.0 release,
+which includes functional support for AMD Instinct™ MI355X and MI350X
+on bare metal, single-node systems. It also introduces new ROCm features for
 MI300X, MI200, and MI100 series accelerators. This is an Alpha-quality release;
 expect issues and limitations that will be addressed in upcoming previews.

 .. important::

-   This Alpha release is not intended for performance evaluation.
+   The Alpha 2 release is not intended for performance evaluation.
    For the latest stable release for use in production, see the [ROCm documentation](https://rocm.docs.amd.com/en/latest/).

-This page provides a high-level summary of supported systems, key changes to the ROCm software
-stack, developments related to AI frameworks, current known limitations, and installation
-information.
+This page provides a high-level summary of key changes added to the Alpha 2
+release since `the previous Alpha
+<https://rocm.docs.amd.com/en/docs-7.0-alpha/preview/index.html>`_.

-.. _alpha-system-requirements:
+.. _alpha-2-system-requirements:

 Operating system and hardware support
 =====================================

 Only the accelerators and operating systems listed here are supported. Multi-node systems,
-virtualized environments, and GPU partitioning are not supported in this Alpha.
+virtualized environments, and GPU partitioning are not supported in the Alpha 2 release.

-* AMD accelerator: Instinct MI355X, MI350X, MI325X [#mi325x]_, MI300X, MI300A, MI250X, MI250, MI210, MI100
-* Operating system: Ubuntu 22.04, Ubuntu 24.04, or RHEL 9.6
+* AMD Instinct accelerator: MI355X, MI350X, MI325X [#mi325x]_, MI300X, MI300A, MI250X, MI250, MI210, MI100
+* Operating system: Ubuntu 22.04, Ubuntu 24.04, RHEL 9.6
 * System type: Bare metal, single node only
 * Partitioning: Not supported

 .. [#mi325x] MI325X is only supported with Ubuntu 22.04.

-.. _alpha-highlights:
+.. _alpha-2-highlights:

-Alpha release highlights
-========================
+Alpha 2 release highlights
+==========================

-This section highlights key features enabled in the ROCm 7.0 Alpha.
+This section highlights key features enabled in the ROCm 7.0 Alpha 2 release.

 AI frameworks
 -------------

-PyTorch
-~~~~~~~
-
-The ROCm 7.0 Alpha enables the following PyTorch features:
-
-* Support for PyTorch 2.7
-
-* Integrated Fused Rope kernels in APEX
-
-* Compilation of Python C++ extensions using amdclang++
-
-* Support for channels-last NHWC format for convolutions via MIOpen
-
-TensorFlow
-~~~~~~~~~~
-
-This Alpha enables support for TensorFlow 2.19.
-
-vLLM
-~~~~
-
-* Support for Open Compute Project (OCP) ``FP8`` data type
-
-* ``FP4`` precision for Llama 3.1 405B
+The ROCm 7.0 Alpha 2 release supports PyTorch 2.7, TensorFlow 2.19, and Triton 3.3.0.
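A quick way to confirm that a given preview container actually carries these framework versions is an interpreter check (``torch.version.hip`` is the standard attribute in ROCm builds of PyTorch; the one-liners themselves are illustrative, not from the release notes):

.. code-block:: shell

   # Prints the PyTorch version, the HIP version it was built against, and device visibility
   python3 -c "import torch; print(torch.__version__, torch.version.hip, torch.cuda.is_available())"
   python3 -c "import tensorflow as tf; print(tf.__version__)"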
 Libraries
 ---------

-.. _alpha-new-data-type-support:
+MIGraphX
+~~~~~~~~

-New data type support
-~~~~~~~~~~~~~~~~~~~~~
-
-MX-compliant data types bring microscaling support to ROCm. For more information, see the `OCP
-Microscaling (MX) Formats Specification
-<https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf>`_. The ROCm
-7.0 Alpha enables functional support for MX data types ``FP4``, ``FP6``, and ``FP8`` on MI355X
-systems in these ROCm libraries:
-
-* Composable Kernel (``FP4`` and ``FP8`` only)
-
-* hipBLASLt
-
-* MIGraphX (``FP4`` only)
-
-The following libraries are updated to support the Open Compute Project (OCP) floating-point ``FP8``
-format on MI355X instead of the NANOO ``FP8`` format:
-
-* Composable Kernel
-
-* hipBLASLt
-
-* hipSPARSELt
-
-* MIGraphX
-
-* rocWMMA
-
-MIGraphX now also supports ``BF16``.
+Added support for the Open Compute Project (OCP) ``FP8`` data type on MI350X accelerators.

 RCCL support
 ~~~~~~~~~~~~
@@ -108,153 +58,101 @@ RCCL support
 RCCL is supported for single-node functional usage only. Multi-node communication capabilities will
 be supported in future preview releases.

-MIGraphX
-~~~~~~~~
-
-* Support for OCP ``FP8`` and MX ``FP4`` data types on MI355X
-
-* Support for ``BF16`` on all hardware
-
-* Support for PyTorch 2.7 via Torch-MIGraphX
+Tools
+-----
+
+AMD SMI
+~~~~~~~
+
+* The default output of the ``amd-smi`` CLI now displays a simple table view.
+
+* New APIs: CPU affinity shows GPUs' affinitization to each CPU in a system.
+
+ROCgdb
+~~~~~~
+
+* MX data types support: ``FP4``, ``FP6``, and ``FP8``
+
+ROCprof Compute Viewer
+~~~~~~~~~~~~~~~~~~~~~~
+
+* Initial release: ``rocprof-compute-viewer`` allows the visualization of ``rocprofv3``'s thread
+  trace output
+
+ROCprof Trace Decoder
+~~~~~~~~~~~~~~~~~~~~~
+
+* Initial release: ``rocprof-trace-decoder`` a plugin API for decoding thread traces
+
+ROCm Compute Profiler
+~~~~~~~~~~~~~~~~~~~~~
+
+* MX data types support: ``FP4``, ``FP6``, and ``FP8``
+
+* MI355X and MI350X performance counters: CPC, SPI, SQ, TA/TD/TCP, and TCC
+
+* Enhanced roofline analysis with support for ``INT8``, ``INT32``, ``FP8``, ``FP16``, and ``BF16``
+  data types
+
+* Roofline distinction for ``FP32`` and ``FP64`` data types
+
+* Selective kernel profiling
+
+ROCm Systems Profiler
+~~~~~~~~~~~~~~~~~~~~~
+
+* Trace support for computer vision APIs: H264, H265, AV1, VP9, and JPEG
+
+* Trace support for computer vision engine activity
+
+* OpenMP for C++ language and kernel activity support
+
+ROCm Validation Suite
+~~~~~~~~~~~~~~~~~~~~~
+
+* MI355X and MI350X accelerator support in the IET (Integrated Execution Test), GST (GPU Stress Test), and Babel (memory bandwidth test) modules.
+
+ROCprofiler-SDK
+~~~~~~~~~~~~~~~
+
+* Program counter (PC) sampling (host trap-based)
+
+* API for profiling applications using thread traces (beta)
+
+* Support in ``rocprofv3`` CLI tool for thread trace service
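The new default table view and the CPU-affinity reporting can be exercised directly from the CLI. A minimal sketch (the ``topology`` subcommand follows the usual ``amd-smi`` interface and is an assumption, not taken from this diff):

.. code-block:: shell

   # The bare invocation now prints the simple table view described above
   amd-smi

   # Topology view; the new APIs surface GPU-to-CPU affinity here
   amd-smi topology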
 HIP
 ---

 The HIP runtime includes support for:

-* Open Compute Project (OCP) MX floating-point ``FP4``, ``FP6``, and ``FP8`` data types and APIs
+* Added ``constexpr`` operators for ``FP16`` and ``BF16``.

-* Improved logging by adding more precise pointer information and launch arguments for better
-  tracking and debugging in dispatch methods
+* Added ``__syncwarp`` operation.

-In addition, the HIP runtime includes the following functional improvements which improve runtime
+* The ``_sync()`` version of crosslane builtins such as ``shfl_sync()`` and
+  ``__reduce_add_sync`` are enabled by default. These can be disabled by
+  setting the preprocessor macro ``HIP_DISABLE_WARP_SYNC_BUILTINS``.
+
+In addition, the HIP runtime includes the following functional enhancements which improve runtime
 performance and user experience:

-* Optimized HIP runtime lock contention in some events and kernel handling APIs. Event processing
-  and memory object look-ups now use the shared mutex implementation. Kernel object look-up during
-  C++ kernel launch can now avoid a global lock. These changes improve performance in certain
-  applications with high usage, particularly for multiple GPUs, multiple threads, and HIP streams
-  per GPU.
+* HIP runtime now enables peer-to-peer (P2P) memory copies to utilize all
+  available SDMA engines, rather than being limited to a single engine. It also
+  selects the best engine first to give optimal bandwidth.

-* Programmatic support for scratch buffer limit on GPU device. Developers can now change the default
-  allocation size with the expected scratch limit.
+* To match CUDA runtime behavior more closely, HIP runtime APIs no longer check
+  the stream validity with streams passed as input parameters. If the input
+  stream is invalid, it causes a segmentation fault instead of returning
+  an error code ``hipErrorContextIsDestroyed``.

-* Unified managed buffer and kernel argument buffers so the HIP runtime no longer needs to create
-  and load a separate kernel argument buffer.
+The following issues have been resolved:

-* Refactored memory validation to create a unique function to validate a variety of memory copy
-  operations.
+* An issue when retrieving a memory object from the IPC memory handle causing
+  failures in some framework test applications.

-* Shader names are now demangled for more readable kernel logs
+* An issue causing the incorrect return error ``hipErrorNoDevice`` when a crash occurred
+  on a GPU due to an illegal operation or memory violation. The HIP runtime now
+  handles the failure on the GPU side properly and reports the precise error
+  code based on the last error seen on the GPU.

-See :ref:`HIP compatibility <hip-known-limitation>`.
+See :ref:`HIP compatibility <hip-known-limitation>` for more information about upcoming API changes.
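Code that relied on the pre-sync crosslane semantics can opt out at compile time with the macro named in the release note above. A minimal sketch (the source file name is a placeholder):

.. code-block:: shell

   # Build with the _sync() crosslane builtins disabled (macro taken from the release note)
   hipcc -DHIP_DISABLE_WARP_SYNC_BUILTINS reduce.cpp -o reduce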
 Compilers
 ---------

-* The compiler driver now uses parallel code generation by default when compiling using full LTO
-  (including when using the ``-fgpu-rdc`` option) for HIP. This divides the optimized LLVM IR module
-  into roughly equal partitions before instruction selection and lowering, which can help improve
-  build times.
+The Alpha 2 release introduces the AMD Next-Gen Fortran compiler. ``llvm-flang``
+(sometimes called ``new-flang`` or ``flang-18``) is a re-implementation of the
+Fortran frontend. It is a strategic replacement for ``classic-flang`` and is
+developed in LLVM's upstream repo at `<https://github.com/llvm/llvm-project/tree/main/flang>`__.

-  Each kernel in the linked LTO module may be put in a separate partition, and any non-inlined
-  function it depends on may be copied alongside it. Thus, while parallel code generation can
-  improve build time, it can duplicate non-inlined, non-kernel functions across multiple partitions,
-  potentially increasing the binary size of the final object file.
+Key enhancements include:

-* Compiler option ``-flto-partitions=<num>``.
+* Compiler:

-  Equivalent to the ``--lto-partitions=<num>`` LLD option. Controls the number of partitions used for
-  parallel code generation when using full LTO (including when using ``-fgpu-rdc``). The number of
-  partitions must be greater than 0, and a value of 1 disables the feature. The default value is 8.
+  * Improved memory load and store instructions.

-  Developers are encouraged to experiment with different numbers of partitions using the
-  ``-flto-partitions`` Clang command line option. Recommended values are 1 to 16 partitions, with
-  especially large projects containing many kernels potentially benefitting from up to 64
-  partitions. It is not recommended to use a value greater than the number of threads on the
-  machine. Smaller projects, or projects that contain only a few kernels may also not benefit at
-  all from partitioning and may even see a slight increase in build time due to the small overhead
-  of analyzing and partitioning the modules.
+  * Updated clang/llvm to `AMD clang version 20.0.0git` (equivalent to LLVM 20.0.0 with additional out-of-tree patches).

-* HIPIFY now supports NVIDIA CUDA 12.8.0 APIs. See
-  `<https://github.com/ROCm/HIPIFY/blob/amd-develop/docs/reference/supported_apis.md>`_ for more
-  information.
+  * Support added for separate debug file generation for device code.

+* Comgr:
+
+  * Added support for an in-memory virtual file system (VFS) for storing temporary files
+    generated during intermediate compilation steps. This is designed to
+    improve performance by reducing on-disk file I/O. Currently, VFS is
+    supported only for the device library link step, with plans for expanded
+    support in future releases.
+
+* SPIR-V:
+
+  * Improved `target-specific extensions <https://github.com/ROCm/llvm-project/blob/c2535466c6e40acd5ecf6ba1676a4e069c6245cc/clang/docs/LanguageExtensions.rst>`_:
+
+    * Added a new target-specific builtin ``__builtin_amdgcn_processor_is`` for late or deferred queries of the current target processor.
+
+    * Added a new target-specific builtin ``__builtin_amdgcn_is_invocable``, enabling fine-grained, per-builtin feature availability.
+
+* HIPIFY now supports NVIDIA CUDA 12.8.0 APIs:
+
+  * Added support for all new device and host APIs, including ``FP4``, ``FP6``, and ``FP128`` -- including support for the corresponding ROCm HIP equivalents.
+
+* Deprecated features:
+
+  * ROCm components no longer use the ``__AMDGCN_WAVEFRONT_SIZE`` and
+    ``__AMDGCN_WAVEFRONT_SIZE__`` macros nor HIP's ``warpSize`` variable as
+    ``constexpr``s. These macros and reliance on ``warpSize`` as a ``constexpr`` are
+    deprecated and will be disabled in a future release. Users are encouraged
+    to update their code if needed to ensure future compatibility.
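The new Fortran frontend ships as its own compiler driver. A minimal sketch of an offload build (the ``amdflang`` driver name and the ``gfx950`` target for MI350-series parts are assumptions, not taken from this diff):

.. code-block:: shell

   # Compile an OpenMP target-offload Fortran program with the Next-Gen compiler
   amdflang -fopenmp --offload-arch=gfx950 saxpy.f90 -o saxpy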
 Instinct Driver / ROCm packaging separation
 -------------------------------------------

-The Instinct Driver is now distributed separately from the ROCm software stack -- it is now stored
+The Instinct Driver is now distributed separately from the ROCm software stack and is now stored
 in its own location in the package repository at `repo.radeon.com <https://repo.radeon.com/amdgpu/>`_ under ``/amdgpu/``.
 The first release is designated as Instinct Driver version 30.10. See `ROCm Gets Modular: Meet the
 Instinct Datacenter GPU Driver
 <https://rocm.blogs.amd.com/ecosystems-and-partners/instinct-gpu-driver/README.html>`_ for more
 information.

-Forward and backward compatibility between the Instinct Driver and ROCm are not supported in this
-Alpha release. See the :doc:`installation instructions <install/index>`.
+Forward and backward compatibility between the Instinct Driver and ROCm is not supported in the
+Alpha 2 release. See the :doc:`installation instructions <install/index>`.

 Known limitations
 =================

@@ -264,7 +162,7 @@ Known limitations
 HIP compatibility
 -----------------

-HIP runtime APIs in the ROCm 7.0 Alpha do not include the upcoming backward-incompatible changes. See `HIP 7.0 Is
+HIP runtime APIs in the ROCm 7.0 Alpha 2 don't include the upcoming backward-incompatible changes. See `HIP 7.0 Is
 Coming: What You Need to Know to Stay Ahead
 <https://rocm.blogs.amd.com/ecosystems-and-partners/transition-to-hip-7.0-blog/README.html>`_ to learn about the
 changes expected for HIP.
@@ -7,7 +7,7 @@ root: preview/index
 subtrees:
 - entries:
   - file: preview/release.rst
-    title: Alpha release notes
+    title: Alpha 2 release notes
   - file: preview/install/index.rst
     title: Installation
     subtrees:
@@ -16,213 +16,3 @@ subtrees:
         title: Install ROCm
       - file: preview/install/instinct-driver
         title: Install Instinct Driver
-  - file: preview/benchmark-docker/index.rst
-    title: Docker images for AI
-    subtrees:
-    - entries:
-      - file: preview/benchmark-docker/training.rst
-        title: Training benchmarking
-        subtrees:
-        - entries:
-          - file: preview/benchmark-docker/pre-training-megatron-lm-llama-3-8b.rst
-            title: Pre-train Llama 3 8B with Megatron LM
-          - file: preview/benchmark-docker/pre-training-torchtitan-llama-3-70b.rst
-            title: Pre-train Llama 3 70B with torchtitan
-      - file: preview/benchmark-docker/inference.rst
-        title: Inference benchmarking
-        subtrees:
-        - entries:
-          - file: preview/benchmark-docker/inference-sglang-deepseek-r1-fp4.rst
-            title: Inference DeepSeek R1 FP4 with SGLang
-          - file: preview/benchmark-docker/inference-vllm-llama-3.1-405b-fp4.rst
-            title: Inference Llama 3.1 405B FP4 with vLLM
-
-# - entries:
-#   - file: what-is-rocm.rst
-#   - file: about/release-notes.md
-#     title: Release notes
-#   - file: compatibility/compatibility-matrix.rst
-#     title: Compatibility matrix
-#     entries:
-#     - url: https://rocm.docs.amd.com/projects/install-on-linux-internal/en/latest/reference/system-requirements.html
-#       title: Linux system requirements
-#     - url: https://rocm.docs.amd.com/projects/install-on-windows/en/${branch}/reference/system-requirements.html
-#       title: Windows system requirements
-#
-# - caption: Install
-#   entries:
-#   - url: https://rocm.docs.amd.com/projects/install-on-linux-internal/en/latest/
-#     title: ROCm on Linux
-#   - url: https://rocm.docs.amd.com/projects/install-on-windows/en/${branch}/
-#     title: HIP SDK on Windows
-#   - url: https://rocm.docs.amd.com/projects/radeon/en/latest/index.html
-#     title: ROCm on Radeon GPUs
-#   - file: how-to/deep-learning-rocm.md
-#     title: Deep learning frameworks
-#   - file: how-to/build-rocm.rst
-#     title: Build ROCm from source
-#
-# - caption: How to
-#   entries:
-#   - file: how-to/rocm-for-ai/index.rst
-#     title: Use ROCm for AI
-#     subtrees:
-#     - entries:
-#       - file: how-to/rocm-for-ai/install.rst
-#         title: Installation
-#       - file: how-to/rocm-for-ai/system-health-check.rst
-#         title: System health benchmarks
-#       - file: how-to/rocm-for-ai/training/index.rst
-#         title: Training
-#         subtrees:
-#         - entries:
-#           - file: how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
-#             title: Train a model with Megatron-LM
-#           - file: how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
-#             title: Train a model with PyTorch
-#           - file: how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
-#             title: Train a model with JAX MaxText
-#           - file: how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry
-#             title: Train a model with LLM Foundry
-#           - file: how-to/rocm-for-ai/training/scale-model-training.rst
-#             title: Scale model training
-#
-#       - file: how-to/rocm-for-ai/fine-tuning/index.rst
-#         title: Fine-tuning LLMs
-#         subtrees:
-#         - entries:
-#           - file: how-to/rocm-for-ai/fine-tuning/overview.rst
-#             title: Conceptual overview
-#           - file: how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference.rst
-#             title: Fine-tuning
-#             subtrees:
-#             - entries:
-#               - file: how-to/rocm-for-ai/fine-tuning/single-gpu-fine-tuning-and-inference.rst
-#                 title: Use a single accelerator
-#               - file: how-to/rocm-for-ai/fine-tuning/multi-gpu-fine-tuning-and-inference.rst
-#                 title: Use multiple accelerators
-#
-#       - file: how-to/rocm-for-ai/inference/index.rst
-#         title: Inference
-#         subtrees:
-#         - entries:
-#           - file: how-to/rocm-for-ai/inference/hugging-face-models.rst
-#             title: Run models from Hugging Face
-#           - file: how-to/rocm-for-ai/inference/llm-inference-frameworks.rst
-#             title: LLM inference frameworks
-#           - file: how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst
-#             title: vLLM inference performance testing
-#           - file: how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference.rst
-#             title: PyTorch inference performance testing
-#           - file: how-to/rocm-for-ai/inference/deploy-your-model.rst
-#             title: Deploy your model
-#
-#       - file: how-to/rocm-for-ai/inference-optimization/index.rst
-#         title: Inference optimization
-#         subtrees:
-#         - entries:
-#           - file: how-to/rocm-for-ai/inference-optimization/model-quantization.rst
-#           - file: how-to/rocm-for-ai/inference-optimization/model-acceleration-libraries.rst
-#           - file: how-to/rocm-for-ai/inference-optimization/optimizing-with-composable-kernel.md
-#             title: Optimize with Composable Kernel
-#           - file: how-to/rocm-for-ai/inference-optimization/optimizing-triton-kernel.rst
-#             title: Optimize Triton kernels
-#           - file: how-to/rocm-for-ai/inference-optimization/profiling-and-debugging.rst
-#             title: Profile and debug
-#           - file: how-to/rocm-for-ai/inference-optimization/workload.rst
-#             title: Workload optimization
-#
-#       - url: https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/
-#         title: AI tutorials
-#
-#   - file: how-to/rocm-for-hpc/index.rst
-#     title: Use ROCm for HPC
-#   - file: how-to/system-optimization/index.rst
-#     title: System optimization
-#   - file: how-to/gpu-performance/mi300x.rst
-#     title: AMD Instinct MI300X performance guides
-#   - file: how-to/system-debugging.md
-#   - file: conceptual/compiler-topics.md
-#     title: Use advanced compiler features
-#     subtrees:
-#     - entries:
-#       - url: https://rocm.docs.amd.com/projects/llvm-project/en/latest/index.html
-#         title: ROCm compiler infrastructure
-#       - url: https://rocm.docs.amd.com/projects/llvm-project/en/latest/conceptual/using-gpu-sanitizer.html
-#         title: Use AddressSanitizer
-#       - url: https://rocm.docs.amd.com/projects/llvm-project/en/latest/conceptual/openmp.html
-#         title: OpenMP support
-#   - file: how-to/setting-cus
-#     title: Set the number of CUs
-#   - file: how-to/Bar-Memory.rst
-#     title: Troubleshoot BAR access limitation
-#   - url: https://github.com/amd/rocm-examples
-#     title: ROCm examples
-#
-#
-# - caption: Conceptual
-#   entries:
-#   - file: conceptual/gpu-arch.md
-#     title: GPU architecture overview
-#     subtrees:
-#     - entries:
-#       - file: conceptual/gpu-arch/mi300.md
-#         title: MI300 microarchitecture
-#         subtrees:
-#         - entries:
-#           - url: https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/instruction-set-architectures/amd-instinct-mi300-cdna3-instruction-set-architecture.pdf
-#             title: AMD Instinct MI300/CDNA3 ISA
-#           - url: https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/white-papers/amd-cdna-3-white-paper.pdf
-#             title: White paper
-#           - file: conceptual/gpu-arch/mi300-mi200-performance-counters.rst
-#             title: MI300 and MI200 Performance counter
-#       - file: conceptual/gpu-arch/mi250.md
-#         title: MI250 microarchitecture
-#         subtrees:
-#         - entries:
-#           - url: https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf
-#             title: AMD Instinct MI200/CDNA2 ISA
-#           - url: https://www.amd.com/content/dam/amd/en/documents/instinct-business-docs/white-papers/amd-cdna2-white-paper.pdf
-#             title: White paper
-#       - file: conceptual/gpu-arch/mi100.md
-#         title: MI100 microarchitecture
-#         subtrees:
-#         - entries:
-#           - url: https://www.amd.com/system/files/TechDocs/instinct-mi100-cdna1-shader-instruction-set-architecture%C2%A0.pdf
-#             title: AMD Instinct MI100/CDNA1 ISA
-#           - url: https://www.amd.com/content/dam/amd/en/documents/instinct-business-docs/white-papers/amd-cdna-white-paper.pdf
-#             title: White paper
-#   - file: conceptual/file-reorg.md
-#     title: File structure (Linux FHS)
-#   - file: conceptual/gpu-isolation.md
-#     title: GPU isolation techniques
-#   - file: conceptual/cmake-packages.rst
-#     title: Using CMake
-#   - file: conceptual/ai-pytorch-inception.md
-#     title: Inception v3 with PyTorch
-#
-# - caption: Reference
-#   entries:
-#   - file: reference/api-libraries.md
-#     title: ROCm libraries
-#   - file: reference/rocm-tools.md
-#     title: ROCm tools, compilers, and runtimes
-#   - file: reference/gpu-arch-specs.rst
-#   - file: reference/gpu-atomics-operation.rst
-#   - file: reference/precision-support.rst
-#     title: Precision support
-#   - file: reference/graph-safe-support.rst
-#     title: Graph safe support
-#
-# - caption: Contribute
-#   entries:
-#   - file: contribute/contributing.md
-#     title: Contributing to the ROCm documentation
-#     subtrees:
-#     - entries:
-#       - file: contribute/toolchain.md
-#         title: ROCm documentation toolchain
-#       - file: contribute/building.md
-#   - file: contribute/feedback.md
-#     title: Providing feedback about the ROCm documentation
-#   - file: about/license.md
-#     title: ROCm licenses
@@ -11,5 +11,5 @@ ready(() => {
     "a.header-all-versions[href='https://rocm.docs.amd.com/en/latest/release/versions.html']",
   );
   versionListLink.textContent = "Preview versions"
-  versionListLink.href = "https://rocm.docs.amd.com/en/docs-7.0-alpha/preview/versions.html"
+  versionListLink.href = "https://rocm.docs.amd.com/en/docs-7.0-alpha-2/preview/versions.html"
 });