Compare commits


50 Commits

Author SHA1 Message Date
anisha-amd
a98d6a5777 Docs: Ray release 25.12 and compatibility version format standardization (#5845) (#5846) 2026-01-08 12:29:00 -05:00
Swati Rawat
38b271df55 Merge pull request #5843 from SwRaw/sw_cherrypick
Cherrypicking amd-smi updates from ROCm internal
2026-01-08 20:33:14 +05:30
Swati Rawat
4184d1ee1f Update docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v24.12-dev.rst
Co-authored-by: peterjunpark <git@peterjunpark.com>
2026-01-08 16:46:22 +05:30
Swati Rawat
0786c328c1 Update docs/how-to/rocm-for-ai/system-setup/prerequisite-system-validation.rst
Co-authored-by: peterjunpark <git@peterjunpark.com>
2026-01-08 16:46:22 +05:30
Swati Rawat
88ea6072f5 Update docs/how-to/rocm-for-ai/system-setup/prerequisite-system-validation.rst
Co-authored-by: peterjunpark <git@peterjunpark.com>
2026-01-08 16:46:22 +05:30
Swati Rawat
b32dcc8570 Update docs/how-to/rocm-for-ai/system-setup/prerequisite-system-validation.rst
Co-authored-by: peterjunpark <git@peterjunpark.com>
2026-01-08 16:46:22 +05:30
Swati Rawat
0faa92e922 Update docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v24.12-dev.rst
Co-authored-by: peterjunpark <git@peterjunpark.com>
2026-01-08 16:46:21 +05:30
Swati Rawat
26ae989602 Update docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v24.12-dev.rst
Co-authored-by: peterjunpark <git@peterjunpark.com>
2026-01-08 16:46:21 +05:30
srawat
4402dc4147 Update single-gpu-fine-tuning-and-inference.rst 2026-01-08 16:46:21 +05:30
srawat
5eda438e0a Update multi-gpu-fine-tuning-and-inference.rst 2026-01-08 16:46:20 +05:30
srawat
049784e1a7 Update prerequisite-system-validation.rst 2026-01-08 16:42:18 +05:30
srawat
f12169c5b7 replace rocm-smi reference with amd-smi 2026-01-08 16:42:18 +05:30
peterjunpark
b35d1a0627 fix(primus-pytorch.rst): FP8 config instead of BF16 (#5839)
(cherry picked from commit 2dc22ca890)
2026-01-07 13:51:50 -05:00
Pratik Basyal
912618cb08 ROCM-core version fixed (#5827) (#5828) 2026-01-02 16:10:16 -05:00
peterjunpark
7d2feaa8b1 Fix inconsistency in xDiT doc (#5823)
Fix inconsistency in xDiT doc

(cherry picked from commit 172b0f7c08)
2025-12-29 10:29:59 -05:00
peterjunpark
7d0d114994 Merge pull request #5821 from peterjunpark/docs/7.1.1
[docs/7.1.1] Add xDiT and Primus doc updates
2025-12-29 08:49:44 -05:00
peterjunpark
2a65394e32 Update docs for xDiT diffusion inference 25.13 Docker release (#5820)
* archive previous version

* add xdit 25.13

* update history index

* add perf results section

(cherry picked from commit c67fac78bd)
2025-12-29 08:45:29 -05:00
peterjunpark
268c1332c9 Update training docs for Primus/25.11 (#5819)
* update conf and toc.yml.in

* archive previous versions

archive data files

update anchors

* primus pytorch: remove training batch size args

* update primus megatron run cmds

multi-node

* update primus pytorch

update

* update

update

* update docker tag

(cherry picked from commit e0b8ec4dfb)
2025-12-29 08:45:17 -05:00
Pratik Basyal
374e0944dc OS table removed from compatibility table [develop] (#5810) (#5811)
* OS table removed from compatibility table

* Feedback added

* Azure Linux 3.0 and compatibility version update

* Version fix

* Review feedback added

* Minor change
2025-12-23 16:38:03 -05:00
peterjunpark
512e311041 Update xdit diffusion inference history (#5808) (#5809)
* Update xdit diffusion inference history

* fix

(cherry picked from commit 3a43bacdda)
2025-12-22 11:14:57 -05:00
peterjunpark
ad4f486635 fix link to ROCm PyT docker image (#5803) (#5804)
(cherry picked from commit 48d8fe139b)
2025-12-19 15:51:20 -05:00
peterjunpark
485886712b clean up formatting in FA2 page (#5795) (#5796)
(cherry picked from commit 7455fe57b8)
2025-12-19 09:38:20 -05:00
peterjunpark
1cd6a14a22 Update Flash Attention guidance in "Model acceleration libraries" (#5793)
* flash attention update

Signed-off-by: seungrok.jung <seungrok.jung@amd.com>

flash attention update

Signed-off-by: seungrok.jung <seungrok.jung@amd.com>

flash attention update

Signed-off-by: seungrok.jung <seungrok.jung@amd.com>

sentence-case heading

* Update docs/how-to/rocm-for-ai/inference-optimization/model-acceleration-libraries.rst

Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>

* Apply suggestions from code review

Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>

---------

Co-authored-by: seungrok.jung <seungrok.jung@amd.com>
Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>
(cherry picked from commit 52c0a47e84)
2025-12-19 09:00:40 -05:00
peterjunpark
a17f04a3b5 Update documentation for JAX training MaxText 25.11 release (#5789) (#5790)
(cherry picked from commit cbab9a465d)
2025-12-18 11:26:42 -05:00
peterjunpark
94de66ef3f [docs/7.1.1] Publish vLLM and xDiT doc updates (#5787)
* vLLM inference benchmark 1210 (#5776)

* Archive previous ver

fix anchors

* Update vllm.rst and data yaml for 20251210

(cherry picked from commit 1b4f25733d)

* xDiT diffusion inference v25.12 documentation update (#5786)

* Add xdit-diffusion ROCm docs page.

* Update template formatting and fix sphinx warnings

* Add System Validation section.

* Add sw component versions/commits.

* Update to use latest v25.10 image instead of v25.9

* Update commands and add FLUX instructions.

* Update Flux instructions. Change image tag. Describe as diffusion inference instead of specifically video.

* git rm xdit-video-diffusion.rst

* Docs for v25.12

* Add hyperlinks to components

* Command fixes

* -Diffusers suffix

* Simplify yaml file and cleanup main rst page.

* Spelling, added 'js'

* fix merge conflict

fix

---------

Co-authored-by: Kristoffer <kristoffer.torp@amd.com>
(cherry picked from commit 459283da3c)

---------

Co-authored-by: Kristoffer <kristoffer.torp@amd.com>
2025-12-17 10:28:30 -05:00
Pratik Basyal
e5cebe7b4e Taichi removed from ROCm docs [Develop] (#5779) (#5781)
* Taichi removed from ROCm docs

* Warnings fixed
2025-12-16 13:24:12 -05:00
Pratik Basyal
7047cfa19c Onnx and rocshmem version updated (#5760) (#5764) 2025-12-11 17:11:05 -05:00
Matt Williams
de71bf5fa7 Merge pull request #5759 from ROCm/cherry-pick-701
Fixing link redirects (#5758)
2025-12-10 11:39:53 -05:00
Matt Williams
0d17c96f7f Fixing link redirects (#5758)
* Update multi-gpu-fine-tuning-and-inference.rst

* Update pytorch-training-v25.6.rst

* Update pytorch-compatibility.rst
2025-12-10 11:31:26 -05:00
anisha-amd
2f8c99f7f0 Docs: update verl compatibility - fix (#5755) 2025-12-09 19:52:12 -05:00
anisha-amd
982927e866 Docs: verl framework - compatibility - 25.11 release (#5752) (#5753) 2025-12-09 12:02:20 -05:00
peterjunpark
8f45b791fe Fix Primus PyTorch doc: training.batch_size -> training.local_batch_size (#5748) (#5749)
(cherry picked from commit bf74351e5a)
2025-12-08 13:59:00 -05:00
yugang-amd
f7c7587b10 xdit-diffusion v25.11 docs (#5743) 2025-12-05 17:08:21 -05:00
Pratik Basyal
96b3c0d4f3 PyTorch 2.7 support added (#5740) (#5741) 2025-12-04 17:00:34 -05:00
peterjunpark
d6d4d2ef92 fix docker hub links for primus:v25.10 (#5738)
(cherry picked from commit 453751a86f)
2025-12-04 09:21:53 -05:00
peterjunpark
8647ebcf76 Update training Docker docs for Primus 25.10 (#5737)
(cherry picked from commit fb644412d5)
2025-12-04 09:21:53 -05:00
Pratik Basyal
48ca38b0dc Conflict resolved (#5735) 2025-12-03 09:02:57 -05:00
Istvan Kiss
acbd671e99 JAX key features and enhancements (#5708)
Co-authored-by: Pratik Basyal <prbasyal@amd.com>
2025-12-01 19:52:07 +01:00
Pratik Basyal
133a97ec18 711 post GA known issue update [docs/711] (#5723)
* 7.1.1 known issues post GA (#5721)

* rocblas known issues added

* Minor change

* Update RELEASE.md

Co-authored-by: Jeffrey Novotny <jnovotny@amd.com>

* Resolved

* Update RELEASE.md

Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>

---------

Co-authored-by: Jeffrey Novotny <jnovotny@amd.com>
Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>

* GitHub Issue added

---------

Co-authored-by: Jeffrey Novotny <jnovotny@amd.com>
Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>
2025-11-30 00:16:26 -05:00
Pratik Basyal
2d40066f29 Merged cell removed for coloring issue (#5713) (#5714) 2025-11-27 20:03:11 -05:00
ROCm Docs Automation
5d7fdace0e Update rocm-docs-core to 1.30.0 2025-11-26 17:09:50 -05:00
Istvan Kiss
7dbcdc5deb Update release notes links ROCm 7.1.1 (#5705) 2025-11-26 20:02:33 +01:00
Pratik Basyal
a966db29ca Known issue from 7.1.0 removed (#5702) (#5703) 2025-11-26 12:30:28 -05:00
Pratik Basyal
9ea8a48b3a Link and PyTorch version updated (#5700) (#5701) 2025-11-26 12:01:12 -05:00
Alex Xu
9956d72614 fix dependency 2025-11-26 11:42:22 -05:00
Alex Xu
305d24f486 Merge branch 'roc-7.1.x' into docs/7.1.1 2025-11-26 11:37:06 -05:00
Alex Xu
26f6b6b3e1 Merge branch 'roc-7.1.x' into docs/7.1.1 2025-11-26 11:29:02 -05:00
Alex Xu
d4cdbd79a3 Merge branch 'develop' into docs/7.1.1 2025-11-26 08:47:19 -05:00
alexxu-amd
26d1ab7d27 Update documentation requirements 2025-11-25 16:30:46 -05:00
alexxu-amd
272c9f6be3 Update documentation requirements 2025-11-25 15:37:04 -05:00
14 changed files with 156 additions and 189 deletions

View File

@@ -34,7 +34,6 @@ parameters:
default:
- cmake
- libnuma-dev
- libsimde-dev
- mesa-common-dev
- ninja-build
- ocl-icd-libopencl1

View File

@@ -39,7 +39,6 @@ parameters:
- python3
- python3-dev
- python3-pip
- python3-venv
- libgtest-dev
- libboost-filesystem-dev
- libboost-program-options-dev
@@ -47,8 +46,6 @@ parameters:
type: object
default:
- nanobind>=2.0.0
- pytest
- pytest-cov
- name: rocmDependencies
type: object
default:
@@ -75,10 +72,8 @@ parameters:
- { os: ubuntu2204, packageManager: apt }
- { os: almalinux8, packageManager: dnf }
testJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
# - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
# - { os: ubuntu2204, packageManager: apt, target: gfx1151 }
# - { os: ubuntu2204, packageManager: apt, target: gfx1201 }
- name: downstreamComponentMatrix
type: object
default:
@@ -121,11 +116,6 @@ jobs:
parameters:
dependencyList:
- gtest
- ${{ if ne(job.os, 'almalinux8') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
parameters:
dependencyList:
- catch2
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
@@ -147,7 +137,6 @@ jobs:
-DORIGAMI_BUILD_SHARED_LIBS=ON
-DORIGAMI_ENABLE_PYTHON=ON
-DORIGAMI_BUILD_TESTING=ON
-DORIGAMI_ENABLE_FETCH=ON
-GNinja
- ${{ if ne(job.os, 'almalinux8') }}:
- task: PublishPipelineArtifact@1
@@ -180,6 +169,7 @@ jobs:
dependsOn: origami_build_${{ job.os }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
eq(${{ parameters.aggregatePipeline }}, False)
)
@@ -190,30 +180,30 @@ jobs:
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
parameters:
dependencyList:
- gtest
- ${{ if ne(job.os, 'almalinux8') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
parameters:
dependencyList:
- catch2
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
preTargetFilter: ${{ parameters.componentName }}
os: ${{ job.os }}
- task: DownloadPipelineArtifact@2
displayName: 'Download Build Directory Artifact'
inputs:
artifact: '${{ parameters.componentName }}_${{ job.os }}_build_dir'
path: '$(Agent.BuildDirectory)/s/build'
- task: DownloadPipelineArtifact@2
displayName: 'Download Python Source Artifact'
inputs:
artifact: '${{ parameters.componentName }}_${{ job.os }}_python_src'
path: '$(Agent.BuildDirectory)/s/python'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
@@ -222,72 +212,25 @@ jobs:
gpuTarget: ${{ job.target }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- task: CMake@1
displayName: 'Origami Test CMake Configuration'
inputs:
cmakeArgs: >-
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DORIGAMI_BUILD_SHARED_LIBS=ON
-DORIGAMI_ENABLE_PYTHON=ON
-DORIGAMI_BUILD_TESTING=ON
-GNinja
$(Agent.BuildDirectory)/s
- task: Bash@3
displayName: 'Build Origami Tests and Python Bindings'
inputs:
targetType: inline
workingDirectory: build
script: |
cmake --build . --target origami-tests origami_python -- -j$(nproc)
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
# Run tests using CTest (discovers and runs both C++ and Python tests)
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
testDir: 'build'
testParameters: '--output-on-failure --force-new-ctest-process --output-junit test_output.xml'
# Test pip install workflow
# - task: Bash@3
# displayName: 'Test Pip Install'
# inputs:
# targetType: inline
# script: |
# set -e
# echo "==================================================================="
# echo "Testing pip install workflow (pip install -e .)"
# echo "==================================================================="
# # Set environment variables for pip install CMake build
# export ROCM_PATH=$(Agent.BuildDirectory)/rocm
# export CMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm:$(Agent.BuildDirectory)/vendor
# export CMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
# echo "ROCM_PATH: $ROCM_PATH"
# echo "CMAKE_PREFIX_PATH: $CMAKE_PREFIX_PATH"
# echo "CMAKE_CXX_COMPILER: $CMAKE_CXX_COMPILER"
# echo ""
# # Install from source directory
# cd "$(Agent.BuildDirectory)/s/python"
# pip install -e .
# # Verify import works
# echo ""
# echo "Verifying origami can be imported..."
# python3 -c "import origami; print('✓ Successfully imported origami')"
# # Run pytest on installed package
# echo ""
# echo "Running pytest tests..."
# python3 -m pytest tests/ -v -m "not slow" --tb=short
# echo ""
# echo "==================================================================="
# echo "Pip install test completed successfully"
# echo "==================================================================="
testDir: '$(Agent.BuildDirectory)/rocm/bin'
testExecutable: './origami-tests'
testParameters: '--yaml origami-tests.yaml --gtest_output=xml:./test_output.xml --gtest_color=yes'
- script: |
set -e
export PYTHONPATH=$(Agent.BuildDirectory)/s/build/python:$PYTHONPATH
echo "--- Running origami_test.py ---"
python3 $(Agent.BuildDirectory)/s/python/origami_test.py
echo "--- Running origami_grid_test.py ---"
python3 $(Agent.BuildDirectory)/s/python/origami_grid_test.py
displayName: 'Run Python Binding Tests'
condition: succeeded()
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}

View File

@@ -30,7 +30,6 @@ parameters:
- python3-pip
- protobuf-compiler
- libprotoc-dev
- libopencv-dev
- name: pipModules
type: object
default:
@@ -65,7 +64,6 @@ parameters:
- MIVisionX
- rocm_smi_lib
- rccl
- rocAL
- rocALUTION
- rocBLAS
- rocDecode
@@ -105,7 +103,6 @@ parameters:
- MIVisionX
- rocm_smi_lib
- rccl
- rocAL
- rocALUTION
- rocBLAS
- rocDecode

View File

@@ -36,6 +36,7 @@ Andrej
Arb
Autocast
autograd
Backported
BARs
BatchNorm
BLAS
@@ -203,9 +204,11 @@ GenAI
GenZ
GitHub
Gitpod
hardcoded
HBM
HCA
HGX
HLO
HIPCC
hipDataType
HIPExtension
@@ -333,6 +336,7 @@ MoEs
Mooncake
Mpops
Multicore
multihost
Multithreaded
mx
MXFP
@@ -1027,6 +1031,7 @@ uncacheable
uncorrectable
underoptimized
unhandled
unfused
uninstallation
unmapped
unsqueeze

View File

@@ -270,26 +270,26 @@ The [ROCm examples repository](https://github.com/ROCm/rocm-examples) has been e
:margin: auto 0 auto auto
:::{grid}
:margin: auto 0 auto auto
* [hipBLASLt](https://rocm.docs.amd.com/projects/hipBLASLt/en/latest/)
* [hipSPARSE](https://rocm.docs.amd.com/projects/hipSPARSE/en/latest/)
* [hipSPARSELt](https://rocm.docs.amd.com/projects/hipSPARSELt/en/latest/)
* [hipTensor](https://rocm.docs.amd.com/projects/hipTensor/en/latest/)
* [hipBLASLt](https://github.com/ROCm/rocm-examples/tree/amd-staging/Libraries/hipBLASLt)
* [hipSPARSE](https://github.com/ROCm/rocm-examples/tree/amd-staging/Libraries/hipSPARSE)
* [hipSPARSELt](https://github.com/ROCm/rocm-examples/tree/amd-staging/Libraries/hipSPARSELt)
* [hipTensor](https://github.com/ROCm/rocm-examples/tree/amd-staging/Libraries/hipTensor)
:::
:::{grid}
:margin: auto 0 auto auto
* [rocALUTION](https://rocm.docs.amd.com/projects/rocALUTION/en/latest/)
* [ROCprofiler-SDK](https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/)
* [rocWMMA](https://rocm.docs.amd.com/projects/rocWMMA/en/latest/)
* [rocALUTION](https://github.com/ROCm/rocm-examples/tree/amd-staging/Libraries/rocALUTION)
* [ROCprofiler-SDK](https://github.com/ROCm/rocm-examples/tree/amd-staging/Libraries/rocProfiler-SDK)
* [rocWMMA](https://github.com/ROCm/rocm-examples/tree/amd-staging/Libraries/rocWMMA)
:::
::::
Usage examples are now available for the following performance analysis tools:
* [ROCm Compute Profiler](https://rocm.docs.amd.com/projects/rocprofiler-compute/en/latest/index.html)
* [ROCm Systems Profiler](https://rocm.docs.amd.com/projects/rocprofiler-systems/en/latest/index.html)
* [rocprofv3](https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/how-to/using-rocprofv3.html)
* [ROCm Compute Profiler](https://github.com/ROCm/rocm-examples/tree/amd-staging/Tools/rocprof-compute)
* [ROCm Systems Profiler](https://github.com/ROCm/rocm-examples/tree/amd-staging/Tools/rocprof-systems)
* [rocprofv3](https://github.com/ROCm/rocm-examples/tree/amd-staging/Tools/rocprofv3)
The complete source code for the [HIP Graph Tutorial](https://rocm.docs.amd.com/projects/HIP/en/latest/tutorial/graph_api.html) is also available as part of the ROCm examples.
The complete source code for the [HIP Graph Tutorial](https://github.com/ROCm/rocm-examples/tree/amd-staging/HIP-Doc/Tutorials/graph_api) is also available as part of the ROCm examples.
### ROCm documentation updates

View File

@@ -269,6 +269,33 @@ For a complete and up-to-date list of JAX public modules (for example, ``jax.num
JAX API modules are maintained by the JAX project and are subject to change.
Refer to the official JAX documentation for the most up-to-date information.
Key features and enhancements for ROCm 7.1
===============================================================================
- Enabled compilation of multihost HLO runner Python bindings.
- Backported multihost HLO runner bindings and some related changes to
:code:`FunctionalHloRunner`.
- Added :code:`requirements_lock_3_12` to enable building for Python 3.12.
- Removed hardcoded NHWC convolution layout for ``fp16`` precision to address the performance drops for ``fp16`` precision on gfx12xx GPUs.
- ROCprofiler-SDK integration:
- Integrated ROCprofiler-SDK (v3) into XLA to improve profiling of GPU events,
supporting both time-based and step-based profiling.
- Added unit tests for :code:`rocm_collector` and :code:`rocm_tracer`.
- Added Triton unsupported conversion from ``f8E4M3FNUZ`` to ``fp16`` with
rounding mode.
- Introduced :code:`CudnnFusedConvDecomposer` to revert fused convolutions
when :code:`ConvAlgorithmPicker` fails to find a fused algorithm, and removed
unfused fallback paths from :code:`RocmFusedConvRunner`.
Key features and enhancements for ROCm 7.0
===============================================================================

View File

@@ -268,6 +268,3 @@ html_context = {
"granularity_type" : [('Coarse-grained', 'coarse-grained'), ('Fine-grained', 'fine-grained')],
"scope_type" : [('Device', 'device'), ('System', 'system')]
}
# Disable figure and table numbering
numfig = False

View File

@@ -44,7 +44,7 @@ Setting up the base implementation environment
.. code-block:: shell
rocm-smi --showproductname
amd-smi static --board
#. Check that your GPUs are available to PyTorch.
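A minimal sketch of that PyTorch availability check (an illustration, not part of the diff; it assumes a ROCm build of PyTorch, which exposes HIP devices through the ``torch.cuda`` API):
.. code-block:: python
   import torch
   # On ROCm builds of PyTorch, torch.cuda reports HIP devices.
   print(torch.cuda.is_available())   # True when at least one GPU is visible
   print(torch.cuda.device_count())   # number of visible GPUs
   for i in range(torch.cuda.device_count()):
       print(torch.cuda.get_device_name(i))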
@@ -65,8 +65,8 @@ Setting up the base implementation environment
.. tip::
During training and inference, you can check the memory usage by running the ``rocm-smi`` command in your terminal.
This tool helps you see shows which GPUs are involved.
During training and inference, you can check the memory usage by running the ``amd-smi`` command in your terminal.
This tool helps you see which GPUs are involved.
.. _fine-tuning-llms-multi-gpu-hugging-face-accelerate:
@@ -91,10 +91,10 @@ Now, it's important to adjust how you load the model. Add the ``device_map`` par
...
base_model_name = "meta-llama/Llama-2-7b-chat-hf"
# Load base model to GPU memory
base_model = AutoModelForCausalLM.from_pretrained(
base_model_name,
base_model_name,
device_map = "auto",
trust_remote_code = True)
...
@@ -130,7 +130,7 @@ After loading the model in this way, the model is fully ready to use the resourc
torchtune for fine-tuning and inference
=============================================
`torchtune <https://meta-pytorch.org/torchtune/main/>`_ is a PyTorch-native library for easy single and multi-GPU
`torchtune <https://pytorch.org/torchtune/main/>`_ is a PyTorch-native library for easy single and multi-GPU
model fine-tuning and inference with LLMs.
#. Install torchtune using pip.
@@ -139,7 +139,7 @@ model fine-tuning and inference with LLMs.
# Install torchtune with PyTorch release 2.2.2+
pip install torchtune
# To confirm that the package is installed correctly
tune --help
@@ -148,12 +148,12 @@ model fine-tuning and inference with LLMs.
.. code-block:: shell
usage: tune [-h] {download,ls,cp,run,validate} ...
Welcome to the TorchTune CLI!
options:
-h, --help show this help message and exit
subcommands:
{download,ls,cp,run,validate}
@@ -194,11 +194,11 @@ model fine-tuning and inference with LLMs.
apply_lora_to_output: False
lora_rank: 8
lora_alpha: 16
tokenizer:
_component_: torchtune.models.llama2.llama2_tokenizer
path: /tmp/Llama-2-7b-hf/tokenizer.model
# Dataset and sampler
dataset:
_component_: torchtune.datasets.alpaca_cleaned_dataset

View File

@@ -44,20 +44,19 @@ Setting up the base implementation environment
.. code-block:: shell
rocm-smi --showproductname
amd-smi static --board
Your output should look like this:
.. code-block:: shell
============================ ROCm System Management Interface ============================
====================================== Product Info ======================================
GPU[0] : Card Series: AMD Instinct MI300X OAM
GPU[0] : Card model: 0x74a1
GPU[0] : Card vendor: Advanced Micro Devices, Inc. [AMD/ATI]
GPU[0] : Card SKU: MI3SRIOV
==========================================================================================
================================== End of ROCm SMI Log ===================================
GPU: 0
BOARD:
MODEL_NUMBER: 102-G39203-0B
PRODUCT_SERIAL: PCB079220-1150
FRU_ID: 113-AMDG392030B04-100-300000097H
PRODUCT_NAME: AMD Instinct MI325 OAM
MANUFACTURER_NAME: AMD
#. Check that your GPUs are available to PyTorch.
@@ -94,13 +93,13 @@ Setting up the base implementation environment
pip install -r requirements-dev.txt
cmake -DBNB_ROCM_ARCH="gfx942" -DCOMPUTE_BACKEND=hip -S .
python setup.py install
# To leverage the SFTTrainer in TRL for model fine-tuning.
pip install trl
# To leverage PEFT for efficiently adapting pre-trained language models .
pip install peft
# Install the other dependencies.
pip install transformers datasets huggingface-hub scipy
@@ -132,7 +131,7 @@ Download the base model and fine-tuning dataset
.. note::
You can also use the `NousResearch Llama-2-7b-chat-hf <https://huggingface.co/NousResearch/Llama-2-7b-chat-hf>`_
You can also use the `NousResearch Llama-2-7b-chat-hf <https://huggingface.co/NousResearch/Llama-2-7b-chat-hf>`_
as a substitute. It has the same model weights as the original.
#. Run the following code to load the base model and tokenizer.
@@ -141,14 +140,14 @@ Download the base model and fine-tuning dataset
# Base model and tokenizer names.
base_model_name = "meta-llama/Llama-2-7b-chat-hf"
# Load base model to GPU memory.
device = "cuda:0"
base_model = AutoModelForCausalLM.from_pretrained(base_model_name, trust_remote_code = True).to(device)
# Load tokenizer.
tokenizer = AutoTokenizer.from_pretrained(
base_model_name,
base_model_name,
trust_remote_code = True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
@@ -162,10 +161,10 @@ Download the base model and fine-tuning dataset
# Dataset for fine-tuning.
training_dataset_name = "mlabonne/guanaco-llama2-1k"
training_dataset = load_dataset(training_dataset_name, split = "train")
# Check the data.
print(training_dataset)
# Dataset 11 is a QA sample in English.
print(training_dataset[11])
@@ -252,8 +251,8 @@ Compare the number of trainable parameters and training time under the two diffe
dataset_text_field = "text",
tokenizer = tokenizer,
args = training_arguments
)
)
# Run the trainer.
sft_trainer.train()
@@ -286,7 +285,7 @@ Compare the number of trainable parameters and training time under the two diffe
if param.requires_grad:
trainable_params += param.numel()
print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}")
sft_trainer.peft_config = None
print_trainable_parameters(sft_trainer.model)
@@ -309,8 +308,8 @@ Compare the number of trainable parameters and training time under the two diffe
dataset_text_field = "text",
tokenizer = tokenizer,
args = training_arguments
)
)
# Training.
trainer_full.train()
@@ -349,7 +348,7 @@ store, and load.
# PEFT adapter name.
adapter_name = "llama-2-7b-enhanced-adapter"
# Save PEFT adapter.
sft_trainer.model.save_pretrained(adapter_name)
@@ -359,21 +358,21 @@ store, and load.
# Access adapter directory.
cd llama-2-7b-enhanced-adapter
# List all adapter files.
README.md adapter_config.json adapter_model.safetensors
.. tab-item:: Saving a fully fine-tuned model
:sync: without
If you're not using LoRA and PEFT so there is no PEFT LoRA configuration used for training, use the following code
If you're not using LoRA and PEFT so there is no PEFT LoRA configuration used for training, use the following code
to save your fine-tuned model to your system.
.. code-block:: python
# Fully fine-tuned model name.
new_model_name = "llama-2-7b-enhanced"
# Save the fully fine-tuned model.
full_trainer.model.save_pretrained(new_model_name)
@@ -383,7 +382,7 @@ store, and load.
# Access new model directory.
cd llama-2-7b-enhanced
# List all model files.
config.json model-00002-of-00006.safetensors model-00005-of-00006.safetensors
generation_config.json model-00003-of-00006.safetensors model-00006-of-00006.safetensors
@@ -412,26 +411,26 @@ Let's look at achieving model inference using these types of models.
.. tab-item:: Inference using PEFT adapters
To use PEFT adapters like a normal transformer model, you can run the generation by loading a base model along with PEFT
To use PEFT adapters like a normal transformer model, you can run the generation by loading a base model along with PEFT
adapters as follows.
.. code-block:: python
from peft import PeftModel
from transformers import AutoModelForCausalLM
# Set the path of the model or the name on Hugging face hub
base_model_name = "meta-llama/Llama-2-7b-chat-hf"
# Set the path of the adapter
adapter_name = "Llama-2-7b-enhanced-adpater"
# Load base model
# Load base model
base_model = AutoModelForCausalLM.from_pretrained(base_model_name)
# Adapt the base model with the adapter
# Adapt the base model with the adapter
new_model = PeftModel.from_pretrained(base_model, adapter_name)
# Then, run generation as the same with a normal model outlined in 2.1
The PEFT library provides a ``merge_and_unload`` method, which merges the adapter layers into the base model. This is
@@ -439,13 +438,13 @@ Let's look at achieving model inference using these types of models.
.. code-block:: python
# Load base model
# Load base model
base_model = AutoModelForCausalLM.from_pretrained(base_model_name)
# Adapt the base model with the adapter
# Adapt the base model with the adapter
new_model = PeftModel.from_pretrained(base_model, adapter_name)
# Merge adapter
# Merge adapter
model = model.merge_and_unload()
# Save the merged model into local
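Pieced together, the adapter merge-and-save flow shown in this hunk looks roughly like the sketch below (the base model and adapter names are the illustrative ones used earlier on this page; the output directory name is hypothetical):
.. code-block:: python
   from peft import PeftModel
   from transformers import AutoModelForCausalLM
   base_model_name = "meta-llama/Llama-2-7b-chat-hf"
   adapter_name = "llama-2-7b-enhanced-adapter"
   # Load the base model and attach the PEFT (LoRA) adapter.
   base_model = AutoModelForCausalLM.from_pretrained(base_model_name)
   new_model = PeftModel.from_pretrained(base_model, adapter_name)
   # Fold the adapter weights into the base weights so the result behaves
   # like a plain transformers model.
   merged_model = new_model.merge_and_unload()
   # Save the merged model locally.
   merged_model.save_pretrained("llama-2-7b-enhanced-merged")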
@@ -461,25 +460,25 @@ Let's look at achieving model inference using these types of models.
# Import relevant class for loading model and tokenizer
from transformers import AutoTokenizer, AutoModelForCausalLM
# Set the pre-trained model name on Hugging face hub
model_name = "meta-llama/Llama-2-7b-chat-hf"
# Set device type
# Set device type
device = "cuda:0"
# Load model and tokenizer
# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Input prompt encoding
# Input prompt encoding
query = "What is a large language model?"
inputs = tokenizer.encode(query, return_tensors="pt").to(device)
# Token generation
outputs = model.generate(inputs)
# Outputs decoding
# Token generation
outputs = model.generate(inputs)
# Outputs decoding
print(tokenizer.decode(outputs[0]))
In addition, pipelines from Transformers offer simple APIs to use pre-trained models for different tasks, including
@@ -490,14 +489,14 @@ Let's look at achieving model inference using these types of models.
# Import relevant class for loading model and tokenizer
from transformers import pipeline
# Set the path of your model or the name on Hugging face hub
model_name_or_path = "meta-llama/Llama-2-7b-chat-hf"
# Set pipeline
# Set pipeline
# A positive device value will run the model on associated CUDA device id
pipe = pipeline("text-generation", model=model_name_or_path, device=0)
# Token generation
print(pipe("What is a large language model?")[0]["generated_text"])

View File

@@ -31,16 +31,16 @@ in the Instinct documentation for more information.
Hardware verification with ROCm
-------------------------------
Use the command ``rocm-smi --setperfdeterminism 1900`` to set the max clock speed up to 1900 MHz
Use the command ``amd-smi set --perf-determinism 1900`` to set the max clock speed up to 1900 MHz
instead of the default 2100 MHz. This can reduce the chance of a PCC event lowering the attainable
GPU clocks. This setting will not be required for new IFWI releases with the production PRC feature.
You can restore this setting to its default value with the ``rocm-smi -r`` command.
You can restore this setting to its default value with the ``amd-smi reset --clocks`` command.
Run the command:
.. code-block:: shell
rocm-smi --setperfdeterminism 1900
amd-smi set --perf-determinism 1900
See `Hardware verification for ROCm <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#hardware-verification-with-rocm>`_
in the Instinct documentation for more information.

View File

@@ -108,16 +108,16 @@ for more information.
Hardware verification with ROCm
-------------------------------
Use the command ``rocm-smi --setperfdeterminism 1900`` to set the max clock speed up to 1900 MHz
Use the command ``amd-smi set --perf-determinism 1900`` to set the max clock speed up to 1900 MHz
instead of the default 2100 MHz. This can reduce the chance of a PCC event lowering the attainable
GPU clocks. This setting will not be required for new IFWI releases with the production PRC feature.
You can restore this setting to its default value with the ``rocm-smi -r`` command.
You can restore this setting to its default value with the ``amd-smi reset --clocks`` command.
Run the command:
.. code-block:: shell
rocm-smi --setperfdeterminism 1900
amd-smi set --perf-determinism 1900
See `Hardware verification with ROCm <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#hardware-verification-with-rocm>`_ for more information.
@@ -248,7 +248,7 @@ Download the Docker image and required packages
Checking out this specific commit is recommended for a stable and reproducible environment.
.. code-block:: shell
git checkout bb93ccbfeae6363c67b361a97a27c74ab86e7e92
Prepare training datasets

View File

@@ -1,4 +1,4 @@
rocm-docs-core==1.31.1
rocm-docs-core==1.30.0
sphinx-reredirects
sphinx-sitemap
sphinxcontrib.datatemplates==0.11.0

View File

@@ -132,7 +132,6 @@ nest-asyncio==1.6.0
packaging==25.0
# via
# ipykernel
# pydata-sphinx-theme
# sphinx
parso==0.8.5
# via jedi
@@ -150,7 +149,7 @@ pure-eval==0.2.3
# via stack-data
pycparser==2.23
# via cffi
pydata-sphinx-theme==0.15.4
pydata-sphinx-theme==0.16.1
# via
# rocm-docs-core
# sphinx-book-theme
@@ -164,7 +163,7 @@ pygments==2.19.2
# sphinx
pyjwt[crypto]==2.10.1
# via pygithub
pynacl==1.6.2
pynacl==1.6.1
# via pygithub
python-dateutil==2.9.0.post0
# via jupyter-client
@@ -188,7 +187,7 @@ requests==2.32.5
# via
# pygithub
# sphinx
rocm-docs-core==1.31.1
rocm-docs-core==1.30.0
# via -r requirements.in
rpds-py==0.29.0
# via
@@ -218,7 +217,7 @@ sphinx==8.1.3
# sphinx-reredirects
# sphinxcontrib-datatemplates
# sphinxcontrib-runcmd
sphinx-book-theme==1.1.4
sphinx-book-theme==1.1.3
# via rocm-docs-core
sphinx-copybutton==0.5.2
# via rocm-docs-core
@@ -282,7 +281,7 @@ typing-extensions==4.15.0
# pygithub
# referencing
# sqlalchemy
urllib3==2.6.3
urllib3==2.5.0
# via
# pygithub
# requests

View File

@@ -123,7 +123,8 @@ Performance
.. note::
`ROCprof Compute Viewer <https://rocm.docs.amd.com/projects/rocprof-compute-viewer/en/amd-mainline/>`_ is a tool for visualizing and analyzing GPU thread trace data collected using :doc:`rocprofv3 <rocprofiler-sdk:index>`. Note that `ROCprof Compute Viewer <https://rocm.docs.amd.com/projects/rocprof-compute-viewer/en/amd-mainline/>`_ is in an early access state. Running production workloads is not recommended.
`ROCprof Compute Viewer <https://rocm.docs.amd.com/projects/rocprof-compute-viewer/en/amd-mainline/>`_ is a tool for visualizing and analyzing GPU thread trace data collected using :doc:`rocprofv3 <rocprofiler-sdk:index>`.
Note that `ROCprof Compute Viewer <https://rocm.docs.amd.com/projects/rocprof-compute-viewer/en/amd-mainline/>`_ is in an early access state. Running production workloads is not recommended.
Development
^^^^^^^^^^^