Mirror of https://github.com/ROCm/ROCm.git (synced 2026-01-21 04:28:01 -05:00)

**Compare commits:** 48 commits in the range `deep-711...deep-frame`. Commit SHA1s (author and date columns were empty in this view):
8d183c2e95, 3a7cfd3958, 833fdf4c95, 28f028d304, a745e45dcb, 8beac1891f,
773f5de407, b297ced032, 2dc22ca890, 85102079ed, ba95e0e689, 1691d369e9,
172b0f7c08, c67fac78bd, e0b8ec4dfb, 38f2d043dc, 3a43bacdda, 48d8fe139b,
7455fe57b8, 52c0a47e84, cbab9a465d, 459283da3c, 1b4f25733d, b287372be5,
78e8baf147, 3e0c8b47e3, c3f0b99cc0, c9d1679486, fdbef17d7b, 6592a41a7f,
65a936023b, 2a64949081, 0a17434517, 2be7e5ac1e, ae80c4a31c, dd89a692e1,
bf74351e5a, f2067767e0, effd4174fb, 453751a86f, fb644412d5, e8fdc34b71,
b4031ef23c, d0bd4e6f03, 0056b9453e, 3d1ad79766, 8683bed11b, 847cd7c423
```diff
@@ -34,6 +34,7 @@ parameters:
   default:
     - cmake
     - libnuma-dev
+    - libsimde-dev
     - mesa-common-dev
     - ninja-build
     - ocl-icd-libopencl1
@@ -39,6 +39,7 @@ parameters:
     - python3
     - python3-dev
     - python3-pip
+    - python3-venv
     - libgtest-dev
     - libboost-filesystem-dev
     - libboost-program-options-dev
@@ -46,6 +47,8 @@ parameters:
   type: object
   default:
     - nanobind>=2.0.0
+    - pytest
+    - pytest-cov
 - name: rocmDependencies
   type: object
   default:
@@ -72,8 +75,10 @@ parameters:
     - { os: ubuntu2204, packageManager: apt }
     - { os: almalinux8, packageManager: dnf }
   testJobs:
-    - { os: ubuntu2204, packageManager: apt, target: gfx942 }
     - { os: ubuntu2204, packageManager: apt, target: gfx90a }
+    # - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
+    # - { os: ubuntu2204, packageManager: apt, target: gfx1151 }
+    # - { os: ubuntu2204, packageManager: apt, target: gfx1201 }
 - name: downstreamComponentMatrix
   type: object
   default:
@@ -116,6 +121,11 @@ jobs:
       parameters:
         dependencyList:
           - gtest
+  - ${{ if ne(job.os, 'almalinux8') }}:
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
+      parameters:
+        dependencyList:
+          - catch2
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
     parameters:
       checkoutRepo: ${{ parameters.checkoutRepo }}
@@ -137,6 +147,7 @@ jobs:
         -DORIGAMI_BUILD_SHARED_LIBS=ON
         -DORIGAMI_ENABLE_PYTHON=ON
         -DORIGAMI_BUILD_TESTING=ON
+        -DORIGAMI_ENABLE_FETCH=ON
         -GNinja
   - ${{ if ne(job.os, 'almalinux8') }}:
     - task: PublishPipelineArtifact@1
@@ -169,7 +180,6 @@ jobs:
   dependsOn: origami_build_${{ job.os }}
   condition:
     and(succeeded(),
-      eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
      not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
      eq(${{ parameters.aggregatePipeline }}, False)
    )
```
```diff
@@ -180,30 +190,30 @@ jobs:
   workspace:
     clean: all
   steps:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-    parameters:
-      checkoutRepo: ${{ parameters.checkoutRepo }}
-      sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
     parameters:
       aptPackages: ${{ parameters.aptPackages }}
      pipModules: ${{ parameters.pipModules }}
      packageManager: ${{ job.packageManager }}
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+    parameters:
+      checkoutRepo: ${{ parameters.checkoutRepo }}
+      sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
+    parameters:
+      dependencyList:
+        - gtest
+  - ${{ if ne(job.os, 'almalinux8') }}:
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
+      parameters:
+        dependencyList:
+          - catch2
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
     parameters:
       preTargetFilter: ${{ parameters.componentName }}
       os: ${{ job.os }}
-  - task: DownloadPipelineArtifact@2
-    displayName: 'Download Build Directory Artifact'
-    inputs:
-      artifact: '${{ parameters.componentName }}_${{ job.os }}_build_dir'
-      path: '$(Agent.BuildDirectory)/s/build'
-  - task: DownloadPipelineArtifact@2
-    displayName: 'Download Python Source Artifact'
-    inputs:
-      artifact: '${{ parameters.componentName }}_${{ job.os }}_python_src'
-      path: '$(Agent.BuildDirectory)/s/python'
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
     parameters:
       checkoutRef: ${{ parameters.checkoutRef }}
```
```diff
@@ -212,25 +222,72 @@ jobs:
         gpuTarget: ${{ job.target }}
         ${{ if parameters.triggerDownstreamJobs }}:
           downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
+  - task: CMake@1
+    displayName: 'Origami Test CMake Configuration'
+    inputs:
+      cmakeArgs: >-
+        -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor
+        -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
+        -DORIGAMI_BUILD_SHARED_LIBS=ON
+        -DORIGAMI_ENABLE_PYTHON=ON
+        -DORIGAMI_BUILD_TESTING=ON
+        -GNinja
+        $(Agent.BuildDirectory)/s
+  - task: Bash@3
+    displayName: 'Build Origami Tests and Python Bindings'
+    inputs:
+      targetType: inline
+      workingDirectory: build
+      script: |
+        cmake --build . --target origami-tests origami_python -- -j$(nproc)
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+  # Run tests using CTest (discovers and runs both C++ and Python tests)
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
     parameters:
       componentName: ${{ parameters.componentName }}
       os: ${{ job.os }}
-      testDir: '$(Agent.BuildDirectory)/rocm/bin'
-      testExecutable: './origami-tests'
-      testParameters: '--yaml origami-tests.yaml --gtest_output=xml:./test_output.xml --gtest_color=yes'
-  - script: |
-      set -e
-      export PYTHONPATH=$(Agent.BuildDirectory)/s/build/python:$PYTHONPATH
-
-      echo "--- Running origami_test.py ---"
-      python3 $(Agent.BuildDirectory)/s/python/origami_test.py
-
-      echo "--- Running origami_grid_test.py ---"
-      python3 $(Agent.BuildDirectory)/s/python/origami_grid_test.py
-    displayName: 'Run Python Binding Tests'
-    condition: succeeded()
+      testDir: 'build'
+      testParameters: '--output-on-failure --force-new-ctest-process --output-junit test_output.xml'
+  # Test pip install workflow
+  # - task: Bash@3
+  #   displayName: 'Test Pip Install'
+  #   inputs:
+  #     targetType: inline
+  #     script: |
+  #       set -e
+
+  #       echo "==================================================================="
+  #       echo "Testing pip install workflow (pip install -e .)"
+  #       echo "==================================================================="
+
+  #       # Set environment variables for pip install CMake build
+  #       export ROCM_PATH=$(Agent.BuildDirectory)/rocm
+  #       export CMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm:$(Agent.BuildDirectory)/vendor
+  #       export CMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
+
+  #       echo "ROCM_PATH: $ROCM_PATH"
+  #       echo "CMAKE_PREFIX_PATH: $CMAKE_PREFIX_PATH"
+  #       echo "CMAKE_CXX_COMPILER: $CMAKE_CXX_COMPILER"
+  #       echo ""
+
+  #       # Install from source directory
+  #       cd "$(Agent.BuildDirectory)/s/python"
+  #       pip install -e .
+
+  #       # Verify import works
+  #       echo ""
+  #       echo "Verifying origami can be imported..."
+  #       python3 -c "import origami; print('✓ Successfully imported origami')"
+
+  #       # Run pytest on installed package
+  #       echo ""
+  #       echo "Running pytest tests..."
+  #       python3 -m pytest tests/ -v -m "not slow" --tb=short
+
+  #       echo ""
+  #       echo "==================================================================="
+  #       echo "Pip install test completed successfully"
+  #       echo "==================================================================="
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
     parameters:
       aptPackages: ${{ parameters.aptPackages }}
```
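For reference, the commented-out verification step above boils down to an import smoke test followed by a pytest run. A minimal standalone sketch (the `origami` module name comes from the pipeline itself; the file name and the PYTHONPATH assumption are illustrative, not part of this diff):

```python
# test_origami_smoke.py -- minimal pytest smoke test mirroring the commented
# pipeline step: confirm the built bindings import before running the suite.
def test_import():
    # Assumes build/python (or an editable install) is on PYTHONPATH,
    # as the removed script step arranged with its export.
    import origami
    assert origami is not None
```

Run with `python3 -m pytest test_origami_smoke.py -v`; the pipeline's CTest invocation then covers both the C++ and Python tests.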
```diff
@@ -30,6 +30,7 @@ parameters:
     - python3-pip
     - protobuf-compiler
     - libprotoc-dev
+    - libopencv-dev
 - name: pipModules
   type: object
   default:
@@ -64,6 +65,7 @@ parameters:
     - MIVisionX
     - rocm_smi_lib
     - rccl
+    - rocAL
     - rocALUTION
     - rocBLAS
     - rocDecode
@@ -103,6 +105,7 @@ parameters:
     - MIVisionX
     - rocm_smi_lib
     - rccl
+    - rocAL
     - rocALUTION
     - rocBLAS
     - rocDecode
```
```diff
@@ -36,7 +36,6 @@ Andrej
 Arb
 Autocast
 autograd
-Backported
 BARs
 BatchNorm
 BLAS
@@ -204,11 +203,9 @@ GenAI
 GenZ
 GitHub
 Gitpod
-hardcoded
 HBM
 HCA
 HGX
-HLO
 HIPCC
 hipDataType
 HIPExtension
@@ -336,7 +333,6 @@ MoEs
 Mooncake
 Mpops
 Multicore
-multihost
 Multithreaded
 mx
 MXFP
@@ -1031,7 +1027,6 @@ uncacheable
 uncorrectable
 underoptimized
 unhandled
-unfused
 uninstallation
 unmapped
 unsqueeze
```
**RELEASE.md** (22 changed lines)
```diff
@@ -270,26 +270,26 @@ The [ROCm examples repository](https://github.com/ROCm/rocm-examples) has been e
 :margin: auto 0 auto auto
 :::{grid}
 :margin: auto 0 auto auto
-* [hipBLASLt](https://github.com/ROCm/rocm-examples/tree/amd-staging/Libraries/hipBLASLt)
-* [hipSPARSE](https://github.com/ROCm/rocm-examples/tree/amd-staging/Libraries/hipSPARSE)
-* [hipSPARSELt](https://github.com/ROCm/rocm-examples/tree/amd-staging/Libraries/hipSPARSELt)
-* [hipTensor](https://github.com/ROCm/rocm-examples/tree/amd-staging/Libraries/hipTensor)
+* [hipBLASLt](https://rocm.docs.amd.com/projects/hipBLASLt/en/latest/)
+* [hipSPARSE](https://rocm.docs.amd.com/projects/hipSPARSE/en/latest/)
+* [hipSPARSELt](https://rocm.docs.amd.com/projects/hipSPARSELt/en/latest/)
+* [hipTensor](https://rocm.docs.amd.com/projects/hipTensor/en/latest/)
 :::
 :::{grid}
 :margin: auto 0 auto auto
-* [rocALUTION](https://github.com/ROCm/rocm-examples/tree/amd-staging/Libraries/rocALUTION)
-* [ROCprofiler-SDK](https://github.com/ROCm/rocm-examples/tree/amd-staging/Libraries/rocProfiler-SDK)
-* [rocWMMA](https://github.com/ROCm/rocm-examples/tree/amd-staging/Libraries/rocWMMA)
+* [rocALUTION](https://rocm.docs.amd.com/projects/rocALUTION/en/latest/)
+* [ROCprofiler-SDK](https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/)
+* [rocWMMA](https://rocm.docs.amd.com/projects/rocWMMA/en/latest/)
 :::
 ::::

 Usage examples are now available for the following performance analysis tools:

-* [ROCm Compute Profiler](https://github.com/ROCm/rocm-examples/tree/amd-staging/Tools/rocprof-compute)
-* [ROCm Systems Profiler](https://github.com/ROCm/rocm-examples/tree/amd-staging/Tools/rocprof-systems)
-* [rocprofv3](https://github.com/ROCm/rocm-examples/tree/amd-staging/Tools/rocprofv3)
+* [ROCm Compute Profiler](https://rocm.docs.amd.com/projects/rocprofiler-compute/en/latest/index.html)
+* [ROCm Systems Profiler](https://rocm.docs.amd.com/projects/rocprofiler-systems/en/latest/index.html)
+* [rocprofv3](https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/how-to/using-rocprofv3.html)

-The complete source code for the [HIP Graph Tutorial](https://github.com/ROCm/rocm-examples/tree/amd-staging/HIP-Doc/Tutorials/graph_api) is also available as part of the ROCm examples.
+The complete source code for the [HIP Graph Tutorial](https://rocm.docs.amd.com/projects/HIP/en/latest/tutorial/graph_api.html) is also available as part of the ROCm examples.

 ### ROCm documentation updates

```
```diff
@@ -269,33 +269,6 @@ For a complete and up-to-date list of JAX public modules (for example, ``jax.num
 JAX API modules are maintained by the JAX project and is subject to change.
 Refer to the official Jax documentation for the most up-to-date information.

-Key features and enhancements for ROCm 7.1
-===============================================================================
-
-- Enabled compilation of multihost HLO runner Python bindings.
-
-- Backported multihost HLO runner bindings and some related changes to
-  :code:`FunctionalHloRunner`.
-
-- Added :code:`requirements_lock_3_12` to enable building for Python 3.12.
-
-- Removed hardcoded NHWC convolution layout for ``fp16`` precision to address the performance drops for ``fp16`` precision on gfx12xx GPUs.
-
-- ROCprofiler-SDK integration:
-
-  - Integrated ROCprofiler-SDK (v3) to XLA to improve profiling of GPU events,
-    support both time-based and step-based profiling.
-
-  - Added unit tests for :code:`rocm_collector` and :code:`rocm_tracer`.
-
-- Added Triton unsupported conversion from ``f8E4M3FNUZ`` to ``fp16`` with
-  rounding mode.
-
-- Introduced :code:`CudnnFusedConvDecomposer` to revert fused convolutions
-  when :code:`ConvAlgorithmPicker` fails to find a fused algorithm, and removed
-  unfused fallback paths from :code:`RocmFusedConvRunner`.
-
 Key features and enhancements for ROCm 7.0
 ===============================================================================

```
```diff
@@ -268,3 +268,6 @@ html_context = {
     "granularity_type" : [('Coarse-grained', 'coarse-grained'), ('Fine-grained', 'fine-grained')],
     "scope_type" : [('Device', 'device'), ('System', 'system')]
 }
+
+# Disable figure and table numbering
+numfig = False
```
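`numfig` is a standard Sphinx configuration value, so the added lines simply switch global figure and table numbering off. For contrast, the enabled form looks like the sketch below (the `numfig_format` values are illustrative, not from this change):

```python
# conf.py -- Sphinx numbering options (sketch)
numfig = False  # what this change sets: no automatic figure/table numbers

# If numbering were desired instead, Sphinx also honors a format map:
# numfig = True
# numfig_format = {"figure": "Fig. %s", "table": "Table %s"}
```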
```diff
@@ -24,7 +24,7 @@ The table below summarizes information about ROCm-enabled deep learning framewor
     - GitHub

   * - :doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`
-    - :doc:`link <rocm-install-on-linux:install/3rd-party/pytorch-install>`
+    - :doc:`Pytorch install <rocm-install-on-linux:install/3rd-party/pytorch-install>`
     -
     - Docker image
     - Wheels package
@@ -35,7 +35,7 @@ The table below summarizes information about ROCm-enabled deep learning framewor
        <a href="https://github.com/ROCm/pytorch"><i class="fab fa-github fa-lg"></i></a>

   * - :doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`
-    - :doc:`link <rocm-install-on-linux:install/3rd-party/tensorflow-install>`
+    - :doc:`TensorFlow install <rocm-install-on-linux:install/3rd-party/tensorflow-install>`
     -
     - Docker image
     - Wheels package
@@ -45,7 +45,7 @@ The table below summarizes information about ROCm-enabled deep learning framewor
        <a href="https://github.com/ROCm/tensorflow-upstream"><i class="fab fa-github fa-lg"></i></a>

   * - :doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`
-    - :doc:`link <rocm-install-on-linux:install/3rd-party/jax-install>`
+    - :doc:`JAX install <rocm-install-on-linux:install/3rd-party/jax-install>`
     -
     - Docker image
     - .. raw:: html
@@ -53,7 +53,7 @@ The table below summarizes information about ROCm-enabled deep learning framewor
        <a href="https://github.com/ROCm/jax"><i class="fab fa-github fa-lg"></i></a>

   * - :doc:`verl <../compatibility/ml-compatibility/verl-compatibility>`
-    - :doc:`link <rocm-install-on-linux:install/3rd-party/verl-install>`
+    - :doc:`verl install <rocm-install-on-linux:install/3rd-party/verl-install>`
     -
     - Docker image
     - .. raw:: html
@@ -61,7 +61,7 @@ The table below summarizes information about ROCm-enabled deep learning framewor
        <a href="https://github.com/ROCm/verl"><i class="fab fa-github fa-lg"></i></a>

   * - :doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>`
-    - :doc:`link <rocm-install-on-linux:install/3rd-party/stanford-megatron-lm-install>`
+    - :doc:`Stanford Megatron-LM install <rocm-install-on-linux:install/3rd-party/stanford-megatron-lm-install>`
     -
     - Docker image
     - .. raw:: html
@@ -69,7 +69,7 @@ The table below summarizes information about ROCm-enabled deep learning framewor
        <a href="https://github.com/ROCm/Stanford-Megatron-LM"><i class="fab fa-github fa-lg"></i></a>

   * - :doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>`
-    - :doc:`link <rocm-install-on-linux:install/3rd-party/dgl-install>`
+    - :doc:`DGL install <rocm-install-on-linux:install/3rd-party/dgl-install>`
     -
     - Docker image
     - .. raw:: html
@@ -77,15 +77,24 @@ The table below summarizes information about ROCm-enabled deep learning framewor
        <a href="https://github.com/ROCm/dgl"><i class="fab fa-github fa-lg"></i></a>

   * - :doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>`
-    - :doc:`link <rocm-install-on-linux:install/3rd-party/megablocks-install>`
+    - :doc:`Megablocks install <rocm-install-on-linux:install/3rd-party/megablocks-install>`
     -
     - Docker image
     - .. raw:: html

        <a href="https://github.com/ROCm/megablocks"><i class="fab fa-github fa-lg"></i></a>

+  * - :doc:`Taichi <../compatibility/ml-compatibility/taichi-compatibility>`
+    - `Taichi install <https://rocm.docs.amd.com/projects/taichi/en/latest/install/taichi-install.html>`__
+    -
+    - Docker image
+    - Wheels package
+    - .. raw:: html
+
+       <a href="https://github.com/ROCm/taichi"><i class="fab fa-github fa-lg"></i></a>
+
   * - :doc:`Ray <../compatibility/ml-compatibility/ray-compatibility>`
-    - :doc:`link <rocm-install-on-linux:install/3rd-party/ray-install>`
+    - :doc:`Ray install <rocm-install-on-linux:install/3rd-party/ray-install>`
     -
     - Docker image
     - Wheels package
@@ -95,7 +104,7 @@ The table below summarizes information about ROCm-enabled deep learning framewor
        <a href="https://github.com/ROCm/ray"><i class="fab fa-github fa-lg"></i></a>

   * - :doc:`llama.cpp <../compatibility/ml-compatibility/llama-cpp-compatibility>`
-    - :doc:`link <rocm-install-on-linux:install/3rd-party/llama-cpp-install>`
+    - :doc:`llama.cpp install <rocm-install-on-linux:install/3rd-party/llama-cpp-install>`
     -
     - Docker image
     - ROCm Base Docker image
@@ -104,7 +113,7 @@ The table below summarizes information about ROCm-enabled deep learning framewor
        <a href="https://github.com/ROCm/llama.cpp"><i class="fab fa-github fa-lg"></i></a>

   * - :doc:`FlashInfer <../compatibility/ml-compatibility/flashinfer-compatibility>`
-    - :doc:`link <rocm-install-on-linux:install/3rd-party/flashinfer-install>`
+    - :doc:`FlashInfer install <rocm-install-on-linux:install/3rd-party/flashinfer-install>`
     -
     - Docker image
     - ROCm Base Docker image
```
```diff
@@ -44,7 +44,7 @@ Setting up the base implementation environment

    .. code-block:: shell

-      amd-smi static --board
+      rocm-smi --showproductname

 #. Check that your GPUs are available to PyTorch.

@@ -65,8 +65,8 @@ Setting up the base implementation environment

    .. tip::

-      During training and inference, you can check the memory usage by running the ``amd-smi`` command in your terminal.
-      This tool helps you see which GPUs are involved.
+      During training and inference, you can check the memory usage by running the ``rocm-smi`` command in your terminal.
+      This tool shows which GPUs are involved.


 .. _fine-tuning-llms-multi-gpu-hugging-face-accelerate:
@@ -91,10 +91,10 @@ Now, it's important to adjust how you load the model. Add the ``device_map`` par

    ...
    base_model_name = "meta-llama/Llama-2-7b-chat-hf"

    # Load base model to GPU memory
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        device_map = "auto",
        trust_remote_code = True)
    ...
@@ -130,7 +130,7 @@ After loading the model in this way, the model is fully ready to use the resourc
 torchtune for fine-tuning and inference
 =============================================

-`torchtune <https://pytorch.org/torchtune/main/>`_ is a PyTorch-native library for easy single and multi-GPU
+`torchtune <https://meta-pytorch.org/torchtune/main/>`_ is a PyTorch-native library for easy single and multi-GPU
 model fine-tuning and inference with LLMs.

 #. Install torchtune using pip.
@@ -139,7 +139,7 @@ model fine-tuning and inference with LLMs.

    # Install torchtune with PyTorch release 2.2.2+
    pip install torchtune

    # To confirm that the package is installed correctly
    tune --help

@@ -148,12 +148,12 @@ model fine-tuning and inference with LLMs.
    .. code-block:: shell

       usage: tune [-h] {download,ls,cp,run,validate} ...

       Welcome to the TorchTune CLI!

       options:
         -h, --help show this help message and exit

       subcommands:
         {download,ls,cp,run,validate}

@@ -194,11 +194,11 @@ model fine-tuning and inference with LLMs.
    apply_lora_to_output: False
    lora_rank: 8
    lora_alpha: 16

    tokenizer:
      _component_: torchtune.models.llama2.llama2_tokenizer
      path: /tmp/Llama-2-7b-hf/tokenizer.model

    # Dataset and sampler
    dataset:
      _component_: torchtune.datasets.alpaca_cleaned_dataset
```
```diff
@@ -44,19 +44,20 @@ Setting up the base implementation environment

    .. code-block:: shell

-      amd-smi static --board
+      rocm-smi --showproductname

    Your output should look like this:

    .. code-block:: shell

-      GPU: 0
-        BOARD:
-          MODEL_NUMBER: 102-G39203-0B
-          PRODUCT_SERIAL: PCB079220-1150
-          FRU_ID: 113-AMDG392030B04-100-300000097H
-          PRODUCT_NAME: AMD Instinct MI325 OAM
-          MANUFACTURER_NAME: AMD
+      ============================ ROCm System Management Interface ============================
+      ====================================== Product Info ======================================
+      GPU[0] : Card Series: AMD Instinct MI300X OAM
+      GPU[0] : Card model: 0x74a1
+      GPU[0] : Card vendor: Advanced Micro Devices, Inc. [AMD/ATI]
+      GPU[0] : Card SKU: MI3SRIOV
+      ==========================================================================================
+      ================================== End of ROCm SMI Log ===================================

 #. Check that your GPUs are available to PyTorch.

```
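The hunk above ends at the step "Check that your GPUs are available to PyTorch." For reference, that check is typically the following (a minimal sketch using stock PyTorch calls; it is not part of this diff):

```python
import torch

# Confirm that ROCm-enabled PyTorch can see the GPUs
# (torch.cuda maps to HIP on ROCm builds).
print(torch.cuda.is_available())      # True when at least one GPU is usable
print(torch.cuda.device_count())      # number of visible GPUs
print(torch.cuda.get_device_name(0))  # e.g. an AMD Instinct accelerator
```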
```diff
@@ -93,13 +94,13 @@ Setting up the base implementation environment
    pip install -r requirements-dev.txt
    cmake -DBNB_ROCM_ARCH="gfx942" -DCOMPUTE_BACKEND=hip -S .
    python setup.py install

    # To leverage the SFTTrainer in TRL for model fine-tuning.
    pip install trl

    # To leverage PEFT for efficiently adapting pre-trained language models .
    pip install peft

    # Install the other dependencies.
    pip install transformers datasets huggingface-hub scipy

@@ -131,7 +132,7 @@ Download the base model and fine-tuning dataset

    .. note::

       You can also use the `NousResearch Llama-2-7b-chat-hf <https://huggingface.co/NousResearch/Llama-2-7b-chat-hf>`_
       as a substitute. It has the same model weights as the original.

 #. Run the following code to load the base model and tokenizer.
@@ -140,14 +141,14 @@ Download the base model and fine-tuning dataset

    # Base model and tokenizer names.
    base_model_name = "meta-llama/Llama-2-7b-chat-hf"

    # Load base model to GPU memory.
    device = "cuda:0"
    base_model = AutoModelForCausalLM.from_pretrained(base_model_name, trust_remote_code = True).to(device)

    # Load tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(
        base_model_name,
        trust_remote_code = True)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
@@ -161,10 +162,10 @@ Download the base model and fine-tuning dataset
    # Dataset for fine-tuning.
    training_dataset_name = "mlabonne/guanaco-llama2-1k"
    training_dataset = load_dataset(training_dataset_name, split = "train")

    # Check the data.
    print(training_dataset)

    # Dataset 11 is a QA sample in English.
    print(training_dataset[11])

```
```diff
@@ -251,8 +252,8 @@ Compare the number of trainable parameters and training time under the two diffe
        dataset_text_field = "text",
        tokenizer = tokenizer,
        args = training_arguments
    )

    # Run the trainer.
    sft_trainer.train()

```
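For readability, here is the full trainer construction these context fragments come from, reconstructed as a sketch. The `model`, `train_dataset`, and `peft_config` arguments are assumptions based on the surrounding tutorial and TRL's `SFTTrainer` API at the time; they are not shown in this hunk:

```python
from trl import SFTTrainer

# Sketch of the LoRA fine-tuning setup the hunk's fragments belong to.
sft_trainer = SFTTrainer(
    model = base_model,                # assumption: model loaded earlier in the doc
    train_dataset = training_dataset,  # assumption: the guanaco-llama2-1k split
    peft_config = peft_parameters,     # assumption: a peft LoraConfig
    dataset_text_field = "text",
    tokenizer = tokenizer,
    args = training_arguments,
)

# Run the trainer.
sft_trainer.train()
```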
```diff
@@ -285,7 +286,7 @@ Compare the number of trainable parameters and training time under the two diffe
        if param.requires_grad:
            trainable_params += param.numel()
    print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}")

    sft_trainer.peft_config = None
    print_trainable_parameters(sft_trainer.model)

```
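The hunk only shows the tail of `print_trainable_parameters`; the complete helper, reconstructed as a sketch from the visible fragments (the loop header and initializations are assumptions; the counting and print lines appear in the hunk), is:

```python
def print_trainable_parameters(model):
    # Count trainable vs. total parameters of a transformers model.
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}")
```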
```diff
@@ -308,8 +309,8 @@ Compare the number of trainable parameters and training time under the two diffe
        dataset_text_field = "text",
        tokenizer = tokenizer,
        args = training_arguments
    )

    # Training.
    trainer_full.train()

@@ -348,7 +349,7 @@ store, and load.

    # PEFT adapter name.
    adapter_name = "llama-2-7b-enhanced-adapter"

    # Save PEFT adapter.
    sft_trainer.model.save_pretrained(adapter_name)

@@ -358,21 +359,21 @@ store, and load.

    # Access adapter directory.
    cd llama-2-7b-enhanced-adapter

    # List all adapter files.
    README.md adapter_config.json adapter_model.safetensors

 .. tab-item:: Saving a fully fine-tuned model
    :sync: without

    If you're not using LoRA and PEFT so there is no PEFT LoRA configuration used for training, use the following code
    to save your fine-tuned model to your system.

    .. code-block:: python

       # Fully fine-tuned model name.
       new_model_name = "llama-2-7b-enhanced"

       # Save the fully fine-tuned model.
       full_trainer.model.save_pretrained(new_model_name)

@@ -382,7 +383,7 @@ store, and load.

    # Access new model directory.
    cd llama-2-7b-enhanced

    # List all model files.
    config.json model-00002-of-00006.safetensors model-00005-of-00006.safetensors
    generation_config.json model-00003-of-00006.safetensors model-00006-of-00006.safetensors
```
```diff
@@ -411,26 +412,26 @@ Let's look at achieving model inference using these types of models.

 .. tab-item:: Inference using PEFT adapters

    To use PEFT adapters like a normal transformer model, you can run the generation by loading a base model along with PEFT
    adapters as follows.

    .. code-block:: python

       from peft import PeftModel
       from transformers import AutoModelForCausalLM

       # Set the path of the model or the name on Hugging face hub
       base_model_name = "meta-llama/Llama-2-7b-chat-hf"

       # Set the path of the adapter
       adapter_name = "Llama-2-7b-enhanced-adpater"

       # Load base model
       base_model = AutoModelForCausalLM.from_pretrained(base_model_name)

       # Adapt the base model with the adapter
       new_model = PeftModel.from_pretrained(base_model, adapter_name)

       # Then, run generation as the same with a normal model outlined in 2.1

 The PEFT library provides a ``merge_and_unload`` method, which merges the adapter layers into the base model. This is
```
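The final comment in that snippet defers generation to an earlier section; spelled out, it looks like the following sketch using standard transformers and PEFT calls (the tokenizer setup and prompt are illustrative assumptions, not part of this hunk):

```python
from transformers import AutoTokenizer

# Generation with the PEFT-adapted model works the same as with a plain model.
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
inputs = tokenizer.encode("What is a large language model?", return_tensors="pt")
outputs = new_model.generate(inputs)
print(tokenizer.decode(outputs[0]))
```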
```diff
@@ -438,13 +439,13 @@ Let's look at achieving model inference using these types of models.

    .. code-block:: python

       # Load base model
       base_model = AutoModelForCausalLM.from_pretrained(base_model_name)

       # Adapt the base model with the adapter
       new_model = PeftModel.from_pretrained(base_model, adapter_name)

       # Merge adapter
       model = model.merge_and_unload()

       # Save the merged model into local
```
```diff
@@ -460,25 +461,25 @@ Let's look at achieving model inference using these types of models.

    # Import relevant class for loading model and tokenizer
    from transformers import AutoTokenizer, AutoModelForCausalLM

    # Set the pre-trained model name on Hugging face hub
    model_name = "meta-llama/Llama-2-7b-chat-hf"

    # Set device type
    device = "cuda:0"

    # Load model and tokenizer
    model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Input prompt encoding
    query = "What is a large language model?"
    inputs = tokenizer.encode(query, return_tensors="pt").to(device)

    # Token generation
    outputs = model.generate(inputs)

    # Outputs decoding
    print(tokenizer.decode(outputs[0]))

 In addition, pipelines from Transformers offer simple APIs to use pre-trained models for different tasks, including
@@ -489,14 +490,14 @@ Let's look at achieving model inference using these types of models.

    # Import relevant class for loading model and tokenizer
    from transformers import pipeline

    # Set the path of your model or the name on Hugging face hub
    model_name_or_path = "meta-llama/Llama-2-7b-chat-hf"

    # Set pipeline
    # A positive device value will run the model on associated CUDA device id
    pipe = pipeline("text-generation", model=model_name_or_path, device=0)

    # Token generation
    print(pipe("What is a large language model?")[0]["generated_text"])

```
```diff
@@ -31,16 +31,16 @@ in the Instinct documentation for more information.
 Hardware verification with ROCm
 -------------------------------

-Use the command ``amd-smi set --perf-determinism 1900`` to set the max clock speed up to 1900 MHz
+Use the command ``rocm-smi --setperfdeterminism 1900`` to set the max clock speed up to 1900 MHz
 instead of the default 2100 MHz. This can reduce the chance of a PCC event lowering the attainable
 GPU clocks. This setting will not be required for new IFWI releases with the production PRC feature.
-You can restore this setting to its default value with the ``amd-smi reset --clocks`` command.
+You can restore this setting to its default value with the ``rocm-smi -r`` command.

 Run the command:

 .. code-block:: shell

-   amd-smi set --perf-determinism 1900
+   rocm-smi --setperfdeterminism 1900

 See `Hardware verfication for ROCm <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#hardware-verification-with-rocm>`_
 in the Instinct documentation for more information.
```
```diff
@@ -108,16 +108,16 @@ for more information.
 Hardware verification with ROCm
 -------------------------------

-Use the command ``amd-smi set --perf-determinism 1900`` to set the max clock speed up to 1900 MHz
+Use the command ``rocm-smi --setperfdeterminism 1900`` to set the max clock speed up to 1900 MHz
 instead of the default 2100 MHz. This can reduce the chance of a PCC event lowering the attainable
 GPU clocks. This setting will not be required for new IFWI releases with the production PRC feature.
-You can restore this setting to its default value with the ``amd-smi reset --clocks`` command.
+You can restore this setting to its default value with the ``rocm-smi -r`` command.

 Run the command:

 .. code-block:: shell

-   amd-smi set --perf-determinism 1900
+   rocm-smi --setperfdeterminism 1900

 See `Hardware verification with ROCm <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#hardware-verification-with-rocm>`_ for more information.

@@ -248,7 +248,7 @@ Download the Docker image and required packages
 Checking out this specific commit is recommended for a stable and reproducible environment.

 .. code-block:: shell

    git checkout bb93ccbfeae6363c67b361a97a27c74ab86e7e92

 Prepare training datasets
```
```diff
@@ -5,7 +5,7 @@
 GPU hardware specifications
 ===========================================

-The following tables provide an overview of the hardware specifications for AMD Instinct™ GPUs, AMD Radeon™ PRO and Radeon™ GPUs, and AMD Ryzen™ APUs.
+The following tables provide an overview of the hardware specifications for AMD Instinct™ GPUs, and AMD Radeon™ PRO and Radeon™ GPUs.

 For more information about ROCm hardware compatibility, see the ROCm `Compatibility matrix <https://rocm.docs.amd.com/en/latest/compatibility/compatibility-matrix.html>`_.

@@ -18,7 +18,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
    :name: instinct-arch-spec-table

    *
-     - Name
+     - Model
      - Architecture
      - LLVM target name
      - VRAM (GiB)
@@ -297,7 +297,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
    :name: radeon-pro-arch-spec-table

    *
-     - Name
+     - Model
      - Architecture
      - LLVM target name

@@ -539,7 +539,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
    :name: radeon-arch-spec-table

    *
-     - Name
+     - Model
      - Architecture
      - LLVM target name
      - VRAM (GiB)
@@ -953,127 +953,6 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
      - 9
      - 0

-   .. tab-item:: AMD Ryzen APUs
-
-      .. list-table::
-         :header-rows: 1
-         :name: ryzen-arch-spec-table
-
-         *
-           - Name
-           - Graphics model
-           - Architecture
-           - LLVM target name
-           - VRAM (GiB)
-           - Compute Units
-           - Wavefront Size
-           - LDS (KiB)
-           - Infinity Cache (MiB)
-           - L2 Cache (MiB)
-           - Graphics L1 Cache (KiB)
-           - L0 Vector Cache (KiB)
-           - L0 Scalar Cache (KiB)
-           - L0 Instruction Cache (KiB)
-           - VGPR File (KiB)
-           - SGPR File (KiB)
-           - GFXIP Major version
-           - GFXIP Minor version
-         *
-           - AMD Ryzen 7 7840U
-           - Radeon 780M
-           - RDNA3
-           - gfx1103
-           - Dynamic + carveout
-           - 12
-           - 32 or 64
-           - 128
-           - N/A
-           - 2
-           - 256
-           - 32
-           - 16
-           - 32
-           - 512
-           - 32
-           - 11
-           - 0
-         *
-           - AMD Ryzen 9 270
-           - Radeon 780M
-           - RDNA3
-           - gfx1103
-           - Dynamic + carveout
-           - 12
-           - 32 or 64
-           - 128
-           - N/A
-           - 2
-           - 256
-           - 32
-           - 16
-           - 32
-           - 512
-           - 32
-           - 11
-           - 0
-         *
-           - AMD Ryzen AI 9 HX 375
-           - Radeon 890M
-           - RDNA3.5
-           - gfx1150
-           - Dynamic + carveout
-           - 16
-           - 32 or 64
-           - 128
-           - N/A
-           - 2
-           - 256
-           - 32
-           - 16
-           - 32
-           - 512
-           - 32
-           - 11
-           - 5
-         *
-           - AMD Ryzen AI Max+ PRO 395
-           - Radeon 8060S
-           - RDNA3.5
-           - gfx1151
-           - Dynamic + carveout
-           - 40
-           - 32 or 64
-           - 128
-           - 32
-           - 2
-           - 256
-           - 32
-           - 16
-           - 32
-           - 768
-           - 32
-           - 11
-           - 5
-         *
-           - AMD Ryzen Al 7 350
-           - Radeon 860M
-           - RDNA3.5
-           - gfx1152
-           - Dynamic + carveout
-           - 8
-           - 32 or 64
-           - 128
-           - N/A
-           - 1
-           - 256
-           - 32
-           - 16
-           - 32
-           - 512
-           - 32
-           - 11
-           - 5

 Glossary
 ========

```
```diff
@@ -29,25 +29,27 @@ subtrees:
   title: Deep learning frameworks
   subtrees:
   - entries:
-    - file: compatibility/ml-compatibility/pytorch-compatibility.rst
+    - file: compatibility/ml-compatibility/pytorch-compatibility
      title: PyTorch compatibility
-    - file: compatibility/ml-compatibility/tensorflow-compatibility.rst
+    - file: compatibility/ml-compatibility/tensorflow-compatibility
      title: TensorFlow compatibility
-    - file: compatibility/ml-compatibility/jax-compatibility.rst
+    - file: compatibility/ml-compatibility/jax-compatibility
      title: JAX compatibility
-    - file: compatibility/ml-compatibility/verl-compatibility.rst
+    - file: compatibility/ml-compatibility/verl-compatibility
      title: verl compatibility
-    - file: compatibility/ml-compatibility/stanford-megatron-lm-compatibility.rst
+    - file: compatibility/ml-compatibility/stanford-megatron-lm-compatibility
      title: Stanford Megatron-LM compatibility
-    - file: compatibility/ml-compatibility/dgl-compatibility.rst
+    - file: compatibility/ml-compatibility/dgl-compatibility
      title: DGL compatibility
-    - file: compatibility/ml-compatibility/megablocks-compatibility.rst
+    - file: compatibility/ml-compatibility/megablocks-compatibility
      title: Megablocks compatibility
-    - file: compatibility/ml-compatibility/ray-compatibility.rst
+    - file: compatibility/ml-compatibility/taichi-compatibility
+      title: Taichi compatibility
+    - file: compatibility/ml-compatibility/ray-compatibility
      title: Ray compatibility
-    - file: compatibility/ml-compatibility/llama-cpp-compatibility.rst
+    - file: compatibility/ml-compatibility/llama-cpp-compatibility
      title: llama.cpp compatibility
-    - file: compatibility/ml-compatibility/flashinfer-compatibility.rst
+    - file: compatibility/ml-compatibility/flashinfer-compatibility
      title: FlashInfer compatibility
    - file: how-to/build-rocm.rst
      title: Build ROCm from source
@@ -75,14 +77,8 @@ subtrees:
   - entries:
     - file: how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst
       title: Train a model with Primus and Megatron-LM
-      entries:
-      - file: how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
-        title: Train a model with Megatron-LM
     - file: how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch.rst
       title: Train a model with Primus and PyTorch
-      entries:
-      - file: how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
-        title: Train a model with PyTorch
     - file: how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
       title: Train a model with JAX MaxText
     - file: how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry
@@ -121,8 +117,6 @@ subtrees:
       title: SGLang inference performance testing
     - file: how-to/rocm-for-ai/inference/benchmark-docker/sglang-distributed.rst
       title: SGLang distributed inference with Mooncake
-    - file: how-to/rocm-for-ai/inference/xdit-diffusion-inference.rst
-      title: xDiT diffusion inference
     - file: how-to/rocm-for-ai/inference/deploy-your-model.rst
       title: Deploy your model

@@ -140,8 +134,6 @@ subtrees:
       title: Profile and debug
     - file: how-to/rocm-for-ai/inference-optimization/workload.rst
       title: Workload optimization
-    - file: how-to/rocm-for-ai/inference-optimization/vllm-optimization.rst
-      title: vLLM V1 performance optimization

     - url: https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/
       title: AI tutorials
@@ -188,7 +180,7 @@ subtrees:
     - file: conceptual/gpu-arch/mi300-mi200-performance-counters.rst
       title: MI300 and MI200 performance counters
     - file: conceptual/gpu-arch/mi350-performance-counters.rst
-      title: MI350 Series performance counters
+      title: MI350 series performance counters
     - file: conceptual/gpu-arch/mi250.md
       title: MI250 microarchitecture
       subtrees:
@@ -222,8 +214,6 @@ subtrees:
       title: ROCm tools, compilers, and runtimes
     - file: reference/gpu-arch-specs.rst
     - file: reference/gpu-atomics-operation.rst
-    - file: reference/env-variables.rst
-      title: Environment variables
     - file: reference/precision-support.rst
       title: Data types and precision support
     - file: reference/graph-safe-support.rst
```
```diff
@@ -123,8 +123,7 @@ Performance

 .. note::

-   `ROCprof Compute Viewer <https://rocm.docs.amd.com/projects/rocprof-compute-viewer/en/amd-mainline/>`_ is a tool for visualizing and analyzing GPU thread trace data collected using :doc:`rocprofv3 <rocprofiler-sdk:index>`.
-   Note that `ROCprof Compute Viewer <https://rocm.docs.amd.com/projects/rocprof-compute-viewer/en/amd-mainline/>`_ is in an early access state. Running production workloads is not recommended.
+   `ROCprof Compute Viewer <https://rocm.docs.amd.com/projects/rocprof-compute-viewer/en/amd-mainline/>`_ is a tool for visualizing and analyzing GPU thread trace data collected using :doc:`rocprofv3 <rocprofiler-sdk:index>`. Note that `ROCprof Compute Viewer <https://rocm.docs.amd.com/projects/rocprof-compute-viewer/en/amd-mainline/>`_ is in an early access state. Running production workloads is not recommended.

 Development
 ^^^^^^^^^^^
```