adding aqlprofile

changing filename
changing revision
2026-01-11 07:38:17 -05:00 · 2025-09-12 03:29:38 -07:00 · 2025-09-12 00:10:51 -07:00 · 2025-09-12 00:04:26 -07:00 · 2025-09-11 06:47:12 -07:00 · 2025-09-11 05:20:51 -07:00
25 changed files with 1580 additions and 350 deletions
--- a/.azuredevops/components/hipBLASLt.yml
+++ b/.azuredevops/components/hipBLASLt.yml
@@ -178,7 +178,7 @@ jobs:
          mkdir -p $(Agent.BuildDirectory)/temp-deps
          cd $(Agent.BuildDirectory)/temp-deps
          # position-independent LAPACK is required for almalinux8 builds
-          cmake -DBUILD_GTEST=OFF -DBUILD_LAPACK=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON $(Agent.BuildDirectory)/s/deps
+          cmake -DBUILD_GTEST=OFF -DBUILD_LAPACK=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON $(Agent.BuildDirectory)/sparse/projects/hipblaslt/deps
          make -j
          sudo make install
    - script: |
@@ -197,6 +197,8 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
+        cmakeSourceDir: $(Agent.BuildDirectory)/sparse/projects/hipblaslt
+        cmakeBuildDir: $(Agent.BuildDirectory)/sparse/projects/hipblaslt/build
        extraBuildFlags: >-
          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor
          -DCMAKE_INCLUDE_PATH=$(Agent.BuildDirectory)/rocm/llvm/include
--- a/.azuredevops/components/origami.yml
+++ b/.azuredevops/components/origami.yml
@@ -0,0 +1,236 @@
+parameters:  # template inputs for the origami component pipeline
+- name: componentName  # used for artifact names, manifest/upload steps and the DISABLED_*_TESTS lookup
+  type: string
+  default: origami
+- name: checkoutRepo  # forwarded to steps/checkout.yml and to downstream component templates
+  type: string
+  default: 'self'
+- name: checkoutRef  # forwarded to steps/dependencies-rocm.yml
+  type: string
+  default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir  # forwarded to steps/checkout.yml and steps/manifest.yml
+  type: string
+  default: ''
+- name: triggerDownstreamJobs  # when true, the downstream component templates at the end of this file are included
+  type: boolean
+  default: false
+- name: downstreamAggregateNames  # this file appends '+<componentName>' to it when calling downstream templates
+  type: string
+  default: ''
+- name: buildDependsOn  # job-name prefixes; each becomes '<prefix>_<os>' in the build job's dependsOn
+  type: object
+  default: null
+- name: unifiedBuild  # when true, the test jobs in this file are not generated
+  type: boolean
+  default: false
+# set to true if doing full build of ROCm stack
+# and dependencies are pulled from same pipeline
+- name: aggregatePipeline  # passed to dependencies-rocm.yml in the build job; test jobs require it to be False
+  type: boolean
+  default: false
+- name: aptPackages  # passed to steps/dependencies-other.yml and steps/docker-container.yml
+  type: object
+  default:
+    - cmake
+    - git
+    - ninja-build
+    - wget
+    - python3
+    - python3-dev
+    - python3-pip
+- name: pipModules  # passed to steps/dependencies-other.yml and steps/docker-container.yml
+  type: object
+  default:
+    - nanobind>=2.0.0
+- name: rocmDependencies  # dependencyList for the build job's dependencies-rocm.yml step
+  type: object
+  default:
+    - clr
+    - llvm-project
+    - rocm-cmake
+    - rocminfo
+    - ROCR-Runtime
+    - rocprofiler-register
+- name: rocmTestDependencies  # dependencyList for the test job's dependencies-rocm.yml step
+  type: object
+  default:
+    - clr
+    - llvm-project
+    - rocm-cmake
+    - rocminfo
+    - ROCR-Runtime
+    - rocprofiler-register
+
+- name: jobMatrix  # one build job per buildJobs entry, one test job per testJobs entry
+  type: object
+  default:
+    buildJobs:
+      - { os: ubuntu2204, packageManager: apt }
+      - { os: almalinux8, packageManager: dnf }
+    testJobs:
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
+- name: downstreamComponentMatrix  # iterated at the end of this file when triggerDownstreamJobs is true
+  type: object
+  default:
+    - hipBLASLt:  # NOTE(review): this key has no value; the keys below are its siblings in the same mapping, not children
+      name: hipBLASLt  # selects /.azuredevops/components/<name>.yml
+      sparseCheckoutDir: projects/hipblaslt
+      skipUnifiedBuild: 'false'  # string, compared against 'true' in the downstream include condition
+      buildDependsOn:
+        - origami_build  # prefix of this file's build job names (origami_build_<os>)
+
+jobs:
+- ${{ each job in parameters.jobMatrix.buildJobs }}:  # one build job per buildJobs entry
+  - job: origami_build_${{ job.os }}
+    ${{ if parameters.buildDependsOn }}:  # dependsOn is emitted only when upstream build prefixes were supplied
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.os }}
+    variables:
+    - group: common
+    - template: /.azuredevops/variables-global.yml
+    - name: ROCM_PATH
+      value: $(Agent.BuildDirectory)/rocm
+    pool:
+      vmImage: ${{ variables.BASE_BUILD_POOL }}
+    ${{ if eq(job.os, 'almalinux8') }}:  # almalinux8 builds run inside the manylinux container
+      container:
+        image: rocmexternalcicd.azurecr.io/manylinux228:latest
+        endpoint: ContainerService3
+    workspace:
+      clean: all
+    steps:
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+        pipModules: ${{ parameters.pipModules }}
+        packageManager: ${{ job.packageManager }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+      parameters:
+        checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+      parameters:
+        checkoutRef: ${{ parameters.checkoutRef }}
+        dependencyList: ${{ parameters.rocmDependencies }}
+        os: ${{ job.os }}
+        aggregatePipeline: ${{ parameters.aggregatePipeline }}
+        ${{ if parameters.triggerDownstreamJobs }}:  # forwarded only when downstream jobs are being triggered
+          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+      parameters:
+        os: ${{ job.os }}
+        extraBuildFlags: >-
+          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
+          -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
+          -DORIGAMI_BUILD_SHARED_LIBS=ON
+          -DORIGAMI_ENABLE_PYTHON=ON
+          -DORIGAMI_BUILD_TESTING=ON
+          -GNinja
+    - ${{ if ne(job.os, 'almalinux8') }}:  # build-dir and python artifacts are published for non-almalinux8 jobs only
+      - task: PublishPipelineArtifact@1
+        displayName: 'Publish Build Directory Artifact'
+        inputs:
+          targetPath: '$(Agent.BuildDirectory)/s/build'  # NOTE(review): assumes the build tree is under s/build even when sparseCheckoutDir is set - confirm
+          artifact: '${{ parameters.componentName }}_${{ job.os }}_build_dir'  # downloaded under the same name by the test job
+          publishLocation: 'pipeline'
+      - task: PublishPipelineArtifact@1
+        displayName: 'Publish Python Source Artifact'
+        inputs:
+          targetPath: '$(Agent.BuildDirectory)/s/python'
+          artifact: '${{ parameters.componentName }}_${{ job.os }}_python_src'  # downloaded under the same name by the test job
+          publishLocation: 'pipeline'
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
+      parameters:
+        componentName: ${{ parameters.componentName }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
+        os: ${{ job.os }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
+      parameters:
+        os: ${{ job.os }}
+        componentName: ${{ parameters.componentName }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
+
+- ${{ if eq(parameters.unifiedBuild, False) }}:  # test jobs are generated only when unifiedBuild is false
+  - ${{ each job in parameters.jobMatrix.testJobs }}:  # one test job per testJobs entry (os + GPU target)
+    - job: origami_test_${{ job.os }}_${{ job.target }}
+      timeoutInMinutes: 120
+      dependsOn: origami_build_${{ job.os }}  # must match the build job name generated from buildJobs
+      condition:  # build succeeded, tests enabled for this target, component not listed as disabled, not an aggregate pipeline
+        and(succeeded(),
+          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
+          eq(${{ parameters.aggregatePipeline }}, False)
+        )
+      variables:
+      - group: common
+      - template: /.azuredevops/variables-global.yml
+      pool: ${{ job.target }}_test_pool
+      workspace:
+        clean: all
+      steps:
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+        parameters:
+          checkoutRepo: ${{ parameters.checkoutRepo }}
+          sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          pipModules: ${{ parameters.pipModules }}
+          packageManager: ${{ job.packageManager }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+        parameters:
+          preTargetFilter: ${{ parameters.componentName }}
+          os: ${{ job.os }}
+      - task: DownloadPipelineArtifact@2
+        displayName: 'Download Build Directory Artifact'
+        inputs:
+          artifact: '${{ parameters.componentName }}_${{ job.os }}_build_dir'  # published by the build job
+          path: '$(Agent.BuildDirectory)/s/build'
+      - task: DownloadPipelineArtifact@2
+        displayName: 'Download Python Source Artifact'
+        inputs:
+          artifact: '${{ parameters.componentName }}_${{ job.os }}_python_src'  # published by the build job
+          path: '$(Agent.BuildDirectory)/s/python'
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+        parameters:
+          checkoutRef: ${{ parameters.checkoutRef }}
+          dependencyList: ${{ parameters.rocmTestDependencies }}
+          os: ${{ job.os }}
+          gpuTarget: ${{ job.target }}
+          ${{ if parameters.triggerDownstreamJobs }}:  # forwarded only when downstream jobs are being triggered
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+      - script: |
+          set -e  # stop at the first failing test; without it only the last command's exit status decides the step result
+          export PYTHONPATH=$(Agent.BuildDirectory)/s/build/python:$PYTHONPATH
+          echo "--- Running origami_test.py ---"
+          python3 $(Agent.BuildDirectory)/s/python/origami_test.py
+
+          echo "--- Running origami_grid_test.py ---"
+          python3 $(Agent.BuildDirectory)/s/python/origami_grid_test.py
+        displayName: 'Run Python Binding Tests'
+        condition: succeeded()
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          pipModules: ${{ parameters.pipModules }}
+          environment: test
+          gpuTarget: ${{ job.target }}
+
+- ${{ if parameters.triggerDownstreamJobs }}:  # downstream component templates are included only on request
+  - ${{ each component in parameters.downstreamComponentMatrix }}:
+    - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:  # in a unified build, skip entries whose skipUnifiedBuild is 'true'
+      - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
+        parameters:
+          checkoutRepo: ${{ parameters.checkoutRepo }}
+          sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
+          buildDependsOn: ${{ component.buildDependsOn }}
+          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}  # append this component to the incoming chain
+          triggerDownstreamJobs: true
+          unifiedBuild: ${{ parameters.unifiedBuild }}
--- a/.azuredevops/components/rocDecode.yml
+++ b/.azuredevops/components/rocDecode.yml
@@ -8,6 +8,9 @@ parameters:
 - name: checkoutRef
  type: string
  default: ''
+- name: rocPyDecodeRepo
+  type: string
+  default: rocpydecode_repo
 # monorepo related parameters
 - name: sparseCheckoutDir
  type: string
@@ -207,7 +210,7 @@ jobs:
    - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
      - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
        parameters:
-          checkoutRepo: ${{ parameters.checkoutRepo }}
+          checkoutRepo: ${{ parameters.rocPyDecodeRepo }}
          sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
          buildDependsOn: ${{ component.buildDependsOn }}
          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
--- a/.azuredevops/components/rocm-core.yml
+++ b/.azuredevops/components/rocm-core.yml
@@ -1,10 +1,29 @@
 parameters:
+- name: componentName
+  type: string
+  default: rocm-core
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir
+  type: string
+  default: ''
+- name: triggerDownstreamJobs
+  type: boolean
+  default: false
+- name: downstreamAggregateNames
+  type: string
+  default: ''
+- name: buildDependsOn
+  type: object
+  default: null
+- name: unifiedBuild
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -27,6 +46,10 @@ parameters:
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: rocm_core_${{ job.os }}
+    ${{ if parameters.buildDependsOn }}:
+        dependsOn:
+          - ${{ each build in parameters.buildDependsOn }}:
+            - ${{ build }}_${{ job.os }}
    pool:
      ${{ if eq(job.os, 'ubuntu2404') }}:
        vmImage: 'ubuntu-24.04'
@@ -50,8 +73,10 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
        useAmdclang: false
        extraBuildFlags: >-
@@ -65,9 +90,12 @@ jobs:
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
--- a/.azuredevops/components/rocm_smi_lib.yml
+++ b/.azuredevops/components/rocm_smi_lib.yml
@@ -1,10 +1,29 @@
 parameters:
+- name: componentName
+  type: string
+  default: rocm-smi-lib
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir
+  type: string
+  default: ''
+- name: triggerDownstreamJobs
+  type: boolean
+  default: false
+- name: downstreamAggregateNames
+  type: string
+  default: ''
+- name: buildDependsOn
+  type: object
+  default: null
+- name: unifiedBuild
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -32,6 +51,10 @@ parameters:
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: rocm_smi_lib_build_${{ job.os }}
+    ${{ if parameters.buildDependsOn }}:
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.os }}
    pool:
      ${{ if eq(job.os, 'ubuntu2404') }}:
        vmImage: 'ubuntu-24.04'
@@ -55,8 +78,10 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
        useAmdclang: false
        extraBuildFlags: >-
@@ -65,51 +90,56 @@ jobs:
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
    #   parameters:
    #     aptPackages: ${{ parameters.aptPackages }}

- ${{ each job in parameters.jobMatrix.testJobs }}:
-  - job: rocm_smi_lib_test_${{ job.os }}_${{ job.target }}
-    dependsOn: rocm_smi_lib_build_${{ job.os }}
-    condition:
-      and(succeeded(),
-        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
-        eq(${{ parameters.aggregatePipeline }}, False)
-      )
-    variables:
-    - group: common
-    - template: /.azuredevops/variables-global.yml
-    pool: ${{ job.target }}_test_pool
-    workspace:
-      clean: all
-    steps:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        packageManager: ${{ job.packageManager }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-      parameters:
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-      parameters:
-        runRocminfo: false
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-      parameters:
-        componentName: rocm_smi_lib
-        testDir: '$(Agent.BuildDirectory)'
-        testExecutable: 'sudo ./rocm/share/rocm_smi/rsmitst_tests/rsmitst'
-        testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        environment: test
-        gpuTarget: ${{ job.target }}
+- ${{ if eq(parameters.unifiedBuild, False) }}:  # test jobs are generated only when unifiedBuild is false
+  - ${{ each job in parameters.jobMatrix.testJobs }}:  # one test job per testJobs entry
+    - job: rocm_smi_lib_test_${{ job.os }}_${{ job.target }}
+      dependsOn: rocm_smi_lib_build_${{ job.os }}
+      condition:  # build succeeded, tests enabled for this target, component not listed as disabled, not an aggregate pipeline
+        and(succeeded(),
+          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
+          eq(${{ parameters.aggregatePipeline }}, False)
+        )
+      variables:
+      - group: common
+      - template: /.azuredevops/variables-global.yml
+      pool: ${{ job.target }}_test_pool
+      workspace:
+        clean: all
+      steps:
+      - checkout: none  # no source checkout in this job; the steps below work from downloaded artifacts
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          packageManager: ${{ job.packageManager }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+        parameters:
+          os: ${{ job.os }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+        parameters:
+          runRocminfo: false
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+        parameters:
+          componentName: ${{ parameters.componentName }}
+          testDir: '$(Agent.BuildDirectory)'
+          testExecutable: 'sudo ./rocm/share/rocm_smi/rsmitst_tests/rsmitst'  # path is relative to testDir; the binary is run under sudo
+          testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
+          os: ${{ job.os }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          environment: test
+          gpuTarget: ${{ job.target }}
--- a/.azuredevops/components/rocminfo.yml
+++ b/.azuredevops/components/rocminfo.yml
@@ -1,10 +1,29 @@
 parameters:
+- name: componentName
+  type: string
+  default: rocminfo
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir
+  type: string
+  default: ''
+- name: triggerDownstreamJobs
+  type: boolean
+  default: false
+- name: downstreamAggregateNames
+  type: string
+  default: ''
+- name: buildDependsOn
+  type: object
+  default: null
+- name: unifiedBuild
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -40,7 +59,11 @@ parameters:

 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: rocminfo_build_${{ job.os }}
+  - job: ${{ parameters.componentName }}_build_${{ job.os }}
+    ${{ if parameters.buildDependsOn }}:
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.os }}
    pool:
      vmImage: 'ubuntu-22.04'
    ${{ if eq(job.os, 'almalinux8') }}:
@@ -62,14 +85,18 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
        os: ${{ job.os }}
+        ${{ if parameters.triggerDownstreamJobs }}:
+          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
        useAmdclang: false
        extraBuildFlags: >-
@@ -78,65 +105,71 @@ jobs:
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml

- ${{ each job in parameters.jobMatrix.testJobs }}:
-  - job: rocminfo_test_${{ job.target }}
-    dependsOn: rocminfo_build_${{ job.os }}
-    condition:
-      and(succeeded(),
-        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
-        eq(${{ parameters.aggregatePipeline }}, False)
-      )
-    variables:
-    - group: common
-    - template: /.azuredevops/variables-global.yml
-    pool: ${{ job.target }}_test_pool
-    workspace:
-      clean: all
-    steps:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        packageManager: ${{ job.packageManager }}
-        registerROCmPackages: true
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-      parameters:
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-      parameters:
-        checkoutRef: ${{ parameters.checkoutRef }}
-        dependencyList: ${{ parameters.rocmTestDependencies }}
-        gpuTarget: ${{ job.target }}
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-      parameters:
-        runRocminfo: false
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-      parameters:
-        componentName: rocminfo
-        testDir: '$(Agent.BuildDirectory)'
-        testExecutable: './rocm/bin/rocminfo'
-        testParameters: ''
-        testPublishResults: false
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-      parameters:
-        componentName: rocm_agent_enumerator
-        testDir: '$(Agent.BuildDirectory)'
-        testExecutable: './rocm/bin/rocm_agent_enumerator'
-        testParameters: ''
-        testPublishResults: false
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        registerROCmPackages: true
-        environment: test
-        gpuTarget: ${{ job.target }}
+- ${{ if eq(parameters.unifiedBuild, False) }}:  # test jobs are generated only when unifiedBuild is false
+  - ${{ each job in parameters.jobMatrix.testJobs }}:  # one test job per testJobs entry
+    - job: rocminfo_test_${{ job.target }}
+      dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}  # build job is named from componentName, so the dependency must be too
+      condition:  # build succeeded, tests enabled for this target, component not listed as disabled, not an aggregate pipeline
+        and(succeeded(),
+          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
+          eq(${{ parameters.aggregatePipeline }}, False)
+        )
+      variables:
+      - group: common
+      - template: /.azuredevops/variables-global.yml
+      pool: ${{ job.target }}_test_pool
+      workspace:
+        clean: all
+      steps:
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          packageManager: ${{ job.packageManager }}
+          registerROCmPackages: true
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+        parameters:
+          os: ${{ job.os }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+        parameters:
+          checkoutRef: ${{ parameters.checkoutRef }}
+          dependencyList: ${{ parameters.rocmTestDependencies }}
+          gpuTarget: ${{ job.target }}
+          os: ${{ job.os }}
+          ${{ if parameters.triggerDownstreamJobs }}:  # forwarded only when downstream jobs are being triggered
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+        parameters:
+          runRocminfo: false
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+        parameters:
+          componentName: ${{ parameters.componentName }}
+          testDir: '$(Agent.BuildDirectory)'
+          testExecutable: './rocm/bin/rocminfo'  # path is relative to testDir
+          testParameters: ''
+          testPublishResults: false
+          os: ${{ job.os }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+        parameters:
+          componentName: rocm_agent_enumerator  # second binary exercised by this job, so it keeps its own fixed name
+          testDir: '$(Agent.BuildDirectory)'
+          testExecutable: './rocm/bin/rocm_agent_enumerator'
+          testParameters: ''
+          testPublishResults: false
+          os: ${{ job.os }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          registerROCmPackages: true
+          environment: test
+          gpuTarget: ${{ job.target }}
--- a/.azuredevops/components/rocprofiler-compute.yml
+++ b/.azuredevops/components/rocprofiler-compute.yml
@@ -55,6 +55,7 @@ parameters:
    - pymongo
    - pyyaml
    - setuptools
+    - sqlalchemy
    - tabulate
    - textual
    - textual_plotext
--- a/.azuredevops/templates/steps/dependencies-rocm.yml
+++ b/.azuredevops/templates/steps/dependencies-rocm.yml
@@ -171,16 +171,16 @@ parameters:
      developBranch: develop
      hasGpuTarget: false
    rocm-core:
-      pipelineId: 103
-      developBranch: master
+      pipelineId: 349
+      developBranch: develop
      hasGpuTarget: false
    rocm-examples:
      pipelineId: 216
      developBranch: amd-staging
      hasGpuTarget: true
    rocminfo:
-      pipelineId: 91
-      developBranch: amd-staging
+      pipelineId: 356
+      developBranch: develop
      hasGpuTarget: false
    rocMLIR:
      pipelineId: 229
@@ -251,8 +251,8 @@ parameters:
      developBranch: develop
      hasGpuTarget: true
    roctracer:
-      pipelineId: 141
-      developBranch: amd-staging
+      pipelineId: 331
+      developBranch: develop
      hasGpuTarget: true
    rocWMMA:
      pipelineId: 109
--- a/.wordlist.txt
+++ b/.wordlist.txt
@@ -156,6 +156,7 @@ GEMMs
 GFLOPS
 GFortran
 GFXIP
+GGUF
 Gemma
 GiB
 GIM
@@ -293,6 +294,7 @@ Multicore
 Multithreaded
 MyEnvironment
 MyST
+NANOO
 NBIO
 NBIOs
 NCCL
@@ -500,6 +502,7 @@ Unhandled
 VALU
 VBIOS
 VCN
+verl's
 VGPR
 VGPRs
 VM
@@ -742,6 +745,7 @@ logits
 lossy
 macOS
 matchers
+maxtext
 megatron
 microarchitecture
 migraphx
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -57,9 +57,8 @@ ROCm documentation continues to be updated to provide clearer and more comprehen
 
    For more information about the changes, see [Changelog for the AI Developer Hub](https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/changelog.html).

-* ROCm provides a comprehensive ecosystem for deep learning development. For more details, see [Deep learning frameworks for ROCm](https://rocm.docs.amd.com/en/docs-6.4.3/how-to/deep-learning-rocm.html). AMD ROCm adds support for the following deep learning frameworks:
+* ROCm provides a comprehensive ecosystem for deep learning development. For more details, see [Deep learning frameworks for ROCm](https://rocm.docs.amd.com/en/docs-6.4.3/how-to/deep-learning-rocm.html). AMD ROCm adds support for the following deep learning framework:

-    * Taichi is an open-source, imperative, and parallel programming language designed for high-performance numerical computation. Embedded in Python, it leverages just-in-time (JIT) compilation frameworks such as LLVM to accelerate compute-intensive Python code by compiling it to native GPU or CPU instructions. It is currently supported on ROCm 6.3.2. For more information, see [Taichi compatibility](https://rocm.docs.amd.com/en/docs-6.4.3/compatibility/ml-compatibility/taichi-compatibility.html).
    * Megablocks is a light-weight library for mixture-of-experts (MoE) training. The core of the system is efficient "dropless-MoE" and standard MoE layers. Megablocks is integrated with Megatron-LM, where data and pipeline parallel training of MoEs is supported. It is currently supported on ROCm 6.3.0. For more information, see [Megablocks compatibility](https://rocm.docs.amd.com/en/docs-6.4.3/compatibility/ml-compatibility/megablocks-compatibility.html).

 * The [Data types and precision support](https://rocm.docs.amd.com/en/latest/reference/precision-support.html) topic now includes new hardware and library support information.
--- a/default.xml
+++ b/default.xml
@@ -1,12 +1,12 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <manifest>
    <remote name="rocm-org" fetch="https://github.com/ROCm/" />
-    <default revision="refs/tags/rocm-6.4.3"
+    <default revision="refs/tags/20250912-42"
     remote="rocm-org"
     sync-c="true"
     sync-j="4" />
 <!--list of projects for ROCm-->
-    <project name="ROCK-Kernel-Driver" />
+    <project name="aqlprofile" />
    <project name="ROCR-Runtime" />
    <project name="amdsmi" />
    <project name="rdc" />
@@ -37,36 +37,37 @@
    <project name="rocr_debug_agent" />
 <!-- ROCm Libraries -->
    <project groups="mathlibs" name="AMDMIGraphX" />
-    <project groups="mathlibs" name="MIOpen" />
    <project groups="mathlibs" name="MIVisionX" />
    <project groups="mathlibs" name="ROCmValidationSuite" />
-    <project groups="mathlibs" name="Tensile" />
    <project groups="mathlibs" name="composable_kernel" />
-    <project groups="mathlibs" name="hipBLAS-common" />
-    <project groups="mathlibs" name="hipBLAS" />
-    <project groups="mathlibs" name="hipBLASLt" />
-    <project groups="mathlibs" name="hipCUB" />
-    <project groups="mathlibs" name="hipFFT" />
-    <project groups="mathlibs" name="hipRAND" />
    <project groups="mathlibs" name="hipSOLVER" />
-    <project groups="mathlibs" name="hipSPARSE" />
-    <project groups="mathlibs" name="hipSPARSELt" />
    <project groups="mathlibs" name="hipTensor" />
    <project groups="mathlibs" name="hipfort" />
    <project groups="mathlibs" name="rccl" />
    <project groups="mathlibs" name="rocAL" />
    <project groups="mathlibs" name="rocALUTION" />
-    <project groups="mathlibs" name="rocBLAS" />
    <project groups="mathlibs" name="rocDecode" />
    <project groups="mathlibs" name="rocJPEG" />
+    <project groups="mathlibs" name="rocm-libraries">
+        <linkfile src="projects/hipcub" dest="hipCUB"/>
+        <linkfile src="projects/rocprim" dest="rocPRIM"/>
+        <linkfile src="projects/hiprand" dest="hipRAND"/>
+        <linkfile src="projects/rocrand" dest="rocRAND"/>
+        <linkfile src="projects/rocthrust" dest="rocThrust"/>
+        <linkfile src="projects/hipblas-common" dest="hipBLAS-common"/>
+        <linkfile src="projects/hipblaslt" dest="hipBLASLt"/>
+        <linkfile src="projects/rocblas" dest="rocBLAS"/>
+        <linkfile src="projects/hipsparselt" dest="hipSPARSELt"/>
+        <linkfile src="projects/rocsparse" dest="rocSPARSE"/>
+        <linkfile src="projects/hipsparse" dest="hipSPARSE"/>
+        <linkfile src="projects/hipblas" dest="hipBLAS"/>
+        <linkfile src="projects/miopen" dest="MIOpen"/>
+        <linkfile src="projects/hipfft" dest="hipFFT"/>
+        <linkfile src="projects/rocfft" dest="rocFFT"/>
+    </project>
    <project groups="mathlibs" name="rocPyDecode" />
-    <project groups="mathlibs" name="rocFFT" />
-    <project groups="mathlibs" name="rocPRIM" />
-    <project groups="mathlibs" name="rocRAND" />
    <project groups="mathlibs" name="rocSHMEM" />
    <project groups="mathlibs" name="rocSOLVER" />
-    <project groups="mathlibs" name="rocSPARSE" />
-    <project groups="mathlibs" name="rocThrust" />
    <project groups="mathlibs" name="rocWMMA" />
    <project groups="mathlibs" name="rocm-cmake" />
    <project groups="mathlibs" name="rpp" />
--- a/docs/compatibility/compatibility-matrix-historical-6.0.csv
+++ b/docs/compatibility/compatibility-matrix-historical-6.0.csv
@@ -35,6 +35,8 @@ ROCm Version,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6
      :doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat]_,N/A,N/A,N/A,2.4.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,
      :doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>` [#megablocks_compat]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.7.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
      :doc:`Taichi <../compatibility/ml-compatibility/taichi-compatibility>` [#taichi_compat]_,N/A,N/A,N/A,N/A,N/A,1.8.0b1,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`Ray <../compatibility/ml-compatibility/ray-compatibility>` [#ray_compat]_,N/A,N/A,2.48.0.post0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`llama.cpp <../compatibility/ml-compatibility/llama-cpp-compatibility>` [#llama-cpp_compat]_,N/A,N/A,N/A,b5997,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
      `ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.2,1.2,1.2,1.2,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.14.1,1.14.1
 ,,,,,,,,,,,,,,,,,,
      ,,,,,,,,,,,,,,,,,,
--- a/docs/compatibility/compatibility-matrix.rst
+++ b/docs/compatibility/compatibility-matrix.rst
@@ -246,6 +246,8 @@ Expand for full historical view of:
   .. [#dgl_compat] DGL is only supported on ROCm 6.4.0.
   .. [#megablocks_compat] Megablocks is only supported on ROCm 6.3.0.
   .. [#taichi_compat] Taichi is only supported on ROCm 6.3.2.
+   .. [#ray_compat] Ray is only supported on ROCm 6.4.1.
+   .. [#llama-cpp_compat] llama.cpp is only supported on ROCm 6.4.0.
   .. [#kfd_support-past-60] As of ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The tested user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and kernel-space support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
   .. [#ROCT-rocr-past-60] Starting from ROCm 6.3.0, the ROCT Thunk Interface is included as part of the ROCr runtime package.
   
--- a/docs/compatibility/ml-compatibility/llama-cpp-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/llama-cpp-compatibility.rst
@@ -0,0 +1,156 @@
+:orphan:
+
+.. meta::
+    :description: llama.cpp deep learning framework compatibility
+    :keywords: GPU, GGML, llama.cpp compatibility
+
+.. version-set:: rocm_version latest
+
+********************************************************************************
+llama.cpp compatibility
+********************************************************************************
+
+`llama.cpp <https://github.com/ggml-org/llama.cpp>`__ is an open-source framework 
+for Large Language Model (LLM) inference that runs on both central processing units 
+(CPUs) and graphics processing units (GPUs). It is written in plain C/C++, providing 
+a simple, dependency-free setup. 
+
+The framework supports multiple quantization options, from 1.5-bit to 8-bit integers, 
+to speed up inference and reduce memory usage. Originally built as a CPU-first library, 
+llama.cpp is easy to integrate with other programming environments and is widely 
+adopted across diverse platforms, including consumer devices. 
+
+ROCm support for llama.cpp is upstreamed, and you can build the official source code
+with ROCm support:
+
+- ROCm support for llama.cpp is hosted in the official `https://github.com/ROCm/llama.cpp 
+  <https://github.com/ROCm/llama.cpp>`_ repository.
+
+- Due to independent compatibility considerations, this location differs from the 
+  `https://github.com/ggml-org/llama.cpp <https://github.com/ggml-org/llama.cpp>`_ upstream repository.
+
+- To install llama.cpp, use the prebuilt :ref:`Docker image <llama-cpp-docker-compat>`, 
+  which includes ROCm, llama.cpp, and all required dependencies.
+
+  - See the :doc:`ROCm llama.cpp installation guide <rocm-install-on-linux:install/3rd-party/llama-cpp-install>` 
+    to install and get started.
+
+  - See the `Installation guide <https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md#hip>`__ 
+    in the upstream llama.cpp documentation.
+
+.. note::
+
+  llama.cpp is supported on ROCm 6.4.0.
+
+Supported devices
+================================================================================
+
+**Officially Supported**: AMD Instinct™ MI300X, MI210
+
+
+Use cases and recommendations
+================================================================================
+
+llama.cpp can be applied in a variety of scenarios, particularly when you need to meet one or more of the following requirements:
+
+- Plain C/C++ implementation with no external dependencies
+- Support for 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory usage
+- Custom HIP (Heterogeneous-compute Interface for Portability) kernels for running large language models (LLMs) on AMD GPUs (graphics processing units)
+- CPU (central processing unit) + GPU (graphics processing unit) hybrid inference for partially accelerating models larger than the total available VRAM (video random-access memory)
+
+llama.cpp is also used in a range of real-world applications, including:
+
+- Games such as `Lucy's Labyrinth <https://github.com/MorganRO8/Lucys_Labyrinth>`__:
+  A simple maze game where AI-controlled agents attempt to trick the player.
+- Tools such as `Styled Lines <https://marketplace.unity.com/packages/tools/ai-ml-integration/style-text-webgl-ios-stand-alone-llm-llama-cpp-wrapper-292902>`__:
+  A proprietary, asynchronous inference wrapper for Unity3D game development, including pre-built mobile and web platform wrappers and a model example.
+- Various other AI applications use llama.cpp as their inference engine;  
+  for a detailed list, see the `user interfaces (UIs) section <https://github.com/ggml-org/llama.cpp?tab=readme-ov-file#description>`__.
+
+For more use cases and recommendations, refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__, 
+where you can search for llama.cpp examples and best practices to optimize your workloads on AMD GPUs.
+
+- The `Llama.cpp Meets Instinct: A New Era of Open-Source AI Acceleration <https://rocm.blogs.amd.com/ecosystems-and-partners/llama-cpp/README.html>`__
+  blog post outlines how the open-source llama.cpp framework enables efficient LLM inference—including interactive inference with ``llama-cli``, 
+  server deployment with ``llama-server``, GGUF model preparation and quantization, performance benchmarking, and optimizations tailored for 
+  AMD Instinct GPUs within the ROCm ecosystem. 
+
+.. _llama-cpp-docker-compat:
+
+Docker image compatibility
+================================================================================
+
+.. |docker-icon| raw:: html
+
+   <i class="fab fa-docker"></i>
+
+AMD validates and publishes `ROCm llama.cpp Docker images <https://hub.docker.com/r/rocm/llama.cpp>`__
+with ROCm backends on Docker Hub. The following Docker image tags and associated
+inventories were tested on `ROCm 6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__.
+Click |docker-icon| to view the image on Docker Hub.
+
+.. important::
+
+   Tag endings of ``_full``, ``_server``, and ``_light`` serve different purposes for entrypoints as follows:
+
+   - Full: This image includes both the main executable file and the tools to convert ``LLaMA`` models into ``ggml`` format and quantize them to 4 bits.
+   - Server: This image only includes the server executable file.
+   - Light: This image only includes the main executable file.
+
+.. list-table::
+    :header-rows: 1
+    :class: docker-image-compatibility
+
+    * - Full Docker
+      - Server Docker
+      - Light Docker
+      - llama.cpp
+      - Ubuntu
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b5997_rocm6.4.0_ubuntu24.04_full/images/sha256-f78f6c81ab2f8e957469415fe2370a1334fe969c381d1fe46050c85effaee9d5"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
+      - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b5997_rocm6.4.0_ubuntu24.04_server/images/sha256-275ad9e18f292c26a00a2de840c37917e98737a88a3520bdc35fd3fc5c9a6a9b"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
+      - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b5997_rocm6.4.0_ubuntu24.04_light/images/sha256-cc324e6faeedf0e400011f07b49d2dc41a16bae257b2b7befa0f4e2e97231320"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
+      - `b5997 <https://github.com/ROCm/llama.cpp/tree/release/b5997>`__
+      - 24.04
+
+Key ROCm libraries for llama.cpp
+================================================================================
+
+llama.cpp functionality on ROCm is determined by its underlying library
+dependencies. These ROCm components affect the capabilities, performance, and
+feature set available to developers.
+
+.. list-table::
+    :header-rows: 1
+
+    * - ROCm library
+      - Version
+      - Purpose
+      - Usage
+    * - `hipBLAS <https://github.com/ROCm/hipBLAS>`__
+      - :version-ref:`hipBLAS rocm_version`
+      - Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
+        matrix and vector operations.
+      - Supports operations such as matrix multiplication, matrix-vector
+        products, and tensor contractions. Utilized in both dense and batched
+        linear algebra operations.
+    * - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
+      - :version-ref:`hipBLASLt rocm_version`
+      - hipBLASLt is an extension of the hipBLAS library, providing additional
+        features like epilogues fused into the matrix multiplication kernel or
+        use of integer tensor cores.
+      - By setting the flag ``ROCBLAS_USE_HIPBLASLT``, you can dispatch hipBLASLt
+        kernels where possible.
+    * - `rocWMMA <https://github.com/ROCm/rocWMMA>`__
+      - :version-ref:`rocWMMA rocm_version`
+      - Accelerates warp-level matrix-multiply and matrix-accumulate to speed up matrix
+        multiplication (GEMM) and accumulation operations with mixed precision
+        support.
+      - Can be used to enhance flash attention performance on AMD GPUs by enabling
+        the ``GGML_HIP_ROCWMMA_FATTN`` flag at compile time.
--- a/docs/compatibility/ml-compatibility/ray-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/ray-compatibility.rst
@@ -0,0 +1,111 @@
+:orphan:
+
+.. meta::
+    :description: Ray deep learning framework compatibility
+    :keywords: GPU, Ray compatibility
+
+.. version-set:: rocm_version latest
+
+*******************************************************************************
+Ray compatibility
+*******************************************************************************
+
+Ray is a unified framework for scaling AI and Python applications from your laptop 
+to a full cluster, without changing your code. Ray consists of `a core distributed 
+runtime <https://docs.ray.io/en/latest/ray-core/walkthrough.html>`_ and a set of 
+`AI libraries <https://docs.ray.io/en/latest/ray-air/getting-started.html>`_ for 
+simplifying machine learning computations.
+
+Ray is a general-purpose framework that runs many types of workloads efficiently. 
+Any Python application can be scaled with Ray, without extra infrastructure.
+
+ROCm support for Ray is upstreamed, and you can build the official source code
+with ROCm support: 
+
+- ROCm support for Ray is hosted in the official `https://github.com/ROCm/ray 
+  <https://github.com/ROCm/ray>`_ repository.
+
+- Due to independent compatibility considerations, this location differs from the 
+  `https://github.com/ray-project/ray <https://github.com/ray-project/ray>`_ upstream repository.
+
+- To install Ray, use the prebuilt :ref:`Docker image <ray-docker-compat>` 
+  which includes ROCm, Ray, and all required dependencies.
+
+  - See the :doc:`ROCm Ray installation guide <rocm-install-on-linux:install/3rd-party/ray-install>` 
+    for instructions to get started.
+
+  - See the `Installation section <https://docs.ray.io/en/latest/ray-overview/installation.html>`_ 
+    in the upstream Ray documentation.
+
+  - The Docker image provided is based on the upstream Ray `Daily Release (Nightly) wheels <https://docs.ray.io/en/latest/ray-overview/installation.html#daily-releases-nightlies>`__ 
+    corresponding to commit `005c372 <https://github.com/ray-project/ray/commit/005c372262e050d5745f475e22e64305fa07f8b8>`__.
+
+.. note::
+
+  Ray is supported on ROCm 6.4.1.
+
+Supported devices
+================================================================================
+
+**Officially Supported**: AMD Instinct™ MI300X, MI210
+
+
+Use cases and recommendations
+================================================================================
+
+* The `Reinforcement Learning from Human Feedback on AMD GPUs with verl and ROCm 
+  Integration <https://rocm.blogs.amd.com/artificial-intelligence/verl-large-scale/README.html>`__  
+  blog provides an overview of Volcano Engine Reinforcement Learning (verl) 
+  for large language models (LLMs) and discusses its benefits in large-scale 
+  reinforcement learning from human feedback (RLHF). It uses Ray as part of a 
+  hybrid orchestration engine to schedule and coordinate training and inference 
+  tasks in parallel, enabling optimized resource utilization and potential overlap 
+  between these phases. This dynamic resource allocation strategy significantly 
+  improves overall system efficiency. The blog presents verl’s performance results, 
+  focusing on throughput and convergence accuracy achieved on AMD Instinct™ MI300X 
+  GPUs. Follow this guide to get started with verl on AMD Instinct GPUs and 
+  accelerate your RLHF training with ROCm-optimized performance.
+
+* The `Exploring Use Cases for Scalable AI: Implementing Ray with ROCm Support for Efficient ML Workflows 
+  <https://rocm.blogs.amd.com/artificial-intelligence/rocm-ray/README.html>`__
+  blog post describes key use cases such as training and inference for large language models (LLMs), 
+  model serving, hyperparameter tuning, reinforcement learning, and the orchestration of large-scale 
+  workloads using Ray in the ROCm environment.
+
+For more use cases and recommendations, see the AMD GPU tabs in the `Accelerator Support 
+topic <https://docs.ray.io/en/latest/ray-core/scheduling/accelerators.html#accelerator-support>`__ 
+of the Ray core documentation and refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__, 
+where you can search for Ray examples and best practices to optimize your workloads on AMD GPUs.
+
+.. _ray-docker-compat:
+
+Docker image compatibility
+================================================================================
+
+.. |docker-icon| raw:: html
+
+   <i class="fab fa-docker"></i>
+
+AMD validates and publishes ready-made `ROCm Ray Docker images <https://hub.docker.com/r/rocm/ray/tags>`__
+with ROCm backends on Docker Hub. The following Docker image tags and
+associated inventories represent the latest Ray version from the official Docker Hub and are validated for
+`ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`_. Click the |docker-icon|
+icon to view the image on Docker Hub.
+
+.. list-table::
+    :header-rows: 1
+    :class: docker-image-compatibility
+
+    * - Docker image
+      - Ray
+      - PyTorch
+      - Ubuntu
+      - Python
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/ray/ray-2.48.0.post0_rocm6.4.1_ubuntu24.04_py3.12_pytorch2.6.0/images/sha256-0d166fe6bdced38338c78eedfb96eff92655fb797da3478a62dd636365133cc0"><i class="fab fa-docker fa-lg"></i> rocm/ray</a>
+      - `2.48.0.post0 <https://github.com/ROCm/ray/tree/release/2.48.0.post0>`_
+      - 2.6.0+git684f6f2
+      - 24.04
+      - `3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -108,6 +108,8 @@ article_pages = [
    {"file": "compatibility/ml-compatibility/dgl-compatibility", "os": ["linux"]},
    {"file": "compatibility/ml-compatibility/megablocks-compatibility", "os": ["linux"]},
    {"file": "compatibility/ml-compatibility/taichi-compatibility", "os": ["linux"]},
+    {"file": "compatibility/ml-compatibility/ray-compatibility", "os": ["linux"]},
+    {"file": "compatibility/ml-compatibility/llama-cpp-compatibility", "os": ["linux"]},
    {"file": "how-to/deep-learning-rocm", "os": ["linux"]},

    {"file": "how-to/rocm-for-ai/index", "os": ["linux"]},
--- a/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
@@ -78,7 +78,11 @@ vllm_benchmark:
        model_repo: Qwen/QwQ-32B
        url: https://huggingface.co/Qwen/QwQ-32B
        precision: float16
-        tunableop: true
+      - model: Qwen3 30B A3B
+        mad_tag: pyt_vllm_qwen3-30b-a3b
+        model_repo: Qwen/Qwen3-30B-A3B
+        url: https://huggingface.co/Qwen/Qwen3-30B-A3B
+        precision: float16
    - group: Microsoft Phi
      tag: phi
      models:
--- a/docs/data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
@@ -0,0 +1,72 @@
+dockers:
+  - pull_tag: rocm/jax-training:maxtext-v25.7
+    docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025
+    components:
+      ROCm: 6.4.1
+      JAX: 0.5.0
+      Python: 3.10.12
+      Transformer Engine: 2.1.0+90d703dd
+      hipBLASLt: 1.x.x
+  - pull_tag: rocm/jax-training:maxtext-v25.7-jax060
+    docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025
+    components:
+      ROCm: 6.4.1
+      JAX: 0.6.0
+      Python: 3.10.12
+      Transformer Engine: 2.1.0+90d703dd
+      hipBLASLt: 1.1.0-499ece1c21
+model_groups:
+  - group: Meta Llama
+    tag: llama
+    models:
+      - model: Llama 3.3 70B
+        mad_tag: jax_maxtext_train_llama-3.3-70b
+        model_repo: Llama-3.3-70B
+        precision: bf16
+        doc_options: ["single-node"]
+      - model: Llama 3.1 8B
+        mad_tag: jax_maxtext_train_llama-3.1-8b
+        model_repo: Llama-3.1-8B
+        precision: bf16
+        doc_options: ["single-node"]
+      - model: Llama 3.1 70B
+        mad_tag: jax_maxtext_train_llama-3.1-70b
+        model_repo: Llama-3.1-70B
+        precision: bf16
+        doc_options: ["single-node"]
+      - model: Llama 3 8B
+        mad_tag: jax_maxtext_train_llama-3-8b
+        multinode_training_script: llama3_8b_multinode.sh
+        doc_options: ["multi-node"]
+      - model: Llama 3 70B
+        mad_tag: jax_maxtext_train_llama-3-70b
+        multinode_training_script: llama3_70b_multinode.sh
+        doc_options: ["multi-node"]
+      - model: Llama 2 7B
+        mad_tag: jax_maxtext_train_llama-2-7b
+        model_repo: Llama-2-7B
+        precision: bf16
+        multinode_training_script: llama2_7b_multinode.sh
+        doc_options: ["single-node", "multi-node"]
+      - model: Llama 2 70B
+        mad_tag: jax_maxtext_train_llama-2-70b
+        model_repo: Llama-2-70B
+        precision: bf16
+        multinode_training_script: llama2_70b_multinode.sh
+        doc_options: ["single-node", "multi-node"]
+  - group: DeepSeek
+    tag: deepseek
+    models:
+      - model: DeepSeek-V2-Lite (16B)
+        mad_tag: jax_maxtext_train_deepseek-v2-lite-16b
+        model_repo: DeepSeek-V2-lite
+        precision: bf16
+        doc_options: ["single-node"]
+  - group: Mistral AI
+    tag: mistral
+    models:
+      - model: Mixtral 8x7B
+        mad_tag: jax_maxtext_train_mixtral-8x7b
+        model_repo: Mixtral-8x7B
+        precision: bf16
+        doc_options: ["single-node"]
--- a/docs/how-to/deep-learning-rocm.rst
+++ b/docs/how-to/deep-learning-rocm.rst
@@ -110,6 +110,28 @@ The table below summarizes information about ROCm-enabled deep learning framewor

          <a href="https://github.com/ROCm/taichi"><i class="fab fa-github fa-lg"></i></a>

+    * - `Ray <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/ray-compatibility.html>`__
+      - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/ray-install.html"><i class="fas fa-link fa-lg"></i></a>
+      - 
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/ray-install.html#using-a-prebuilt-docker-image-with-ray-pre-installed>`__
+        - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/ray-install.html#install-ray-on-bare-metal-or-a-custom-container>`__
+        - `ROCm Base Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/ray-install.html#build-your-own-docker-image>`__
+      - .. raw:: html
+
+          <a href="https://github.com/ROCm/ray"><i class="fab fa-github fa-lg"></i></a>
+
+    * - `llama.cpp <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/llama-cpp-compatibility.html>`__
+      - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/llama-cpp-install.html"><i class="fas fa-link fa-lg"></i></a>
+      - 
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/llama-cpp-install.html#use-a-prebuilt-docker-image-with-llama-cpp-pre-installed>`__
+      - .. raw:: html
+
+          <a href="https://github.com/ROCm/llama.cpp"><i class="fab fa-github fa-lg"></i></a>
+
 Learn how to use your ROCm deep learning environment for training, fine-tuning, inference, and performance optimization
 through the following guides.

--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
@@ -2,9 +2,9 @@
   :description: How to train a model using JAX MaxText for ROCm.
   :keywords: ROCm, AI, LLM, train, jax, torch, Llama, flux, tutorial, docker

-**************************************
-Training a model with MaxText for ROCm
-**************************************
+******************************************
+Training a model with JAX MaxText for ROCm
+******************************************

 MaxText is a high-performance, open-source framework built on the Google JAX
 machine learning library to train LLMs at scale. The MaxText framework for
@@ -12,70 +12,108 @@ ROCm is an optimized fork of the upstream
 `<https://github.com/AI-Hypercomputer/maxtext>`__ enabling efficient AI workloads
 on AMD MI300X series accelerators.

-The MaxText for ROCm training Docker (``rocm/jax-training:maxtext-v25.5``) image
+The MaxText for ROCm training Docker image
 provides a prebuilt environment for training on AMD Instinct MI300X and MI325X accelerators,
 including essential components like JAX, XLA, ROCm libraries, and MaxText utilities.
 It includes the following software components:

-+--------------------------+--------------------------------+
-| Software component       | Version                        |
-+==========================+================================+
-| ROCm                     | 6.3.4                          |
-+--------------------------+--------------------------------+
-| JAX                      | 0.4.35                         |
-+--------------------------+--------------------------------+
-| Python                   | 3.10.12                        |
-+--------------------------+--------------------------------+
-| Transformer Engine       | 1.12.0.dev0+b8b92dc            |
-+--------------------------+--------------------------------+
-| hipBLASLt                | 0.13.0-ae9c477a                |
-+--------------------------+--------------------------------+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml

-Supported features and models
-=============================
+   {% set dockers = data.dockers %}
+   .. tab-set::

-MaxText provides the following key features to train large language models efficiently:
+      {% for docker in dockers %}
+      {% set jax_version = docker.components["JAX"] %}
+
+      .. tab-item:: JAX {{ jax_version }}
+         :sync: {{ docker.pull_tag }}
+
+         .. list-table::
+            :header-rows: 1
+
+            * - Software component
+              - Version
+
+            {% for component_name, component_version in docker.components.items() %}
+            * - {{ component_name }}
+              - {{ component_version }}
+
+            {% endfor %}
+         {% if jax_version == "0.6.0" %}
+         .. note::
+
+            Shardy is a new config in JAX 0.6.0. You might get related errors if it's
+            not configured correctly. For now you can turn it off by setting
+            ``shardy=False`` during the training run. You can also follow the `migration
+            guide <https://docs.jax.dev/en/latest/shardy_jax_migration.html>`__ to enable
+            it.
+
+            The provided multi-node training scripts in this documentation are
+            not currently supported with JAX 0.6.0. For multi-node training, use the JAX 0.5.0
+            Docker image.
+         {% endif %}
+
+      {% endfor %}
+
+MaxText on ROCm provides the following key features to train large language models efficiently:

 - Transformer Engine (TE)

- Flash Attention (FA) 3
+- Flash Attention (FA) 3 -- with or without sequence input packing

 - GEMM tuning

 - Multi-node support

-.. _amd-maxtext-model-support:
+- NANOO FP8 quantization support

-The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
+.. _amd-maxtext-model-support-v257:

-* Llama 3.3 70B
+Supported models
+================

-* Llama 3.1 8B
+The following models are pre-optimized for performance on AMD Instinct MI300
+series accelerators. Some instructions, commands, and available training
+configurations in this documentation might vary by model -- select one to get
+started.

-* Llama 3.1 70B
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml

-* Llama 3 8B
+   {% set model_groups = data.model_groups %}
+   .. raw:: html

-* Llama 3 70B
+         <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
+           <div class="row">
+             <div class="col-2 me-2 model-param-head">Model</div>
+             <div class="row col-10">
+      {% for model_group in model_groups %}
+               <div class="col-4 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+      {% endfor %}
+             </div>
+           </div>

-* Llama 2 7B
-
-* Llama 2 70B
-
-* DeepSeek-V2-Lite
+           <div class="row mt-1">
+             <div class="col-2 me-2 model-param-head">Model variant</div>
+             <div class="row col-10">
+      {% for model_group in model_groups %}
+         {% set models = model_group.models %}
+         {% for model in models %}
+            {% if models|length % 3 == 0 %}
+               <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+            {% else %}
+               <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+            {% endif %}
+         {% endfor %}
+      {% endfor %}
+             </div>
+           </div>
+         </div>

 .. note::

   Some models, such as Llama 3, require an external license agreement through
   a third party (for example, Meta).

-Unsupported features
--------------------
-
-Currently, MaxText's default packed input format is not supported. Using this format
-with the current Docker image results in incorrect attention calculations
-across different input sequences. Support for packed input format is planned for a future release.
-
 System validation
 =================

@@ -98,14 +136,14 @@ This Docker image is optimized for specific model configurations outlined
 as follows. Performance can vary for other training workloads, as AMD
 doesn’t validate configurations and run conditions outside those described.

-.. _amd-maxtext-multi-node-setup:
+.. _amd-maxtext-multi-node-setup-v257:

 Multi-node setup
 ----------------

 For multi-node environments, ensure you have all the necessary packages for
 your network device, such as, RDMA. If you're not using a multi-node setup
-with RDMA, skip ahead to :ref:`amd-maxtext-download-docker`.
+with RDMA, skip ahead to :ref:`amd-maxtext-get-started-v257`.

 1. Install the following packages to build and install the RDMA driver.

@@ -180,196 +218,203 @@ with RDMA, skip ahead to :ref:`amd-maxtext-download-docker`.
         # If using Mellanox NIC
         export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_8,mlx5_9

-.. _amd-maxtext-download-docker:
+.. _amd-maxtext-get-started-v257:

-Pull the Docker image
---------------------
+Benchmarking
+============

-1. Use the following command to pull the Docker image from Docker Hub.
+Once the setup is complete, choose between two options to reproduce the
+benchmark results:

-   .. code-block:: shell
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml

-      docker pull rocm/jax-training:maxtext-v25.5
+   .. _amd-maxtext-benchmark-mad-v257:

-2. Use the following command to launch the Docker container. Note that the benchmarking scripts
-   used in the :ref:`following section <amd-maxtext-get-started>` automatically launch the Docker container
-   and execute the benchmark.
+   {% set dockers = data.dockers %}
+   {% set model_groups = data.model_groups %}
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}

-   .. code-block:: shell
+   .. container:: model-doc {{model.mad_tag}}

-      docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME/.ssh:/root/.ssh --shm-size 128G --name maxtext_training rocm/jax-training:maxtext-v25.5
+      .. tab-set::

-.. _amd-maxtext-get-started:
+         {% if model.mad_tag and "single-node" in model.doc_options %}
+         .. tab-item:: MAD-integrated benchmarking

-Getting started
+            1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
+               directory and install the required packages on the host machine.
+
+               .. code-block:: shell
+
+                  git clone https://github.com/ROCm/MAD
+                  cd MAD
+                  pip install -r requirements.txt
+
+            2. Use this command to run the performance benchmark test on the {{ model.model }} model
+               using one GPU with the :literal:`{{model.precision}}` data type on the host machine.
+
+               .. code-block:: shell
+
+                  export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+                  madengine run \
+                      --tags {{model.mad_tag}} \
+                      --keep-model-dir \
+                      --live-output \
+                      --timeout 28800
+
+            MAD launches a Docker container with the name
+            ``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
+            model are collected in the following path: ``~/MAD/perf.csv``.
+         {% endif %}
+
+         .. tab-item:: Standalone benchmarking
+
+            .. rubric:: Download the Docker image and required scripts
+
+            Run the JAX MaxText benchmark tool independently by starting the
+            Docker container as shown in the following snippet.
+
+            .. tab-set::
+               {% for docker in dockers %}
+               {% set jax_version = docker.components["JAX"] %}
+
+               .. tab-item:: JAX {{ jax_version }}
+                  :sync: {{ docker.pull_tag }}
+
+                  .. code-block:: shell
+
+                     docker pull {{ docker.pull_tag }}
+               {% endfor %}
+
+            {% if model.model_repo and "single-node" in model.doc_options %}
+            .. rubric:: Single node training
+
+            1. Set up environment variables.
+
+               .. code-block:: shell
+
+                  export MAD_SECRETS_HFTOKEN=<Your Hugging Face token>
+                  export HF_HOME=<Location of saved/cached Hugging Face models>
+
+               ``MAD_SECRETS_HFTOKEN`` is your Hugging Face access token to access models, tokenizers, and data.
+               See `User access tokens <https://huggingface.co/docs/hub/en/security-tokens>`__.
+
+               ``HF_HOME`` is where ``huggingface_hub`` will store local data. See `huggingface_hub CLI <https://huggingface.co/docs/huggingface_hub/main/en/guides/cli#huggingface-cli-download>`__.
+               If you already have downloaded or cached Hugging Face artifacts, set this variable to that path.
+               Downloaded files typically get cached to ``~/.cache/huggingface``.
+
+            2. Launch the Docker container.
+
+               .. tab-set::
+                  {% for docker in dockers %}
+                  {% set jax_version = docker.components["JAX"] %}
+
+                  .. tab-item:: JAX {{ jax_version }}
+                     :sync: {{ docker.pull_tag }}
+
+                     .. code-block:: shell
+
+                        docker run -it \
+                            --device=/dev/dri \
+                            --device=/dev/kfd \
+                            --network host \
+                            --ipc host \
+                            --group-add video \
+                            --cap-add=SYS_PTRACE \
+                            --security-opt seccomp=unconfined \
+                            --privileged \
+                            -v $HOME:$HOME \
+                            -v $HOME/.ssh:/root/.ssh \
+                            -v $HF_HOME:/hf_cache \
+                            -e HF_HOME=/hf_cache \
+                            -e MAD_SECRETS_HFTOKEN=$MAD_SECRETS_HFTOKEN \
+                            --shm-size 64G \
+                            --name training_env \
+                            {{ docker.pull_tag }}
+                  {% endfor %}
+
+            3. In the Docker container, clone the ROCm MAD repository and navigate to the
+               benchmark scripts directory at ``MAD/scripts/jax-maxtext``.
+
+               .. code-block:: shell
+
+                  git clone https://github.com/ROCm/MAD
+                  cd MAD/scripts/jax-maxtext
+
+            4. Run the setup scripts to install libraries and datasets needed
+               for benchmarking.
+
+               .. code-block:: shell
+
+                  ./jax-maxtext_benchmark_setup.sh -m {{ model.model_repo }}
+
+            5. To run the training benchmark without quantization, use the following command:
+
+               .. code-block:: shell
+
+                  ./jax-maxtext_benchmark_report.sh -m {{ model.model_repo }}
+
+               For quantized training, use the following command:
+
+               .. code-block:: shell
+
+                  ./jax-maxtext_benchmark_report.sh -m {{ model.model_repo }} -q nanoo_fp8
+
+               .. important::
+
+                  Quantized training is not supported with the JAX 0.6.0 Docker image; support
+                  will be added in a future release. For quantized training, use the JAX 0.5.0
+                  Docker image: ``rocm/jax-training:maxtext-v25.7``.
+
+            {% endif %}
+            {% if model.multinode_training_script and "multi-node" in model.doc_options %}
+            .. rubric:: Multi-node training
+
+            The following examples use SLURM to run on multiple nodes.
+
+            .. note::
+
+               The following scripts will launch the Docker container and run the
+               benchmark. Run them outside of any Docker container.
+
+            1. Make sure ``$HF_HOME`` is set before running the test. See
+               `ROCm benchmarking <https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/readme.md>`__
+               for more details on downloading the Llama models before running the
+               benchmark.
+
+            2. To run multi-node training for {{ model.model }},
+               use the
+               `multi-node training script <https://github.com/ROCm/MAD/blob/develop/scripts/jax-maxtext/gpu-rocm/{{ model.multinode_training_script }}>`__
+               under the ``scripts/jax-maxtext/gpu-rocm/`` directory.
+
+            3. Run the multi-node training benchmark script.
+
+               .. code-block:: shell
+
+                  sbatch -N <num_nodes> {{ model.multinode_training_script }}
+
+         {% else %}
+            .. rubric:: Multi-node training
+
+            For multi-node training examples, choose a model from :ref:`amd-maxtext-model-support-v257`
+            with an available `multi-node training script <https://github.com/ROCm/MAD/tree/develop/scripts/jax-maxtext/gpu-rocm>`__.
+         {% endif %}
+      {% endfor %}
+   {% endfor %}
+
+Further reading
 ===============

-The following examples demonstrate how to get started with single node
-and multi-node training using the benchmarking scripts provided at
-`<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/>`__.
+- See the ROCm/maxtext benchmarking README at `<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/readme.md>`__.

-.. important::
+- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.

-   The provided scripts launch a Docker container and execute a benchmark. Ensure you run these commands outside of any existing Docker container.
+- To learn more about system settings and management practices to configure your system for
+  AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

-Before running any benchmarks, ensure the ``$HF_HOME`` environment variable is
-set correctly and points to your Hugging Face cache directory. Refer to the
-README at `<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/>`__
-for more detailed instructions.
-
-Single node training benchmarking examples
------------------------------------------
-
-* Example 1: Single node training with Llama 2 7B
-
-  Download the benchmarking script:
-
-  .. code-block:: shell
-
-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_7b.sh
-
-  Run the single node training benchmark:
-
-  .. code-block:: shell
-
-     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama2_7b.sh
-
-* Example 2: Single node training with Llama 2 70B
-
-  Download the benchmarking script:
-
-  .. code-block:: shell
-
-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_70b.sh
-
-  Run the single node training benchmark:
-
-  .. code-block:: shell
-
-     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama2_70b.sh
-
-* Example 3: Single node training with Llama 3 8B
-
-  Download the benchmarking script:
-
-  .. code-block:: shell
-
-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_8b.sh
-
-  Run the single node training benchmark:
-
-  .. code-block:: shell
-
-     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3_8b.sh
-
-* Example 4: Single node training with Llama 3 70B
-
-  Download the benchmarking script:
-
-  .. code-block:: shell
-
-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_70b.sh
-
-  Run the single node training benchmark:
-
-  .. code-block:: shell
-
-     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3_70b.sh
-
-* Example 5: Single node training with Llama 3.3 70B
-
-  Download the benchmarking script:
-
-  .. code-block:: shell
-
-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3.3_70b.sh
-
-  Run the single node training benchmark:
-
-  .. code-block:: shell
-
-     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3.3_70b.sh
-
-* Example 6: Single node training with DeepSeek V2 16B
-
-  Download the benchmarking script:
-
-  .. code-block:: shell
-
-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/deepseek_v2_16b.sh
-
-  Run the single node training benchmark:
-
-  .. code-block:: shell
-
-     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./deepseek_v2_16b.sh
-
-  .. note::
-
-     The reported TFLOP/s by MaxText for DeepSeek is not accurate. Use
-     the tokens/s as a performance indicator.
-
-Multi-node training benchmarking examples
-----------------------------------------
-
-The following examples use SLURM for running on multiple nodes -- the commands might need to be adjusted for your
-own cluster setup.
-
-* Example 1: Multi-node training with Llama 2 7B
-
-  Download the benchmarking script:
-
-  .. code-block:: shell
-
-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_7b_multinode.sh
-
-  Run the multi-node training benchmark. For example:
-
-  .. code-block:: shell
-
-     sbatch -N <num_nodes> llama2_7b_multinode.sh
-
-* Example 2: Multi-node training with Llama 2 70B
-
-  Download the benchmarking script:
-
-  .. code-block:: shell
-
-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_70b_multinode.sh
-
-  Run the multi-node training benchmark. For example:
-
-  .. code-block:: shell
-
-     sbatch -N <num_nodes> llama2_70b_multinode.sh
-
-* Example 3: Multi-node training with Llama 3 8B model
-
-  Download the benchmarking script:
-
-  .. code-block:: shell
-
-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_8b_multinode.sh
-
-  Run the multi-node training benchmark. For example:
-
-  .. code-block:: shell
-
-     sbatch -N <num_nodes> llama3_8b_multinode.sh
-
-* Example 4: Multi-node training with Llama 3 70B model
-
-  Download the benchmarking script:
-
-  .. code-block:: shell
-
-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_70b_multinode.sh
-
-  Run the multi-node training benchmark. For example:
-
-  .. code-block:: shell
-
-     sbatch -N <num_nodes> llama3_70b_multinode.sh
+- For a list of other ready-made Docker images for AI with ROCm, see
+  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.

 Previous versions
 =================
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history.rst
@@ -17,12 +17,21 @@ previous releases of the ``ROCm/jax-training`` Docker image on `Docker Hub <http
     - Components
     - Resources

-   * - 25.5 (latest)
+   * - 25.7 (latest)
+     - 
+       * ROCm 6.4.1
+       * JAX 0.6.0, 0.5.0
+     - 
+       * :doc:`Documentation <../jax-maxtext>`
+       * `Docker Hub (JAX 0.6.0) <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7-jax060/images/sha256-7352212ae033a76dca2b9dceffc23c1b5f1a61a7a560082cf747a9bf1acfc9ce>`__
+       * `Docker Hub (JAX 0.5.0) <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025>`__
+
+   * - 25.5
     - 
       * ROCm 6.3.4
       * JAX 0.4.35
     - 
-       * :doc:`Documentation <../jax-maxtext>`
+       * :doc:`Documentation <jax-maxtext-v25.5>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.5/images/sha256-4e0516358a227cae8f552fb866ec07e2edcf244756f02e7b40212abfbab5217b>`__

   * - 25.4
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.4.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.4.rst
@@ -51,7 +51,7 @@ MaxText provides the following key features to train large language models effic

 - Multi-node support

-.. _amd-maxtext-model-support:
+.. _amd-maxtext-model-support-v254:

 The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.

--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.5.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.5.rst
@@ -0,0 +1,385 @@
+:orphan:
+
+.. meta::
+   :description: How to train a model using JAX MaxText for ROCm.
+   :keywords: ROCm, AI, LLM, train, jax, torch, Llama, flux, tutorial, docker
+
+**************************************
+Training a model with MaxText for ROCm
+**************************************
+
+.. caution::
+
+   This documentation does not reflect the latest version of ROCm JAX MaxText
+   training performance documentation. See :doc:`../jax-maxtext` for the latest version.
+
+MaxText is a high-performance, open-source framework built on the Google JAX
+machine learning library to train LLMs at scale. The MaxText framework for
+ROCm is an optimized fork of the upstream
+`<https://github.com/AI-Hypercomputer/maxtext>`__ enabling efficient AI workloads
+on AMD MI300X series accelerators.
+
+The MaxText for ROCm training Docker (``rocm/jax-training:maxtext-v25.5``) image
+provides a prebuilt environment for training on AMD Instinct MI300X and MI325X accelerators,
+including essential components like JAX, XLA, ROCm libraries, and MaxText utilities.
+It includes the following software components:
+
+--------------------------+--------------------------------+
+| Software component       | Version                        |
+==========================+================================+
+| ROCm                     | 6.3.4                          |
+--------------------------+--------------------------------+
+| JAX                      | 0.4.35                         |
+--------------------------+--------------------------------+
+| Python                   | 3.10.12                        |
+--------------------------+--------------------------------+
+| Transformer Engine       | 1.12.0.dev0+b8b92dc            |
+--------------------------+--------------------------------+
+| hipBLASLt                | 0.13.0-ae9c477a                |
+--------------------------+--------------------------------+
+
+Supported features and models
+=============================
+
+MaxText provides the following key features to train large language models efficiently:
+
+- Transformer Engine (TE)
+
+- Flash Attention (FA) 3
+
+- GEMM tuning
+
+- Multi-node support
+
+.. _amd-maxtext-model-support-v255:
+
+The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
+
+* Llama 3.3 70B
+
+* Llama 3.1 8B
+
+* Llama 3.1 70B
+
+* Llama 3 8B
+
+* Llama 3 70B
+
+* Llama 2 7B
+
+* Llama 2 70B
+
+* DeepSeek-V2-Lite
+
+.. note::
+
+   Some models, such as Llama 3, require an external license agreement through
+   a third party (for example, Meta).
+
+Unsupported features
+--------------------
+
+Currently, MaxText's default packed input format is not supported. Using this format
+with the current Docker image results in incorrect attention calculations
+across different input sequences. Support for packed input format is planned for a future release.
+
+System validation
+=================
+
+Before running AI workloads, it's important to validate that your AMD hardware is configured
+correctly and performing optimally.
+
+If you have already validated your system settings, including aspects like NUMA auto-balancing, you
+can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
+optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
+before starting training.
+
+To test for optimal performance, consult the recommended :ref:`System health benchmarks
+<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+system's configuration.
+
+Environment setup
+=================
+
+This Docker image is optimized for specific model configurations outlined
+as follows. Performance can vary for other training workloads, as AMD
+doesn’t validate configurations and run conditions outside those described.
+
+.. _amd-maxtext-multi-node-setup-v255:
+
+Multi-node setup
+----------------
+
+For multi-node environments, ensure you have all the necessary packages for
+your network device, such as RDMA. If you're not using a multi-node setup
+with RDMA, skip ahead to :ref:`amd-maxtext-download-docker-v255`.
+
+1. Install the following packages to build and install the RDMA driver.
+
+   .. code-block:: shell
+
+      sudo apt install iproute2 -y
+      sudo apt install -y linux-headers-"$(uname -r)" libelf-dev
+      sudo apt install -y gcc make libtool autoconf librdmacm-dev rdmacm-utils infiniband-diags ibverbs-utils perftest ethtool libibverbs-dev rdma-core strace libibmad5 libibnetdisc5 ibverbs-providers libibumad-dev libibumad3 libibverbs1 libnl-3-dev libnl-route-3-dev
+
+   Refer to your NIC manufacturer's documentation for further steps on
+   compiling and installing the RoCE driver. For example, for Broadcom,
+   see `Compiling Broadcom NIC software from source <https://docs.broadcom.com/doc/957608-AN2XX#G3.484341>`_
+   in `Ethernet networking guide for AMD Instinct MI300X GPU clusters <https://docs.broadcom.com/doc/957608-AN2XX>`_.
+
+2. Set the following environment variables.
+
+   a. Master address
+
+      Change ``localhost`` to the master node's resolvable hostname or IP address:
+
+      .. code-block:: bash
+
+         export MASTER_ADDR="${MASTER_ADDR:-localhost}"
+
+   b. Number of nodes
+
+      Set the number of nodes you want to train on (for example, ``2``, ``4``, or ``8``):
+
+      .. code-block:: bash
+
+         export NNODES="${NNODES:-1}"
+
+   c. Node ranks
+
+      Set the rank of each node (``0`` for master, ``1`` for the first worker node, and so on).
+      Node ranks should be unique across all nodes in the cluster.
+
+      .. code-block:: bash
+
+         export NODE_RANK="${NODE_RANK:-0}"
+
+   d. Network interface
+
+      Update the network interface in the script to match your system's network interface. To
+      find your network interface, run the following (outside of any Docker container):
+
+      .. code-block:: bash
+
+         ip a
+
+      Look for an active interface with an IP address in the same subnet as
+      your other nodes. Then, update the following variable in the script, for
+      example:
+
+      .. code-block:: bash
+
+         export NCCL_SOCKET_IFNAME=ens50f0np0
+
+      This variable specifies which network interface to use for inter-node communication.
+      Setting this variable to the incorrect interface can result in communication failures
+      or significantly reduced performance.
+
+   e. RDMA interface
+
+      Ensure the :ref:`required packages <amd-maxtext-multi-node-setup-v255>` are installed on all nodes.
+      Then, set the RDMA interfaces to use for communication.
+
+      .. code-block:: bash
+
+         # If using Broadcom NIC
+         export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
+         # If using Mellanox NIC
+         export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_8,mlx5_9
+
+.. _amd-maxtext-download-docker-v255:
+
+Pull the Docker image
+---------------------
+
+1. Use the following command to pull the Docker image from Docker Hub.
+
+   .. code-block:: shell
+
+      docker pull rocm/jax-training:maxtext-v25.5
+
+2. Use the following command to launch the Docker container. Note that the benchmarking scripts
+   used in the :ref:`following section <amd-maxtext-get-started-v255>` automatically launch the Docker container
+   and execute the benchmark.
+
+   .. code-block:: shell
+
+      docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME/.ssh:/root/.ssh --shm-size 128G --name maxtext_training rocm/jax-training:maxtext-v25.5
+
+.. _amd-maxtext-get-started-v255:
+
+Getting started
+===============
+
+The following examples demonstrate how to get started with single node
+and multi-node training using the benchmarking scripts provided at
+`<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/>`__.
+
+.. important::
+
+   The provided scripts launch a Docker container and execute a benchmark. Ensure you run these commands outside of any existing Docker container.
+
+Before running any benchmarks, ensure the ``$HF_HOME`` environment variable is
+set correctly and points to your Hugging Face cache directory. Refer to the
+README at `<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/>`__
+for more detailed instructions.
+
+Single node training benchmarking examples
+------------------------------------------
+
+* Example 1: Single node training with Llama 2 7B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_7b.sh
+
+  Run the single node training benchmark:
+
+  .. code-block:: shell
+
+     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama2_7b.sh
+
+* Example 2: Single node training with Llama 2 70B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_70b.sh
+
+  Run the single node training benchmark:
+
+  .. code-block:: shell
+
+     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama2_70b.sh
+
+* Example 3: Single node training with Llama 3 8B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_8b.sh
+
+  Run the single node training benchmark:
+
+  .. code-block:: shell
+
+     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3_8b.sh
+
+* Example 4: Single node training with Llama 3 70B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_70b.sh
+
+  Run the single node training benchmark:
+
+  .. code-block:: shell
+
+     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3_70b.sh
+
+* Example 5: Single node training with Llama 3.3 70B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3.3_70b.sh
+
+  Run the single node training benchmark:
+
+  .. code-block:: shell
+
+     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3.3_70b.sh
+
+* Example 6: Single node training with DeepSeek V2 16B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/deepseek_v2_16b.sh
+
+  Run the single node training benchmark:
+
+  .. code-block:: shell
+
+     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./deepseek_v2_16b.sh
+
+  .. note::
+
+     The reported TFLOP/s by MaxText for DeepSeek is not accurate. Use
+     the tokens/s as a performance indicator.
+
+Multi-node training benchmarking examples
+-----------------------------------------
+
+The following examples use SLURM for running on multiple nodes -- the commands might need to be adjusted for your
+own cluster setup.
+
+* Example 1: Multi-node training with Llama 2 7B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_7b_multinode.sh
+
+  Run the multi-node training benchmark. For example:
+
+  .. code-block:: shell
+
+     sbatch -N <num_nodes> llama2_7b_multinode.sh
+
+* Example 2: Multi-node training with Llama 2 70B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_70b_multinode.sh
+
+  Run the multi-node training benchmark. For example:
+
+  .. code-block:: shell
+
+     sbatch -N <num_nodes> llama2_70b_multinode.sh
+
+* Example 3: Multi-node training with Llama 3 8B model
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_8b_multinode.sh
+
+  Run the multi-node training benchmark. For example:
+
+  .. code-block:: shell
+
+     sbatch -N <num_nodes> llama3_8b_multinode.sh
+
+* Example 4: Multi-node training with Llama 3 70B model
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_70b_multinode.sh
+
+  Run the multi-node training benchmark. For example:
+
+  .. code-block:: shell
+
+     sbatch -N <num_nodes> llama3_70b_multinode.sh
+
+Previous versions
+=================
+
+See :doc:`jax-maxtext-history` to find documentation for previous releases
+of the ``ROCm/jax-training`` Docker image.
--- a/docs/sphinx/_toc.yml.in
+++ b/docs/sphinx/_toc.yml.in
@@ -32,19 +32,23 @@ subtrees:
      - file: compatibility/ml-compatibility/pytorch-compatibility.rst
        title: PyTorch compatibility
      - file: compatibility/ml-compatibility/tensorflow-compatibility.rst
-        title: TensorFlow compatibility  
+        title: TensorFlow compatibility
      - file: compatibility/ml-compatibility/jax-compatibility.rst
        title: JAX compatibility
      - file: compatibility/ml-compatibility/verl-compatibility.rst
-        title: verl compatibility  
+        title: verl compatibility
      - file: compatibility/ml-compatibility/stanford-megatron-lm-compatibility.rst
        title: Stanford Megatron-LM compatibility
      - file: compatibility/ml-compatibility/dgl-compatibility.rst
-        title: DGL compatibility  
+        title: DGL compatibility
      - file: compatibility/ml-compatibility/megablocks-compatibility.rst
        title: Megablocks compatibility
      - file: compatibility/ml-compatibility/taichi-compatibility.rst
-        title: Taichi compatibility 
+        title: Taichi compatibility
+      - file: compatibility/ml-compatibility/ray-compatibility.rst
+        title: Ray compatibility
+      - file: compatibility/ml-compatibility/llama-cpp-compatibility.rst
+        title: llama.cpp compatibility
  - file: how-to/build-rocm.rst
    title: Build ROCm from source

--- a/tools/rocm-build/default.xml
+++ b/tools/rocm-build/default.xml
@@ -0,0 +1,79 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<manifest>
+    <remote name="rocm-org" fetch="https://github.com/ROCm/" />
+    <default revision="refs/tags/20250912-42"
+     remote="rocm-org"
+     sync-c="true"
+     sync-j="4" />
+<!-- List of projects for ROCm -->
+    <project name="aqlprofile" />
+    <project name="ROCR-Runtime" />
+    <project name="amdsmi" />
+    <project name="rdc" />
+    <project name="rocm_bandwidth_test" />
+    <project name="rocm_smi_lib" />
+    <project name="rocm-core" />
+    <project name="rocm-examples" />
+    <project name="rocminfo" />
+    <project name="rocprofiler" />
+    <project name="rocprofiler-register" />
+    <project name="rocprofiler-sdk" />
+    <project name="rocprofiler-compute" />
+    <project name="rocprofiler-systems" />
+    <project name="roctracer" />
+<!-- HIP projects -->
+    <project name="HIP" />
+    <project name="hip-tests" />
+    <project name="HIPIFY" />
+    <project name="clr" />
+    <project name="hipother" />
+<!-- The following projects are all associated with the AMDGPU LLVM compiler -->
+    <project name="half" />
+    <project name="llvm-project" />
+    <project name="spirv-llvm-translator" />
+<!-- gdb projects -->
+    <project name="ROCdbgapi" />
+    <project name="ROCgdb" />
+    <project name="rocr_debug_agent" />
+<!-- ROCm Libraries -->
+    <project groups="mathlibs" name="AMDMIGraphX" />
+    <project groups="mathlibs" name="MIVisionX" />
+    <project groups="mathlibs" name="ROCmValidationSuite" />
+    <project groups="mathlibs" name="composable_kernel" />
+    <project groups="mathlibs" name="hipSOLVER" />
+    <project groups="mathlibs" name="hipTensor" />
+    <project groups="mathlibs" name="hipfort" />
+    <project groups="mathlibs" name="rccl" />
+    <project groups="mathlibs" name="rocAL" />
+    <project groups="mathlibs" name="rocALUTION" />
+    <project groups="mathlibs" name="rocDecode" />
+    <project groups="mathlibs" name="rocJPEG" />
+    <project groups="mathlibs" name="rocm-libraries">
+        <linkfile src="projects/hipcub" dest="hipCUB"/>
+        <linkfile src="projects/rocprim" dest="rocPRIM"/>
+        <linkfile src="projects/hiprand" dest="hipRAND"/>
+        <linkfile src="projects/rocrand" dest="rocRAND"/>
+        <linkfile src="projects/rocthrust" dest="rocThrust"/>
+        <linkfile src="projects/hipblas-common" dest="hipBLAS-common"/>
+        <linkfile src="projects/hipblaslt" dest="hipBLASLt"/>
+        <linkfile src="projects/rocblas" dest="rocBLAS"/>
+        <linkfile src="projects/hipsparselt" dest="hipSPARSELt"/>
+        <linkfile src="projects/rocsparse" dest="rocSPARSE"/>
+        <linkfile src="projects/hipsparse" dest="hipSPARSE"/>
+        <linkfile src="projects/hipblas" dest="hipBLAS"/>
+        <linkfile src="projects/miopen" dest="MIOpen"/>
+        <linkfile src="projects/hipfft" dest="hipFFT"/>
+        <linkfile src="projects/rocfft" dest="rocFFT"/>
+    </project>
+    <project groups="mathlibs" name="rocPyDecode" />
+    <project groups="mathlibs" name="rocSHMEM" />
+    <project groups="mathlibs" name="rocSOLVER" />
+    <project groups="mathlibs" name="rocWMMA" />
+    <project groups="mathlibs" name="rocm-cmake" />
+    <project groups="mathlibs" name="rpp" />
+    <project groups="mathlibs" name="TransferBench" />
+<!-- Projects for OpenMP-Extras -->
+    <project name="aomp" path="openmp-extras/aomp" />
+    <project name="aomp-extras" path="openmp-extras/aomp-extras" />
+    <project name="flang" path="openmp-extras/flang" />
+</manifest>
Author	SHA1	Message	Date
Karthika Rayasam	7dc612421c	adding aqlprofile	2025-09-12 03:29:38 -07:00
Karthika Rayasam	d72c0c3cc2	changing filename	2025-09-12 00:10:51 -07:00
Karthika Rayasam	f909bf3635	changing revision	2025-09-12 00:04:26 -07:00
Karthika Rayasam	7cbd4b2fc6	Adding individual libraries	2025-09-11 06:47:12 -07:00
Karthika Rayasam	9d63c045a1	deleting rocr-kernel-driver	2025-09-11 05:20:51 -07:00
Karthika Rayasam	d55450070c	deleting tensile	2025-09-11 05:16:59 -07:00
Karthika Rayasam	f0dd80e23e	updating revision	2025-09-11 05:13:06 -07:00
Karthika Rayasam	eb3cf0ec1c	add rocm-42.xml file	2025-09-11 03:31:22 -07:00
Karthika Rayasam	6075acfd79	adding rocm-libraries and removing projects under rocm-libraries	2025-09-11 03:25:44 -07:00
Karthika Rayasam	2b85816b32	Adding rocm-libraries and removing the sub projects under rocm-libraries	2025-09-11 03:21:49 -07:00
amd-hsivasun	964a7cd0b5	fixed component name	2025-09-10 17:31:03 -04:00
amd-hsivasun	d3fe7439cf	[Ex CI] enable rocm-smi-lib monorepo	2025-09-10 17:31:03 -04:00
amd-hsivasun	56f566c1dc	[Ex CI] update rocminfo pipeline ID to monorepo	2025-09-10 17:24:17 -04:00
Haresh Sivasuntharampillai	88f1493b68	[Ex CI] enable rocminfo monorepo	2025-09-10 16:30:48 -04:00
anisha-amd	3ca9cb1fcc	Docs: adding ray and llama.cpp live blog links (#5290 )	2025-09-10 15:02:03 -04:00
amd-hsivasun	0840c14b6d	[Ex CI] update rocm-core pipeline ID to monorepo	2025-09-10 11:58:15 -04:00
amd-hsivasun	daa0184d2e	[Ex CI] enable rocm-core monorepo	2025-09-10 11:47:12 -04:00
Pratik Basyal	3b5019e03f	Minor correction (#5285 )	2025-09-10 10:53:25 -04:00
Pratik Basyal	68f505e375	Taichi removed (#5283 )	2025-09-10 10:07:55 -04:00
Peter Park	05a66f75fe	add qwen3 30b a3b to vllm-benchmark-models (#5280 )	2025-09-09 17:41:11 -04:00
Ibrahim Wani	3c37ae88f0	Add origami CI pipelines (#5256 ) * Add origami yaml pipeline. * Unindent lines. * Add cmake dependency step to origami yml. * Add pybind dep * Fix pipeline failures. * Quick fix * Fix pybind11 dep for almalinux * Fix pybind11 dep for almalinux again * Test * [Ex CI] don't create symlink if more than one sparse checkout dir * hipBLASLt multi sparse * Replace pybind with nanobind. * Quick fix * Testing nanobind install in pipelines * Run origami binding tests * Change build path for tests * Change build path for tests again * Add missing dep for CI * Add archs to buildJobs * Fix CI error. * Test * Test job target * Adding job target to hipblaslt dependant builds * Check devices on machine * Add gpu to pipeline * Add more gpu targets * test * Add test job to origami * Update test jobs * Finding test dir * Fix sparse checkout * Find build dir * Try to find build dir * Clean up * Test * Change test dir * Build origami in test job * Try removing job.target from params * Package bindings in build artifacts * Download build as artifact. * Comment out block * Fix checkout in test job * Test1 * Echo to list dir * Sparse checkout origami/python * Download python bindings as artifact * Try ctest instead of running test files directly * Only download artifacts for ubuntu * Add missing cd * Run individual tests not ctest. * Fix hipblaslt build failures * Resolve more ci failures in hipblaslt * Add old changes back in * Fix hipblaslt ci errors * Clean up * Add nanobind to array * Add nanobind to array correctly * Remove nanobind install script * Quick fix * Add pip module installs to test job --------- Co-authored-by: Daniel Su <danielsu@amd.com>	2025-09-09 15:13:54 -06:00
amd-hsivasun	985786e98d	Add sqlalchemy to dependencies in rocprofiler-compute	2025-09-09 15:27:56 -04:00
amd-hsivasun	f25e27acf0	Update roctracer pipeline ID and branch	2025-09-09 14:13:56 -04:00
anisha-amd	db43d18c37	Docs: frameworks compatibility- ray and llama.cpp (#5273 )	2025-09-09 11:02:30 -04:00
Peter Park	4f53183696	docs: Add JAX MaxText benchmark v25.7 (#5182 ) * Update previous versions * Add data file * fix filename and anchors * add templates * update .wordlist.txt * Update template and data add missing step fix fmt * update template * fix data * add jax 0.6.0 * update history * update quantized training note	2025-09-08 21:42:56 -04:00
Joseph Macaranas	94476f34ca	[External CI] Add amdgpu deps to rocpydecode pipeline (#5267 )	2025-09-08 11:32:10 -04:00