correct pip args

add python deps for hipblaslt
Need both target options while transitioning between build systems
2026-01-10 15:18:11 -05:00 · 2025-07-31 08:29:08 -06:00 · 2025-07-30 17:15:13 -06:00 · 2025-07-30 16:11:37 -06:00 · 2025-07-30 13:34:09 -06:00 · 2025-07-30 10:32:52 -04:00
23 changed files with 702 additions and 120 deletions
--- a/.azuredevops/components/ROCR-Runtime.yml
+++ b/.azuredevops/components/ROCR-Runtime.yml
@@ -28,8 +28,8 @@ parameters:
 - name: rocmTestDependencies
  type: object
  default:
+    - amdsmi
    - llvm-project
-    - rocm_smi_lib
    - rocprofiler-register

 - name: jobMatrix
@@ -111,14 +111,6 @@ jobs:
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        packageManager: ${{ job.packageManager }}
-    - task: Bash@3
-      displayName: Install libhwloc5
-      inputs:
-        targetType: 'inline'
-        script: |
-          wget http://ftp.us.debian.org/debian/pool/main/h/hwloc/libhwloc5_1.11.12-3_amd64.deb
-          wget http://ftp.us.debian.org/debian/pool/main/h/hwloc/libhwloc-dev_1.11.12-3_amd64.deb
-          sudo apt install -y --allow-downgrades ./libhwloc5_1.11.12-3_amd64.deb ./libhwloc-dev_1.11.12-3_amd64.deb
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
      parameters:
@@ -161,6 +153,10 @@ jobs:
        targetType: 'inline'
        workingDirectory: $(Build.SourcesDirectory)/rocrtst/suites/test_common
        script: |
+          echo $(Build.SourcesDirectory)/rocrtst/thirdparty/lib | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
+          sudo cat /etc/ld.so.conf.d/rocm-ci.conf
+          sudo ldconfig -v
+          ldconfig -p
          if [ -e /opt/rh/gcc-toolset-14/enable ]; then
            source /opt/rh/gcc-toolset-14/enable
          fi
--- a/.azuredevops/components/hipBLAS.yml
+++ b/.azuredevops/components/hipBLAS.yml
@@ -1,10 +1,29 @@
 parameters:
+- name: componentName
+  type: string
+  default: hipBLAS
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir
+  type: string
+  default: ''
+- name: triggerDownstreamJobs
+  type: boolean
+  default: false
+- name: downstreamAggregateNames
+  type: string
+  default: ''
+- name: buildDependsOn
+  type: object
+  default: null
+- name: unifiedBuild
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -69,10 +88,30 @@ parameters:
        target: gfx942
      - gfx90a:
        target: gfx90a
+# MIOpen depends on both rocRAND and hipBLAS
+# for a unified build, hipBLAS will be the one to call MIOpen
+# - name: downstreamComponentMatrix
+#   type: object
+#   default:
+#     - MIOpen:
+#       name: MIOpen
+#       sparseCheckoutDir: projects/miopen
+#       skipUnifiedBuild: 'false'
+#       buildDependsOn:
+#         - hipBLAS_build
+#       unifiedBuild:
+#         downstreamAggregateNames: hipBLAS+rocRAND
+#         buildDependsOn:
+#           - hipBLAS_build
+#           - rocRAND_build

 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: hipBLAS_build_${{ job.target }}
+  - job: ${{ parameters.componentName }}_build_${{ job.target }}
+    ${{ if parameters.buildDependsOn }}:
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_ubuntu2204_${{ job.target }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
@@ -88,6 +127,7 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aocl.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
@@ -95,6 +135,8 @@ jobs:
        dependencyList: ${{ parameters.rocmDependencies }}
        gpuTarget: ${{ job.target }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
+        ${{ if parameters.triggerDownstreamJobs }}:
+          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        extraBuildFlags: >-
@@ -109,9 +151,12 @@ jobs:
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
@@ -121,46 +166,67 @@ jobs:
        installAOCL: true
        gpuTarget: ${{ job.target }}

- ${{ each job in parameters.jobMatrix.testJobs }}:
-  - job: hipBLAS_test_${{ job.target }}
-    dependsOn: hipBLAS_build_${{ job.target }}
-    condition:
-      and(succeeded(),
-        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
-        eq(${{ parameters.aggregatePipeline }}, False)
-      )
-    variables:
-    - group: common
-    - template: /.azuredevops/variables-global.yml
-    pool: ${{ job.target }}_test_pool
-    workspace:
-      clean: all
-    steps:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        pipModules: ${{ parameters.pipModules }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-      parameters:
-        gpuTarget: ${{ job.target }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-      parameters:
-        checkoutRef: ${{ parameters.checkoutRef }}
-        dependencyList: ${{ parameters.rocmTestDependencies }}
-        gpuTarget: ${{ job.target }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-      parameters:
-        componentName: hipBLAS
-        testExecutable: $(Agent.BuildDirectory)/rocm/bin/hipblas-test
-        testParameters: '--yaml hipblas_smoke.yaml --gtest_output=xml:./test_output.xml --gtest_color=yes'
-        testDir: '$(Agent.BuildDirectory)/rocm/bin'
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        pipModules: ${{ parameters.pipModules }}
-        environment: test
-        gpuTarget: ${{ job.target }}
+- ${{ if eq(parameters.unifiedBuild, False) }}:
+  - ${{ each job in parameters.jobMatrix.testJobs }}:
+    - job: ${{ parameters.componentName }}_test_${{ job.target }}
+      dependsOn: ${{ parameters.componentName }}_build_${{ job.target }}
+      condition:
+        and(succeeded(),
+          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
+          eq(${{ parameters.aggregatePipeline }}, False)
+        )
+      variables:
+      - group: common
+      - template: /.azuredevops/variables-global.yml
+      pool: ${{ job.target }}_test_pool
+      workspace:
+        clean: all
+      steps:
+      - checkout: none
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          pipModules: ${{ parameters.pipModules }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+        parameters:
+          preTargetFilter: ${{ parameters.componentName }}
+          gpuTarget: ${{ job.target }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+        parameters:
+          checkoutRef: ${{ parameters.checkoutRef }}
+          dependencyList: ${{ parameters.rocmTestDependencies }}
+          gpuTarget: ${{ job.target }}
+          ${{ if parameters.triggerDownstreamJobs }}:
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+        parameters:
+          componentName: ${{ parameters.componentName }}
+          testExecutable: $(Agent.BuildDirectory)/rocm/bin/hipblas-test
+          testParameters: '--yaml hipblas_smoke.yaml --gtest_output=xml:./test_output.xml --gtest_color=yes'
+          testDir: '$(Agent.BuildDirectory)/rocm/bin'
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          pipModules: ${{ parameters.pipModules }}
+          environment: test
+          gpuTarget: ${{ job.target }}
+
+# - ${{ if parameters.triggerDownstreamJobs }}:
+#   - ${{ each component in parameters.downstreamComponentMatrix }}:
+#     - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
+#       - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
+#         parameters:
+#           checkoutRepo: ${{ parameters.checkoutRepo }}
+#           sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
+#           triggerDownstreamJobs: true
+#           unifiedBuild: ${{ parameters.unifiedBuild }}
+#           ${{ if parameters.unifiedBuild }}:
+#             buildDependsOn: ${{ component.unifiedBuild.buildDependsOn }}
+#             downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ component.unifiedBuild.downstreamAggregateNames }}
+#           ${{ else }}:
+#             buildDependsOn: ${{ component.buildDependsOn }}
+#             downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
--- a/.azuredevops/components/hipBLASLt.yml
+++ b/.azuredevops/components/hipBLASLt.yml
@@ -36,8 +36,10 @@ parameters:
    - gfortran
    - git
    - libdrm-dev
+    - liblapack-dev
    - libmsgpack-dev
    - libnuma-dev
+    - libopenblas-dev
    - ninja-build
    - python3-pip
    - python3-venv
@@ -46,6 +48,12 @@ parameters:
  default:
    - joblib
    - "packaging>=22.0"
+    - pyyaml
+    - msgpack
+    - simplejson
+    - ujson
+    - orjson
+    - yappi
    - --upgrade
 - name: rocmDependencies
  type: object
@@ -195,6 +203,7 @@ jobs:
          -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
          -DCMAKE_C_COMPILER_LAUNCHER=ccache
          -DAMDGPU_TARGETS=${{ job.target }}
+          -DGPU_TARGETS=${{ job.target }}
          -DBUILD_CLIENTS_TESTS=ON
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
--- a/.azuredevops/components/rocBLAS.yml
+++ b/.azuredevops/components/rocBLAS.yml
@@ -104,17 +104,17 @@ parameters:
        - rocBLAS_build
    # rocSOLVER depends on both rocBLAS and rocPRIM
    # for a unified build, rocBLAS will be the one to call rocSOLVER
-    # - rocSOLVER:
-    #   name: rocSOLVER
-    #   sparseCheckoutDir: projects/rocsolver
-    #   skipUnifiedBuild: 'false'
-    #   buildDependsOn:
-    #     - rocBLAS_build
-    #   unifiedBuild:
-    #     downstreamAggregateNames: rocBLAS+rocPRIM
-    #     buildDependsOn:
-    #       - rocBLAS_build
-    #       - rocPRIM_build
+    - rocSOLVER:
+      name: rocSOLVER
+      sparseCheckoutDir: projects/rocsolver
+      skipUnifiedBuild: 'false'
+      buildDependsOn:
+        - rocBLAS_build
+      unifiedBuild:
+        downstreamAggregateNames: rocBLAS+rocPRIM
+        buildDependsOn:
+          - rocBLAS_build
+          - rocPRIM_build

 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
--- a/.azuredevops/components/rocPRIM.yml
+++ b/.azuredevops/components/rocPRIM.yml
@@ -91,12 +91,12 @@ parameters:
        - rocPRIM_build
    # rocSOLVER depends on both rocBLAS and rocPRIM
    # for a unified build, rocBLAS will be the one to call rocSOLVER
-    # - rocSOLVER:
-    #   name: rocSOLVER
-    #   sparseCheckoutDir: projects/rocsolver
-    #   skipUnifiedBuild: 'true'
-    #   buildDependsOn:
-    #     - rocPRIM_build
+    - rocSOLVER:
+      name: rocSOLVER
+      sparseCheckoutDir: projects/rocsolver
+      skipUnifiedBuild: 'true'
+      buildDependsOn:
+        - rocPRIM_build

 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
--- a/.azuredevops/components/rocSOLVER.yml
+++ b/.azuredevops/components/rocSOLVER.yml
@@ -83,6 +83,28 @@ parameters:
    testJobs:
      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
+- name: downstreamComponentMatrix
+  type: object
+  default:
+    - hipBLAS:
+      name: hipBLAS
+      sparseCheckoutDir: projects/hipblas
+      skipUnifiedBuild: 'false'
+      buildDependsOn:
+        - rocSOLVER_build
+    # hipSOLVER depends on both rocSOLVER and rocSPARSE
+    # for a unified build, rocSOLVER will be the one to call hipSOLVER
+    # - hipSOLVER:
+    #   name: hipSOLVER
+    #   sparseCheckoutDir: projects/hipsolver
+    #   skipUnifiedBuild: 'false'
+    #   buildDependsOn:
+    #     - rocSOLVER_build
+    #   unifiedBuild:
+    #     downstreamAggregateNames: rocSOLVER+rocSPARSE
+    #     buildDependsOn:
+    #       - rocSOLVER_build
+    #       - rocSPARSE_build

 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
@@ -228,3 +250,19 @@ jobs:
          aptPackages: ${{ parameters.aptPackages }}
          environment: test
          gpuTarget: ${{ job.target }}
+
+- ${{ if parameters.triggerDownstreamJobs }}:
+  - ${{ each component in parameters.downstreamComponentMatrix }}:
+    - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
+      - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
+        parameters:
+          checkoutRepo: ${{ parameters.checkoutRepo }}
+          sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
+          triggerDownstreamJobs: true
+          unifiedBuild: ${{ parameters.unifiedBuild }}
+          ${{ if parameters.unifiedBuild }}:
+            buildDependsOn: ${{ component.unifiedBuild.buildDependsOn }}
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ component.unifiedBuild.downstreamAggregateNames }}
+          ${{ else }}:
+            buildDependsOn: ${{ component.buildDependsOn }}
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
--- a/.azuredevops/nightly/rocm-nightly.yml
+++ b/.azuredevops/nightly/rocm-nightly.yml
@@ -3,21 +3,21 @@ parameters:
 - name: jobList
  type: object
  default:
-    - { os: ubuntu2204, target: gfx942, source: staging }
-    - { os: ubuntu2204, target: gfx90a, source: staging }
-    - { os: ubuntu2204, target: gfx1201, source: staging }
-    - { os: ubuntu2204, target: gfx1100, source: staging }
-    - { os: ubuntu2204, target: gfx1030, source: staging }
-    - { os: ubuntu2404, target: gfx942, source: staging }
-    - { os: ubuntu2404, target: gfx90a, source: staging }
-    - { os: ubuntu2404, target: gfx1201, source: staging }
-    - { os: ubuntu2404, target: gfx1100, source: staging }
-    - { os: ubuntu2404, target: gfx1030, source: staging }
-    - { os: almalinux8, target: gfx942, source: staging }
-    - { os: almalinux8, target: gfx90a, source: staging }
-    - { os: almalinux8, target: gfx1201, source: staging }
-    - { os: almalinux8, target: gfx1100, source: staging }
-    - { os: almalinux8, target: gfx1030, source: staging }
+    - { os: ubuntu2204, packageManager: apt, target: gfx942, source: staging }
+    - { os: ubuntu2204, packageManager: apt, target: gfx90a, source: staging }
+    - { os: ubuntu2204, packageManager: apt, target: gfx1201, source: staging }
+    - { os: ubuntu2204, packageManager: apt, target: gfx1100, source: staging }
+    - { os: ubuntu2204, packageManager: apt, target: gfx1030, source: staging }
+    - { os: ubuntu2404, packageManager: apt, target: gfx942, source: staging }
+    - { os: ubuntu2404, packageManager: apt, target: gfx90a, source: staging }
+    - { os: ubuntu2404, packageManager: apt, target: gfx1201, source: staging }
+    - { os: ubuntu2404, packageManager: apt, target: gfx1100, source: staging }
+    - { os: ubuntu2404, packageManager: apt, target: gfx1030, source: staging }
+    - { os: almalinux8, packageManager: dnf, target: gfx942, source: staging }
+    - { os: almalinux8, packageManager: dnf, target: gfx90a, source: staging }
+    - { os: almalinux8, packageManager: dnf, target: gfx1201, source: staging }
+    - { os: almalinux8, packageManager: dnf, target: gfx1100, source: staging }
+    - { os: almalinux8, packageManager: dnf, target: gfx1030, source: staging }
 - name: rocmDependencies
  type: object
  default:
@@ -92,7 +92,8 @@ schedules:

 jobs:
 - ${{ each job in parameters.jobList }}:
-  - job: rocm_nightly_${{ job.os }}_${{ job.target }}_${{ job.source }}
+  - job: nightly_${{ job.os }}_${{ job.target }}_${{ job.source }}
+    timeoutInMinutes: 90
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
@@ -131,7 +132,7 @@ jobs:
        includeRootFolder: false
        archiveType: tar
        tarCompression: gz
-        archiveFile: $(Build.ArtifactStagingDirectory)/$(Build.DefinitionName)_$(Build.BuildNumber)_ubuntu2204_${{ job.target }}.tar.gz
+        archiveFile: $(Build.ArtifactStagingDirectory)/$(Build.DefinitionName)_$(Build.BuildNumber)_${{ job.os }}_${{ job.target }}.tar.gz
    - script: du -sh $(Build.ArtifactStagingDirectory)
      displayName: Compressed ROCm size
    - task: PublishPipelineArtifact@1
@@ -144,5 +145,95 @@ jobs:
      inputs:
        workingDirectory: $(Pipeline.Workspace)
        targetType: inline
-        script: echo "$(Build.DefinitionName)_$(Build.BuildNumber)_ubuntu2204_${{ job.target }}.tar.gz" >> pipelineArtifacts.txt
+        script: echo "$(Build.DefinitionName)_$(Build.BuildNumber)_${{ job.os }}_${{ job.target }}.tar.gz" >> pipelineArtifacts.txt
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
+    - ${{ if eq(job.packageManager, 'apt') }}:
+      - task: Bash@3
+        displayName: Create Dockerfile
+        inputs:
+          workingDirectory: $(Agent.BuildDirectory)
+          targetType: inline
+          script: |
+            cat <<'EOF' > Dockerfile
+              ${{ iif(eq(job.os, 'ubuntu2204'), 'FROM ubuntu:22.04', '') }}
+              ${{ iif(eq(job.os, 'ubuntu2404'), 'FROM ubuntu:24.04', '') }}
+
+              WORKDIR /root
+              RUN mkdir rocm
+
+              RUN apt update \
+                && apt upgrade -y \
+                && apt install -y cmake curl git gcc g++ gpg lsb-release lsof ninja-build pkg-config python3 python3-pip wget zip libdrm-dev libelf-dev libgtest-dev libhsakmt-dev libhwloc-dev libnuma-dev libstdc++-12-dev libtbb-dev jq \
+                && apt clean all
+
+              RUN PACKAGE_NAME=$(curl -s https://repo.radeon.com/rocm/apt/latest/pool/main/h/hsa-amd-aqlprofile/ | grep -oP "href=\"\K[^\"]*$(lsb_release -rs)[^\"]*\.deb") \
+                && wget -nv --retry-connrefused https://repo.radeon.com/rocm/apt/latest/pool/main/h/hsa-amd-aqlprofile/$PACKAGE_NAME \
+                && mkdir hsa-amd-aqlprofile \
+                && dpkg-deb -R $PACKAGE_NAME hsa-amd-aqlprofile \
+                && cp -R hsa-amd-aqlprofile/opt/rocm-*/* rocm
+
+              RUN ARTIFACT_URL="https://dev.azure.com/ROCm-CI/ROCm-CI/_apis/build/builds/$(Build.BuildId)/artifacts?artifactName=nightly${{ job.os }}${{ job.target }}${{ job.source }}&api-version=7.1" \
+                && DOWNLOAD_URL=$(curl -s $ARTIFACT_URL | jq ".resource.downloadUrl" | tr -d '"') \
+                && wget -nv --retry-connrefused $DOWNLOAD_URL -O nightly.zip \
+                && unzip nightly.zip \
+                && tar -xf nightly${{ job.os }}${{ job.target }}${{ job.source }}/rocm-nightly*${{ job.os }}*${{ job.target }}*.tar.gz -C rocm
+
+              RUN echo /root/rocm/lib | tee /etc/ld.so.conf.d/rocm-ci.conf
+              RUN echo /root/rocm/llvm/lib | tee -a /etc/ld.so.conf.d/rocm-ci.conf
+              RUN echo /root/rocm/lib64 | tee -a /etc/ld.so.conf.d/rocm-ci.conf
+              RUN echo /root/rocm/llvm/lib64 | tee -a /etc/ld.so.conf.d/rocm-ci.conf
+              RUN ldconfig -v
+              ENV PATH="$PATH:/root/rocm/bin"
+              ENTRYPOINT ["/bin/bash"]
+            EOF
+            cat Dockerfile
+    - ${{ elseif eq(job.packageManager, 'dnf') }}:
+      - task: Bash@3
+        displayName: Create Dockerfile
+        inputs:
+          workingDirectory: $(Agent.BuildDirectory)
+          targetType: inline
+          script: |
+            cat <<'EOF' > Dockerfile
+              ${{ iif(eq(job.os, 'almalinux8'), 'FROM almalinux:8', '') }}
+
+              WORKDIR /root
+              RUN mkdir rocm
+
+              RUN dnf install -y cmake curl git gcc gcc-c++ gnupg2 redhat-lsb-core lsof pkgconf python3 python3-pip wget zip libdrm-devel elfutils-libelf-devel numactl-devel libstdc++-devel tbb-devel jq \
+                && dnf clean all
+
+              RUN PACKAGE_NAME=$(curl -s https://repo.radeon.com/rocm/rhel8/$(REPO_RADEON_VERSION)/main/ | grep -oP "hsa-amd-aqlprofile-[^\"]+\.rpm" | head -n1) \
+                && wget -nv --retry-connrefused https://repo.radeon.com/rocm/rhel8/$(REPO_RADEON_VERSION)/main/$PACKAGE_NAME \
+                && mkdir hsa-amd-aqlprofile \
+                && dnf -y install rpm-build cpio \
+                && rpm2cpio $PACKAGE_NAME | (cd hsa-amd-aqlprofile && cpio -idmv) \
+                && cp -R hsa-amd-aqlprofile/opt/rocm-*/* rocm
+
+              RUN ARTIFACT_URL="https://dev.azure.com/ROCm-CI/ROCm-CI/_apis/build/builds/$(Build.BuildId)/artifacts?artifactName=nightly${{ job.os }}${{ job.target }}${{ job.source }}&api-version=7.1" \
+                && DOWNLOAD_URL=$(curl -s $ARTIFACT_URL | jq ".resource.downloadUrl" | tr -d '"') \
+                && wget -nv --retry-connrefused $DOWNLOAD_URL -O nightly.zip \
+                && UNZIP_DISABLE_ZIPBOMB_DETECTION=TRUE unzip nightly.zip \
+                && tar -xf nightly${{ job.os }}${{ job.target }}${{ job.source }}/rocm-nightly*${{ job.os }}*${{ job.target }}*.tar.gz -C rocm
+
+              RUN echo /root/rocm/lib | tee /etc/ld.so.conf.d/rocm-ci.conf
+              RUN echo /root/rocm/llvm/lib | tee -a /etc/ld.so.conf.d/rocm-ci.conf
+              RUN echo /root/rocm/lib64 | tee -a /etc/ld.so.conf.d/rocm-ci.conf
+              RUN echo /root/rocm/llvm/lib64 | tee -a /etc/ld.so.conf.d/rocm-ci.conf
+              RUN ldconfig -v
+              ENV PATH="$PATH:/root/rocm/bin"
+              ENTRYPOINT ["/bin/bash"]
+            EOF
+            cat Dockerfile
+    - task: Docker@2
+      displayName: Build and upload Docker image
+      inputs:
+        containerRegistry: ContainerService3
+        repository: 'nightly-${{ job.os }}-${{ job.target }}-${{ job.source }}'
+        Dockerfile: '$(Agent.BuildDirectory)/Dockerfile'
+        buildContext: '$(Agent.BuildDirectory)'
+    - task: Bash@3
+      displayName: '!! Docker Run Command !!'
+      inputs:
+        targetType: inline
+        script: echo "docker run -it --network=host --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined rocmexternalcicd.azurecr.io/nightly-${{ job.os }}-${{ job.target }}-${{ job.source }}:$(Build.BuildId)" | tr '[:upper:]' '[:lower:]'
--- a/.azuredevops/templates/steps/dependencies-dnf.yml
+++ b/.azuredevops/templates/steps/dependencies-dnf.yml
@@ -54,11 +54,13 @@ parameters:
    libfftw3-dev: fftw-devel
    libfmt-dev: fmt-devel
    libgmp-dev: gmp-devel
+    liblapack-dev: lapack-devel
    liblzma-dev: xz-devel
    libmpfr-dev: mpfr-devel
    libmsgpack-dev: msgpack-devel
    libncurses5-dev: ncurses-devel
    libnuma-dev: numactl-devel
+    libopenblas-dev: openblas-devel
    libopenmpi-dev: openmpi-devel
    libpci-dev: libpciaccess-devel
    libssl-dev: openssl-devel
--- a/.azuredevops/variables-global.yml
+++ b/.azuredevops/variables-global.yml
@@ -32,13 +32,13 @@ variables:
 - name: GFX90A_TEST_POOL
  value: gfx90a_test_pool
 - name: LATEST_RELEASE_VERSION
-  value: 6.4.1
+  value: 6.4.2
 - name: REPO_RADEON_VERSION
-  value: 6.4.1
+  value: 6.4.2
 - name: NEXT_RELEASE_VERSION
  value: 7.0.0
 - name: LATEST_RELEASE_TAG
-  value: rocm-6.4.1
+  value: rocm-6.4.2
 - name: DOCKER_SKIP_GFX
  value: gfx90a
 - name: AMDMIGRAPHX_PIPELINE_ID
@@ -68,7 +68,7 @@ variables:
 - name: HIPBLAS_COMMON_PIPELINE_ID
  value: 300
 - name: HIPBLAS_PIPELINE_ID
-  value: 87
+  value: 317
 - name: HIPBLASLT_PIPELINE_ID
  value: 301
 - name: HIPCUB_PIPELINE_ID
@@ -84,7 +84,7 @@ variables:
 - name: HIPSOLVER_PIPELINE_ID
  value: 84
 - name: HIPSPARSE_PIPELINE_ID
-  value: 83
+  value: 315
 - name: HIPSPARSELT_PIPELINE_ID
  value: 309
 - name: HIPTENSOR_PIPELINE_ID
--- a/.wordlist.txt
+++ b/.wordlist.txt
@@ -408,6 +408,7 @@ SDMA
 SDPA
 SDRAM
 SENDMSG
+SGLang
 SGPR
 SGPRs
 SHA
@@ -863,6 +864,7 @@ seealso
 sendmsg
 seqs
 serializers
+sglang
 shader
 sharding
 sigmoid
--- a/docs/compatibility/ml-compatibility/dgl-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/dgl-compatibility.rst
@@ -42,16 +42,16 @@ GAT, GCN and GraphSage. Using these we can support a variety of use-cases such a
 - 1D (Temporal) and 2D (Image) Classification
 - Drug Discovery

-Refer to :doc:`ROCm DGL blog posts <https://rocm.blogs.amd.com/blog/tag/dgl.html>` 
-for examples and best practices to optimize your training workflows on AMD GPUs. 
+Multiple use cases of DGL have been tested and verified.
+A recommended example is a drug discovery pipeline that uses the ``SE3Transformer``.
+Refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`_, 
+where you can search for DGL examples and best practices to optimize your training workflows on AMD GPUs. 

 Coverage includes:

 - Single-GPU training/inference
 - Multi-GPU training

-Benchmarking details are included in the :doc:`Benchmarks` section.
-

 .. _dgl-docker-compat:

@@ -252,4 +252,4 @@ Unsupported functions
 * ``gather_mm_idx_b``
 * ``pgexplainer``
 * ``sample_labors_prob``
-* ``sample_labors_noprob``
+* ``sample_labors_noprob``
--- a/docs/data/how-to/rocm-for-ai/inference/pytorch-inference-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/pytorch-inference-benchmark-models.yaml
@@ -1,6 +1,6 @@
 pytorch_inference_benchmark:
  unified_docker:
-    latest: &rocm-pytorch-docker-latest
+    latest:
      pull_tag: rocm/pytorch:latest
      docker_hub_url:
      rocm_version:
@@ -39,3 +39,11 @@ pytorch_inference_benchmark:
        model_repo: Wan-AI/Wan2.1-T2V-14B
        url: https://huggingface.co/Wan-AI/Wan2.1-T2V-14B
        precision: bfloat16
+    - group: Janus-Pro
+      tag: janus-pro
+      models:
+      - model: Janus Pro 7B
+        mad_tag: pyt_janus_pro_inference
+        model_repo: deepseek-ai/Janus-Pro-7B
+        url: https://huggingface.co/deepseek-ai/Janus-Pro-7B
+        precision: bfloat16
--- a/docs/data/how-to/rocm-for-ai/inference/sglang-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/sglang-benchmark-models.yaml
@@ -0,0 +1,17 @@
+sglang_benchmark:
+  unified_docker:
+    latest:
+      pull_tag: lmsysorg/sglang:v0.4.5-rocm630
+      docker_hub_url: https://hub.docker.com/layers/lmsysorg/sglang/v0.4.5-rocm630/images/sha256-63d2cb760a237125daf6612464cfe2f395c0784e21e8b0ea37d551cd10d3c951
+      rocm_version: 6.3.0
+      sglang_version: 0.4.5 (0.4.5-rocm)
+      pytorch_version: 2.6.0a0+git8d4926e
+  model_groups:
+    - group: DeepSeek
+      tag: deepseek
+      models:
+      - model: DeepSeek-R1-Distill-Qwen-32B
+        mad_tag: pyt_sglang_deepseek-r1-distill-qwen-32b
+        model_repo: deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
+        url: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
+        precision: bfloat16
--- a/docs/data/rocm-software-stack-6_4_0.jpg
+++ b/docs/data/rocm-software-stack-6_4_0.jpg
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/sglang-history.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/sglang-history.rst
@@ -0,0 +1,25 @@
+:orphan:
+
+****************************************************
+SGLang inference performance testing version history
+****************************************************
+
+This table lists previous versions of the ROCm SGLang inference performance
+testing environment. For detailed information about available models for
+benchmarking, see the version-specific documentation.
+
+.. list-table::
+   :header-rows: 1
+
+   * - Docker image tag
+     - Components
+     - Resources
+
+   * - ``lmsysorg/sglang:v0.4.5-rocm630``
+     - 
+       * ROCm 6.3.0
+       * SGLang 0.4.5
+       * PyTorch 2.6.0
+     - 
+       * :doc:`Documentation <../sglang>`
+       * `Docker Hub <https://hub.docker.com/layers/lmsysorg/sglang/v0.4.5-rocm630/images/sha256-63d2cb760a237125daf6612464cfe2f395c0784e21e8b0ea37d551cd10d3c951>`__
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference.rst
@@ -103,7 +103,7 @@ PyTorch inference performance testing

         The Chai-1 benchmark uses a specifically selected Docker image using ROCm 6.2.3 and PyTorch 2.3.0 to address an accuracy issue.

-   .. container:: model-doc pyt_clip_inference pyt_mochi_video_inference pyt_wan2.1_inference
+   .. container:: model-doc pyt_clip_inference pyt_mochi_video_inference pyt_wan2.1_inference pyt_janus_pro_inference

      Use the following command to pull the `ROCm PyTorch Docker image <https://hub.docker.com/layers/rocm/pytorch/latest/images/sha256-05b55983e5154f46e7441897d0908d79877370adca4d1fff4899d9539d6c4969>`__ from Docker Hub.

@@ -140,22 +140,27 @@ PyTorch inference performance testing
      .. code-block:: shell

         export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
-         python3 tools/run_models.py --tags {{model.mad_tag}} --keep-model-dir --live-output --timeout 28800
+         madengine run \
+             --tags {{model.mad_tag}} \
+             --keep-model-dir \
+             --live-output \
+             --timeout 28800

      MAD launches a Docker container with the name
      ``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
-      model are collected in ``perf.csv``.
+      model are collected in ``perf_{{model.mad_tag}}.csv``.

+      {% if model.mad_tag != "pyt_janus_pro_inference" %}
      .. note::

         For improved performance, consider enabling TunableOp. By default,
         ``{{model.mad_tag}}`` runs with TunableOp disabled (see
         `<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To enable
-         it, edit the default run behavior in the ``tools/run_models.py``-- update the model's
-         run ``args`` by changing ``--tunableop off`` to ``--tunableop on``.
+         it, include the ``--tunableop on`` argument in your run.

         Enabling TunableOp triggers a two-pass run -- a warm-up followed by the performance-collection run.
         Although this might increase the initial training time, it can result in a performance gain.
+      {% endif %}

      {% endfor %}
   {% endfor %}
@@ -163,8 +168,10 @@ PyTorch inference performance testing
 Further reading
 ===============

+- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
+
 - To learn more about system settings and management practices to configure your system for
-  MI300X accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
+  AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

 - For application performance optimization strategies for HPC and AI workloads,
  including inference with vLLM, see :doc:`../../inference-optimization/workload`.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/sglang.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/sglang.rst
@@ -0,0 +1,280 @@
+.. meta::
+   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and SGLang
+   :keywords: model, MAD, automation, dashboarding, validate
+
+************************************
+SGLang inference performance testing
+************************************
+
+.. _sglang-benchmark-unified-docker:
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/sglang-benchmark-models.yaml
+
+   {% set unified_docker = data.sglang_benchmark.unified_docker.latest %}
+
+   `SGLang <https://docs.sglang.ai>`__ is a high-performance inference and
+   serving engine for large language models (LLMs) and vision models. The
+   ROCm-enabled `SGLang Docker image <{{ unified_docker.docker_hub_url }}>`__
+   bundles SGLang with PyTorch, optimized for AMD Instinct MI300X series
+   accelerators. It includes the following software components:
+
+   .. list-table::
+      :header-rows: 1
+
+      * - Software component
+        - Version
+
+      * - `ROCm <https://github.com/ROCm/ROCm>`__
+        - {{ unified_docker.rocm_version }}
+
+      * - `SGLang <https://docs.sglang.ai/index.html>`__
+        - {{ unified_docker.sglang_version }} 
+
+      * - `PyTorch <https://github.com/pytorch/pytorch>`__
+        - {{ unified_docker.pytorch_version }} 
+
+System validation
+=================
+
+Before running AI workloads, it's important to validate that your AMD hardware is configured
+correctly and performing optimally.
+
+If you have already validated your system settings, including aspects like NUMA auto-balancing, you
+can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
+optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
+before running inference.
+
+To test for optimal performance, consult the recommended :ref:`System health benchmarks
+<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+system's configuration.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/sglang-benchmark-models.yaml
+
+   {% set unified_docker = data.sglang_benchmark.unified_docker.latest %}
+   {% set model_groups = data.sglang_benchmark.model_groups %}
+
+   Pull the Docker image
+   =====================
+
+   Download the `SGLang Docker image <{{ unified_docker.docker_hub_url }}>`__.
+   Use the following command to pull the Docker image from Docker Hub.
+
+   .. code-block:: shell
+
+      docker pull {{ unified_docker.pull_tag }}
+
+   Benchmarking
+   ============
+
+   Once the setup is complete, choose one of the following methods to benchmark inference performance with
+   `DeepSeek-R1-Distill-Qwen-32B <https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B>`__.
+
+   .. _sglang-benchmark-mad:
+
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+
+   .. container:: model-doc {{model.mad_tag}}
+
+      .. tab-set::
+
+         .. tab-item:: MAD-integrated benchmarking
+
+            1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
+               directory and install the required packages on the host machine.
+
+               .. code-block:: shell
+
+                  git clone https://github.com/ROCm/MAD
+                  cd MAD
+                  pip install -r requirements.txt
+
+            2. Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
+               using one GPU with the ``{{model.precision}}`` data type on the host machine.
+
+               .. code-block:: shell
+
+                  export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+                  madengine run \
+                      --tags {{model.mad_tag}} \
+                      --keep-model-dir \
+                      --live-output \
+                      --timeout 28800
+
+            MAD launches a Docker container with the name
+            ``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
+            model are collected in the following path: ``~/MAD/perf_DeepSeek-R1-Distill-Qwen-32B.csv``.
+
+            Although the DeepSeek-R1-Distill-Qwen-32B model is preconfigured
+            to collect latency and throughput performance data, you can also change the benchmarking
+            parameters. See the standalone benchmarking tab for more information.
+
+         .. tab-item:: Standalone benchmarking
+
+            .. rubric:: Download the Docker image and required scripts
+
+            1. Run the SGLang benchmark script independently by starting the
+               `Docker container <{{ unified_docker.docker_hub_url }}>`__
+               as shown in the following snippet.
+
+               .. code-block:: shell
+
+                  docker pull {{ unified_docker.pull_tag }}
+                  docker run -it \
+                      --device=/dev/kfd \
+                      --device=/dev/dri \
+                      --group-add video \
+                      --shm-size 16G \
+                      --security-opt seccomp=unconfined \
+                      --security-opt apparmor=unconfined \
+                      --cap-add=SYS_PTRACE \
+                      -v $(pwd):/workspace \
+                      --env HUGGINGFACE_HUB_CACHE=/workspace \
+                      --name test \
+                      {{ unified_docker.pull_tag }}
+
+            2. In the Docker container, clone the ROCm MAD repository and navigate to the
+               benchmark scripts directory at ``~/MAD/scripts/sglang``.
+
+               .. code-block:: shell
+
+                  git clone https://github.com/ROCm/MAD
+                  cd MAD/scripts/sglang
+
+            3. To start the benchmark, use the following command with the appropriate options.
+
+               .. dropdown:: Benchmark options
+                  :open:
+
+                  .. list-table::
+                     :header-rows: 1
+                     :align: center
+
+                     * - Name
+                       - Options
+                       - Description
+
+                     * - ``$test_option``
+                       - latency
+                       - Measure decoding token latency
+
+                     * -
+                       - throughput
+                       - Measure token generation throughput
+
+                     * -
+                       - all
+                       - Measure both throughput and latency
+
+                     * - ``$num_gpu``
+                       - 8
+                       - Number of GPUs
+
+                     * - ``$datatype``
+                       - ``bfloat16``
+                       - Data type
+
+                     * - ``$dataset``
+                       - random
+                       - Dataset
+
+                  The input sequence length, output sequence length, and tensor parallelism (TP) are
+                  already configured. You don't need to specify them with this script.
+
+               Command:
+
+               .. code-block:: shell
+
+                  ./sglang_benchmark_report.sh -s $test_option -m {{model.model_repo}} -g $num_gpu -d $datatype [-a $dataset]
+
+            .. note::
+
+               If you encounter the following error, pass your access-authorized Hugging
+               Face token to the gated models.
+
+               .. code-block:: shell-session
+
+                  OSError: You are trying to access a gated repo.
+                  # pass your HF_TOKEN
+                  export HF_TOKEN=$your_personal_hf_token
+
+            .. rubric:: Benchmarking examples
+
+            Here are some examples of running the benchmark with various options:
+
+            * Latency benchmark
+
+              Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.
+
+              .. code-block:: shell
+
+                 ./sglang_benchmark_report.sh \
+                     -s latency \
+                     -m {{model.model_repo}} \
+                     -g 8 \
+                     -d {{model.precision}}
+
+              Find the latency report at ``./reports_{{model.precision}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
+
+            * Throughput benchmark
+
+              Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.
+
+              .. code-block:: shell
+
+                 ./sglang_benchmark_report.sh \
+                     -s throughput \
+                     -m {{model.model_repo}} \
+                     -g 8 \
+                     -d {{model.precision}} \
+                     -a random
+
+              Find the throughput report at ``./reports_{{model.precision}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
+
+            .. raw:: html
+
+               <style>
+               mjx-container[jax="CHTML"][display="true"] {
+                  text-align: left;
+                  margin: 0;
+               }
+               </style>
+
+            .. note::
+
+               Throughput is calculated as:
+
+               - .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
+
+               - .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
+      {% endfor %}
+   {% endfor %}
+
+Further reading
+===============
+
+- To learn more about the options for latency and throughput benchmark scripts,
+  see `<https://github.com/sgl-project/sglang/tree/main/benchmark/blog_v0_2>`__.
+
+- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
+
+- To learn more about system settings and management practices to configure your system for
+  MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`__.
+
+- For application performance optimization strategies for HPC and AI workloads,
+  including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
+
+- To learn how to run community models from Hugging Face on AMD GPUs, see
+  :doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
+
+- To learn how to fine-tune LLMs and optimize inference, see
+  :doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
+
+- For a list of other ready-made Docker images for AI with ROCm, see
+  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
+
+Previous versions
+=================
+
+See :doc:`previous-versions/sglang-history` to find documentation for previous releases
+of SGLang inference performance testing.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst
@@ -202,7 +202,7 @@ system's configuration.
               .. code-block:: shell

                  export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
-                  python3 tools/run_models.py \
+                  madengine run \
                      --tags {{model.mad_tag}} \
                      --keep-model-dir \
                      --live-output \
@@ -226,12 +226,12 @@ system's configuration.

               By default, ``{{model.mad_tag}}`` runs with TunableOp disabled
               (see
-               `<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To
-               enable it, edit the default run behavior in the ``models.json``
-               configuration before running inference -- update the model's run
-               ``args`` by changing ``--tunableop off`` to ``--tunableop on``.
+               `<https://github.com/ROCm/MAD/blob/develop/models.json>`__).
+               To enable it, include the ``--tunableop on`` argument in your
+               run.

-               Enabling TunableOp triggers a two-pass run -- a warm-up followed by the performance-collection run.
+               Enabling TunableOp triggers a two-pass run -- a warm-up followed
+               by the performance-collection run.

            {% endif %}

@@ -419,8 +419,10 @@ Further reading
 - To learn more about the options for latency and throughput benchmark scripts,
  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.

+- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
+
 - To learn more about system settings and management practices to configure your system for
-  MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
+  AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

 - For application performance optimization strategies for HPC and AI workloads,
  including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
--- a/docs/how-to/rocm-for-ai/inference/index.rst
+++ b/docs/how-to/rocm-for-ai/inference/index.rst
@@ -24,4 +24,6 @@ training, fine-tuning, and inference. It leverages popular machine learning fram

 - :doc:`PyTorch inference performance testing <benchmark-docker/pytorch-inference>`

+- :doc:`SGLang inference performance testing <benchmark-docker/sglang>`
+
 - :doc:`Deploying your model <deploy-your-model>`
--- a/docs/how-to/rocm-for-ai/install.rst
+++ b/docs/how-to/rocm-for-ai/install.rst
@@ -24,12 +24,13 @@ If you’re new to ROCm, refer to the :doc:`ROCm quick start install guide for L
 If you’re using a Radeon GPU for graphics-accelerated applications, refer to the
 `Radeon installation instructions <https://rocm.docs.amd.com/projects/radeon/en/docs-6.1.3/docs/install/native_linux/install-radeon.html>`_.

-ROCm supports multiple :doc:`installation methods <rocm-install-on-linux:install/install-overview>`:
+You can install ROCm on :doc:`compatible systems <rocm-install-on-linux:reference/system-requirements>` via your Linux
+distribution's package manager. See the following documentation resources to get started:
+
+* :doc:`ROCm installation overview <rocm-install-on-linux:install/install-overview>`

 * :doc:`Using your Linux distribution's package manager <rocm-install-on-linux:install/install-methods/package-manager-index>`

-* :doc:`Using the AMDGPU installer <rocm-install-on-linux:install/install-methods/amdgpu-installer-index>`
-
 * :ref:`Multi-version installation <rocm-install-on-linux:installation-types>`

 .. grid:: 1
@@ -59,6 +60,12 @@ images with the framework pre-installed.

 * :doc:`JAX for ROCm <rocm-install-on-linux:install/3rd-party/jax-install>`

+* :doc:`verl for ROCm <rocm-install-on-linux:install/3rd-party/verl-install>`
+
+* :doc:`Stanford Megatron-LM for ROCm <rocm-install-on-linux:install/3rd-party/stanford-megatron-lm-install>`
+
+* :doc:`DGL for ROCm <rocm-install-on-linux:install/3rd-party/dgl-install>`
+
 Next steps
 ==========

--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry.rst
@@ -73,7 +73,11 @@ document are not validated.

      .. code-block:: shell

-         python3 tools/run_models.py --tags pyt_mpt30b_training --keep-model-dir --live-output --clean-docker-cache
+         madengine run \
+             --tags pyt_mpt30b_training \
+             --keep-model-dir \
+             --live-output \
+             --clean-docker-cache

      .. tip::

@@ -90,7 +94,7 @@ document are not validated.

         For improved performance (training throughput), consider enabling TunableOp.
         By default, ``pyt_mpt30b_training`` runs with TunableOp disabled. To enable it,
-         run ``tools/run_models.py`` with the ``--tunableop on`` argument or edit the
+         use ``madengine run`` with the ``--tunableop on`` argument or edit the
         ``models.json`` configuration before running training.

         Although this might increase the initial training time, it can result in a performance gain.
@@ -172,4 +176,13 @@ Key performance metrics include:

    Overall training loss. A decreasing trend indicates the model is learning effectively.

+Further reading
+===============

+- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
+
+- To learn more about system settings and management practices to configure your system for
+  AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
+
+- For a list of other ready-made Docker images for AI with ROCm, see
+  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
@@ -142,7 +142,11 @@ The following models are pre-optimized for performance on the AMD Instinct MI325
            .. code-block:: shell

               export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
-               python3 tools/run_models.py --tags {{ model.mad_tag }} --keep-model-dir --live-output --timeout 28800
+               madengine run \
+                   --tags {{ model.mad_tag }} \
+                   --keep-model-dir \
+                   --live-output \
+                   --timeout 28800

            MAD launches a Docker container with the name
            ``container_ci-{{ model.mad_tag }}``, for example. The latency and throughput reports of the
@@ -427,6 +431,17 @@ The following models are pre-optimized for performance on the AMD Instinct MI325

               For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.

+Further reading
+===============
+
+- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
+
+- To learn more about system settings and management practices to configure your system for
+  AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
+
+- For a list of other ready-made Docker images for AI with ROCm, see
+  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
+
 Previous versions
 =================

--- a/docs/sphinx/_toc.yml.in
+++ b/docs/sphinx/_toc.yml.in
@@ -19,9 +19,9 @@ subtrees:

 - caption: Install
  entries:
-  - url: https://rocm.docs.amd.com/projects/install-on-linux/en/latest/
+  - url: https://rocm.docs.amd.com/projects/install-on-linux/en/${branch}/
    title: ROCm on Linux
-  - url: https://rocm.docs.amd.com/projects/install-on-windows/en/${branch}/
+  - url: https://rocm.docs.amd.com/projects/install-on-windows/en/latest/
    title: HIP SDK on Windows
  - url: https://rocm.docs.amd.com/projects/radeon/en/latest/index.html
    title: ROCm on Radeon GPUs
@@ -82,6 +82,8 @@ subtrees:
            title: vLLM inference performance testing
          - file: how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference.rst
            title: PyTorch inference performance testing
+          - file: how-to/rocm-for-ai/inference/benchmark-docker/sglang.rst
+            title: SGLang inference performance testing
          - file: how-to/rocm-for-ai/inference/deploy-your-model.rst
            title: Deploy your model
Author	SHA1	Message	Date
David Dixon	bfe3983d90	correct pip args	2025-07-31 08:29:08 -06:00
David Dixon	98f51f2bcf	add python deps for hipblaslt	2025-07-30 17:15:13 -06:00
David Dixon	a38ab1c212	Need both target options while transitioning between build systems	2025-07-30 16:11:37 -06:00
David Dixon	54c86f3a92	add deps install back	2025-07-30 13:34:09 -06:00
Daniel Su	014442b28d	Change to GPU_TARGETS	2025-07-30 10:32:52 -04:00
Daniel Su	f3f1526fee	Add blas and lapack to dnf map	2025-07-30 10:30:00 -04:00
David Dixon	d74d0543e8	Drop lapack install script	2025-07-29 18:47:25 -06:00
Pratik Basyal	f632f2879f	ROCm Software Stack image for 6.4.0 updated (#5112 )	2025-07-28 14:51:19 -04:00
yugang-amd	cc5bc5a882	Add SGLang inference benchmark doc w/ initial support for DeepSeek-R1-Distill-Qwen-32B (#4870 )	2025-07-25 12:42:40 -04:00
Daniel Su	2c9c3d0ba1	[Ex CI] switch hipBLAS/SPARSE pipeline IDs to monorepo (#5098 )	2025-07-24 16:53:29 -04:00
Peter Park	14249f24d8	Use `madengine` instead of tools/run_models.py in docs (#5095 )	2025-07-24 15:38:12 -04:00
Daniel Su	0e8045cca7	[Ex CI] enable hipBLAS monorepo (#5090 )	2025-07-24 12:37:34 -04:00
Daniel Su	541fe92947	[Ex CI] update to 6.4.2 (#5087 )	2025-07-23 14:10:40 -04:00
Daniel Su	628d5f8a19	[Ex CI] create Docker images for nightly builds (#5005 )	2025-07-23 12:16:11 -04:00
Peter Park	984a91f008	Add DeepSeek Janus Pro 7B to PyTorch inference benchmark doc (#5071 ) --------- Co-authored-by: yugang-amd <yugang.wang@amd.com>	2025-07-22 16:26:06 -04:00
amd-hsivasun	ae2cc6ab38	[EX CI] ROCR-Runtime: migrate from rocm-smi to amd-smi (#5088 ) * Update ROCR-Runtime.yml Migrate from rocmsmi to amdsmi * Update ROCR-Runtime.yml Removed libhwloc.so.5 install * Update ROCR-Runtime.yml Link to hwloc.so.5 * Update ROCR-Runtime.yml Added link in the rocrtst step * Update ROCR-Runtime.yml	2025-07-22 14:17:53 -04:00
Peter Park	15ee605d18	Fix branches for install docs in _toc.yml.in (#5083 )	2025-07-22 11:03:40 -04:00
anisha-amd	ae54add299	Sphinx warning for ROCm fixed (#5077 ) (#5082 ) * Sphinx warning for DGL fixed * Update dgl-compatibility.rst removed benchmark line and updated link --------- Co-authored-by: Pratik Basyal <prbasyal@amd.com>	2025-07-22 10:51:15 -04:00
Peter Park	2269e9d25d	Remove broken link to deprecated AMDGPU installer documentation (#5078 ) * remove link to deprecated AMDGPU installation method * add deep learning frameworks	2025-07-21 19:36:20 -04:00