Compare commits

..

26 Commits

Author SHA1 Message Date
Karthika Rayasam
7dc612421c adding aqlprofile 2025-09-12 03:29:38 -07:00
Karthika Rayasam
d72c0c3cc2 changing filename 2025-09-12 00:10:51 -07:00
Karthika Rayasam
f909bf3635 changing revision 2025-09-12 00:04:26 -07:00
Karthika Rayasam
7cbd4b2fc6 Adding individual libraries 2025-09-11 06:47:12 -07:00
Karthika Rayasam
9d63c045a1 deleting rocr-kernel-driver 2025-09-11 05:20:51 -07:00
Karthika Rayasam
d55450070c deleting tensile 2025-09-11 05:16:59 -07:00
Karthika Rayasam
f0dd80e23e updating revision 2025-09-11 05:13:06 -07:00
Karthika Rayasam
eb3cf0ec1c add rocm-42.xml file 2025-09-11 03:31:22 -07:00
Karthika Rayasam
6075acfd79 adding rocm-libraries and removing projects under rocm-libraries 2025-09-11 03:25:44 -07:00
Karthika Rayasam
2b85816b32 Adding rocm-libraries and removing the sub projects under rocm-libraries 2025-09-11 03:21:49 -07:00
amd-hsivasun
964a7cd0b5 fixed component name 2025-09-10 17:31:03 -04:00
amd-hsivasun
d3fe7439cf [Ex CI] enable rocm-smi-lib monorepo 2025-09-10 17:31:03 -04:00
amd-hsivasun
56f566c1dc [Ex CI] update rocminfo pipeline ID to monorepo 2025-09-10 17:24:17 -04:00
Haresh Sivasuntharampillai
88f1493b68 [Ex CI] enable rocminfo monorepo 2025-09-10 16:30:48 -04:00
anisha-amd
3ca9cb1fcc Docs: adding ray and llama.cpp live blog links (#5290) 2025-09-10 15:02:03 -04:00
amd-hsivasun
0840c14b6d [Ex CI] update rocm-core pipeline ID to monorepo 2025-09-10 11:58:15 -04:00
amd-hsivasun
daa0184d2e [Ex CI] enable rocm-core monorepo 2025-09-10 11:47:12 -04:00
Pratik Basyal
3b5019e03f Minor correction (#5285) 2025-09-10 10:53:25 -04:00
Pratik Basyal
68f505e375 Taichi removed (#5283) 2025-09-10 10:07:55 -04:00
Peter Park
05a66f75fe add qwen3 30b a3b to vllm-benchmark-models (#5280) 2025-09-09 17:41:11 -04:00
Ibrahim Wani
3c37ae88f0 Add origami CI pipelines (#5256)
* Add origami yaml pipeline.

* Unindent lines.

* Add cmake dependency step to origami yml.

* Add pybind dep

* Fix pipeline failures.

* Quick fix

* Fix pybind11 dep for almalinux

* Fix pybind11 dep for almalinux again

* Test

* [Ex CI] don't create symlink if more than one sparse checkout dir

* hipBLASLt multi sparse

* Replace pybind with nanobind.

* Quick fix

* Testing nanobind install in pipelines

* Run origami binding tests

* Change build path for tests

* Change build path for tests again

* Add missing dep for CI

* Add archs to buildJobs

* Fix CI error.

* Test

* Test job target

* Adding job target to hipblaslt dependant builds

* Check devices on machine

* Add gpu to pipeline

* Add more gpu targets

* test

* Add test job to origami

* Update test jobs

* Finding test dir

* Fix sparse checkout

* Find build dir

* Try to find build dir

* Clean up

* Test

* Change test dir

* Build origami in test job

* Try removing job.target from params

* Package bindings in build artifacts

* Download build as artifact.

* Comment out block

* Fix checkout in test job

* Test1

* Echo to list dir

* Sparse checkout origami/python

* Download python bindings as artifact

* Try ctest instead of running test files directly

* Only download artifacts for ubuntu

* Add missing cd

* Run individual tests not ctest.

* Fix hipblaslt build failures

* Resolve more ci failures in hipblaslt

* Add old changes back in

* Fix hipblaslt ci errors

* Clean up

* Add nanobind to array

* Add nanobind to array correctly

* Remove nanobind install script

* Quick fix

* Add pip module installs to test job

---------

Co-authored-by: Daniel Su <danielsu@amd.com>
2025-09-09 15:13:54 -06:00
amd-hsivasun
985786e98d Add sqlalchemy to dependencies in rocprofiler-compute 2025-09-09 15:27:56 -04:00
amd-hsivasun
f25e27acf0 Update roctracer pipeline ID and branch 2025-09-09 14:13:56 -04:00
anisha-amd
db43d18c37 Docs: frameworks compatibility- ray and llama.cpp (#5273) 2025-09-09 11:02:30 -04:00
Peter Park
4f53183696 docs: Add JAX MaxText benchmark v25.7 (#5182)
* Update previous versions

* Add data file

* fix filename and anchors

* add templates

* update .wordlist.txt

* Update template and data

add missing step

fix fmt

* update template

* fix data

* add jax 0.6.0

* update history

* update quantized training note
2025-09-08 21:42:56 -04:00
Joseph Macaranas
94476f34ca [External CI] Add amdgpu deps to rocpydecode pipeline (#5267) 2025-09-08 11:32:10 -04:00
25 changed files with 1580 additions and 350 deletions

View File

@@ -178,7 +178,7 @@ jobs:
mkdir -p $(Agent.BuildDirectory)/temp-deps
cd $(Agent.BuildDirectory)/temp-deps
# position-independent LAPACK is required for almalinux8 builds
cmake -DBUILD_GTEST=OFF -DBUILD_LAPACK=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON $(Agent.BuildDirectory)/s/deps
cmake -DBUILD_GTEST=OFF -DBUILD_LAPACK=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON $(Agent.BuildDirectory)/sparse/projects/hipblaslt/deps
make -j
sudo make install
- script: |
@@ -197,6 +197,8 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
cmakeSourceDir: $(Agent.BuildDirectory)/sparse/projects/hipblaslt
cmakeBuildDir: $(Agent.BuildDirectory)/sparse/projects/hipblaslt/build
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor
-DCMAKE_INCLUDE_PATH=$(Agent.BuildDirectory)/rocm/llvm/include

View File

@@ -0,0 +1,236 @@
parameters:
- name: componentName
type: string
default: origami
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
# monorepo related parameters
- name: sparseCheckoutDir
type: string
default: ''
- name: triggerDownstreamJobs
type: boolean
default: false
- name: downstreamAggregateNames
type: string
default: ''
- name: buildDependsOn
type: object
default: null
- name: unifiedBuild
type: boolean
default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
type: boolean
default: false
- name: aptPackages
type: object
default:
- cmake
- git
- ninja-build
- wget
- python3
- python3-dev
- python3-pip
- name: pipModules
type: object
default:
- nanobind>=2.0.0
- name: rocmDependencies
type: object
default:
- clr
- llvm-project
- rocm-cmake
- rocminfo
- ROCR-Runtime
- rocprofiler-register
- name: rocmTestDependencies
type: object
default:
- clr
- llvm-project
- rocm-cmake
- rocminfo
- ROCR-Runtime
- rocprofiler-register
- name: jobMatrix
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt }
- { os: almalinux8, packageManager: dnf }
testJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- name: downstreamComponentMatrix
type: object
default:
- hipBLASLt:
name: hipBLASLt
sparseCheckoutDir: projects/hipblaslt
skipUnifiedBuild: 'false'
buildDependsOn:
- origami_build
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: origami_build_${{ job.os }}
${{ if parameters.buildDependsOn }}:
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: ROCM_PATH
value: $(Agent.BuildDirectory)/rocm
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
os: ${{ job.os }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DORIGAMI_BUILD_SHARED_LIBS=ON
-DORIGAMI_ENABLE_PYTHON=ON
-DORIGAMI_BUILD_TESTING=ON
-GNinja
- ${{ if ne(job.os, 'almalinux8') }}:
- task: PublishPipelineArtifact@1
displayName: 'Publish Build Directory Artifact'
inputs:
targetPath: '$(Agent.BuildDirectory)/s/build'
artifact: '${{ parameters.componentName }}_${{ job.os }}_build_dir'
publishLocation: 'pipeline'
- task: PublishPipelineArtifact@1
displayName: 'Publish Python Source Artifact'
inputs:
targetPath: '$(Agent.BuildDirectory)/s/python'
artifact: '${{ parameters.componentName }}_${{ job.os }}_python_src'
publishLocation: 'pipeline'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
os: ${{ job.os }}
componentName: ${{ parameters.componentName }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- ${{ if eq(parameters.unifiedBuild, False) }}:
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: origami_test_${{ job.os }}_${{ job.target }}
timeoutInMinutes: 120
dependsOn: origami_build_${{ job.os }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
preTargetFilter: ${{ parameters.componentName }}
os: ${{ job.os }}
- task: DownloadPipelineArtifact@2
displayName: 'Download Build Directory Artifact'
inputs:
artifact: '${{ parameters.componentName }}_${{ job.os }}_build_dir'
path: '$(Agent.BuildDirectory)/s/build'
- task: DownloadPipelineArtifact@2
displayName: 'Download Python Source Artifact'
inputs:
artifact: '${{ parameters.componentName }}_${{ job.os }}_python_src'
path: '$(Agent.BuildDirectory)/s/python'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- script: |
export PYTHONPATH=$(Agent.BuildDirectory)/s/build/python:$PYTHONPATH
echo "--- Running origami_test.py ---"
python3 $(Agent.BuildDirectory)/s/python/origami_test.py
echo "--- Running origami_grid_test.py ---"
python3 $(Agent.BuildDirectory)/s/python/origami_grid_test.py
displayName: 'Run Python Binding Tests'
condition: succeeded()
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
environment: test
gpuTarget: ${{ job.target }}
- ${{ if parameters.triggerDownstreamJobs }}:
- ${{ each component in parameters.downstreamComponentMatrix }}:
- ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
- template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
buildDependsOn: ${{ component.buildDependsOn }}
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
triggerDownstreamJobs: true
unifiedBuild: ${{ parameters.unifiedBuild }}

View File

@@ -8,6 +8,9 @@ parameters:
- name: checkoutRef
type: string
default: ''
- name: rocPyDecodeRepo
type: string
default: rocpydecode_repo
# monorepo related parameters
- name: sparseCheckoutDir
type: string
@@ -207,7 +210,7 @@ jobs:
- ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
- template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
checkoutRepo: ${{ parameters.rocPyDecodeRepo }}
sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
buildDependsOn: ${{ component.buildDependsOn }}
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}

View File

@@ -1,10 +1,29 @@
parameters:
- name: componentName
type: string
default: rocm-core
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
# monorepo related parameters
- name: sparseCheckoutDir
type: string
default: ''
- name: triggerDownstreamJobs
type: boolean
default: false
- name: downstreamAggregateNames
type: string
default: ''
- name: buildDependsOn
type: object
default: null
- name: unifiedBuild
type: boolean
default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -27,6 +46,10 @@ parameters:
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: rocm_core_${{ job.os }}
${{ if parameters.buildDependsOn }}:
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}
pool:
${{ if eq(job.os, 'ubuntu2404') }}:
vmImage: 'ubuntu-24.04'
@@ -50,8 +73,10 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
useAmdclang: false
extraBuildFlags: >-
@@ -65,9 +90,12 @@ jobs:
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml

View File

@@ -1,10 +1,29 @@
parameters:
- name: componentName
type: string
default: rocm-smi-lib
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
# monorepo related parameters
- name: sparseCheckoutDir
type: string
default: ''
- name: triggerDownstreamJobs
type: boolean
default: false
- name: downstreamAggregateNames
type: string
default: ''
- name: buildDependsOn
type: object
default: null
- name: unifiedBuild
type: boolean
default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -32,6 +51,10 @@ parameters:
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: rocm_smi_lib_build_${{ job.os }}
${{ if parameters.buildDependsOn }}:
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}
pool:
${{ if eq(job.os, 'ubuntu2404') }}:
vmImage: 'ubuntu-24.04'
@@ -55,8 +78,10 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
useAmdclang: false
extraBuildFlags: >-
@@ -65,51 +90,56 @@ jobs:
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: rocm_smi_lib_test_${{ job.os }}_${{ job.target }}
dependsOn: rocm_smi_lib_build_${{ job.os }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
parameters:
runRocminfo: false
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: rocm_smi_lib
testDir: '$(Agent.BuildDirectory)'
testExecutable: 'sudo ./rocm/share/rocm_smi/rsmitst_tests/rsmitst'
testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
environment: test
gpuTarget: ${{ job.target }}
- ${{ if eq(parameters.unifiedBuild, False) }}:
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: rocm_smi_lib_test_${{ job.os }}_${{ job.target }}
dependsOn: rocm_smi_lib_build_${{ job.os }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- checkout: none
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
parameters:
runRocminfo: false
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: ${{ parameters.componentName }}
testDir: '$(Agent.BuildDirectory)'
testExecutable: 'sudo ./rocm/share/rocm_smi/rsmitst_tests/rsmitst'
testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
environment: test
gpuTarget: ${{ job.target }}

View File

@@ -1,10 +1,29 @@
parameters:
- name: componentName
type: string
default: rocminfo
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
# monorepo related parameters
- name: sparseCheckoutDir
type: string
default: ''
- name: triggerDownstreamJobs
type: boolean
default: false
- name: downstreamAggregateNames
type: string
default: ''
- name: buildDependsOn
type: object
default: null
- name: unifiedBuild
type: boolean
default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -40,7 +59,11 @@ parameters:
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: rocminfo_build_${{ job.os }}
- job: ${{ parameters.componentName }}_build_${{ job.os }}
${{ if parameters.buildDependsOn }}:
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}
pool:
vmImage: 'ubuntu-22.04'
${{ if eq(job.os, 'almalinux8') }}:
@@ -62,14 +85,18 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
os: ${{ job.os }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
useAmdclang: false
extraBuildFlags: >-
@@ -78,65 +105,71 @@ jobs:
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: rocminfo_test_${{ job.target }}
dependsOn: rocminfo_build_${{ job.os }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
registerROCmPackages: true
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
parameters:
runRocminfo: false
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: rocminfo
testDir: '$(Agent.BuildDirectory)'
testExecutable: './rocm/bin/rocminfo'
testParameters: ''
testPublishResults: false
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: rocm_agent_enumerator
testDir: '$(Agent.BuildDirectory)'
testExecutable: './rocm/bin/rocm_agent_enumerator'
testParameters: ''
testPublishResults: false
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
registerROCmPackages: true
environment: test
gpuTarget: ${{ job.target }}
- ${{ if eq(parameters.unifiedBuild, False) }}:
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: rocminfo_test_${{ job.target }}
dependsOn: rocminfo_build_${{ job.os }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
registerROCmPackages: true
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
parameters:
runRocminfo: false
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: ${{ parameters.componentName }}
testDir: '$(Agent.BuildDirectory)'
testExecutable: './rocm/bin/rocminfo'
testParameters: ''
testPublishResults: false
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: rocm_agent_enumerator
testDir: '$(Agent.BuildDirectory)'
testExecutable: './rocm/bin/rocm_agent_enumerator'
testParameters: ''
testPublishResults: false
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
registerROCmPackages: true
environment: test
gpuTarget: ${{ job.target }}

View File

@@ -55,6 +55,7 @@ parameters:
- pymongo
- pyyaml
- setuptools
- sqlalchemy
- tabulate
- textual
- textual_plotext

View File

@@ -171,16 +171,16 @@ parameters:
developBranch: develop
hasGpuTarget: false
rocm-core:
pipelineId: 103
developBranch: master
pipelineId: 349
developBranch: develop
hasGpuTarget: false
rocm-examples:
pipelineId: 216
developBranch: amd-staging
hasGpuTarget: true
rocminfo:
pipelineId: 91
developBranch: amd-staging
pipelineId: 356
developBranch: develop
hasGpuTarget: false
rocMLIR:
pipelineId: 229
@@ -251,8 +251,8 @@ parameters:
developBranch: develop
hasGpuTarget: true
roctracer:
pipelineId: 141
developBranch: amd-staging
pipelineId: 331
developBranch: develop
hasGpuTarget: true
rocWMMA:
pipelineId: 109

View File

@@ -156,6 +156,7 @@ GEMMs
GFLOPS
GFortran
GFXIP
GGUF
Gemma
GiB
GIM
@@ -293,6 +294,7 @@ Multicore
Multithreaded
MyEnvironment
MyST
NANOO
NBIO
NBIOs
NCCL
@@ -500,6 +502,7 @@ Unhandled
VALU
VBIOS
VCN
verl's
VGPR
VGPRs
VM
@@ -742,6 +745,7 @@ logits
lossy
macOS
matchers
maxtext
megatron
microarchitecture
migraphx

View File

@@ -57,9 +57,8 @@ ROCm documentation continues to be updated to provide clearer and more comprehen
For more information about the changes, see [Changelog for the AI Developer Hub](https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/changelog.html).
* ROCm provides a comprehensive ecosystem for deep learning development. For more details, see [Deep learning frameworks for ROCm](https://rocm.docs.amd.com/en/docs-6.4.3/how-to/deep-learning-rocm.html). AMD ROCm adds support for the following deep learning frameworks:
* ROCm provides a comprehensive ecosystem for deep learning development. For more details, see [Deep learning frameworks for ROCm](https://rocm.docs.amd.com/en/docs-6.4.3/how-to/deep-learning-rocm.html). AMD ROCm adds support for the following deep learning framework:
* Taichi is an open-source, imperative, and parallel programming language designed for high-performance numerical computation. Embedded in Python, it leverages just-in-time (JIT) compilation frameworks such as LLVM to accelerate compute-intensive Python code by compiling it to native GPU or CPU instructions. It is currently supported on ROCm 6.3.2. For more information, see [Taichi compatibility](https://rocm.docs.amd.com/en/docs-6.4.3/compatibility/ml-compatibility/taichi-compatibility.html).
* Megablocks is a light-weight library for mixture-of-experts (MoE) training. The core of the system is efficient "dropless-MoE" and standard MoE layers. Megablocks is integrated with Megatron-LM, where data and pipeline parallel training of MoEs is supported. It is currently supported on ROCm 6.3.0. For more information, see [Megablocks compatibility](https://rocm.docs.amd.com/en/docs-6.4.3/compatibility/ml-compatibility/megablocks-compatibility.html).
* The [Data types and precision support](https://rocm.docs.amd.com/en/latest/reference/precision-support.html) topic now includes new hardware and library support information.

View File

@@ -1,12 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?>
<manifest>
<remote name="rocm-org" fetch="https://github.com/ROCm/" />
<default revision="refs/tags/rocm-6.4.3"
<default revision="refs/tags/20250912-42"
remote="rocm-org"
sync-c="true"
sync-j="4" />
<!--list of projects for ROCm-->
<project name="ROCK-Kernel-Driver" />
<project name="aqlprofile" />
<project name="ROCR-Runtime" />
<project name="amdsmi" />
<project name="rdc" />
@@ -37,36 +37,37 @@
<project name="rocr_debug_agent" />
<!-- ROCm Libraries -->
<project groups="mathlibs" name="AMDMIGraphX" />
<project groups="mathlibs" name="MIOpen" />
<project groups="mathlibs" name="MIVisionX" />
<project groups="mathlibs" name="ROCmValidationSuite" />
<project groups="mathlibs" name="Tensile" />
<project groups="mathlibs" name="composable_kernel" />
<project groups="mathlibs" name="hipBLAS-common" />
<project groups="mathlibs" name="hipBLAS" />
<project groups="mathlibs" name="hipBLASLt" />
<project groups="mathlibs" name="hipCUB" />
<project groups="mathlibs" name="hipFFT" />
<project groups="mathlibs" name="hipRAND" />
<project groups="mathlibs" name="hipSOLVER" />
<project groups="mathlibs" name="hipSPARSE" />
<project groups="mathlibs" name="hipSPARSELt" />
<project groups="mathlibs" name="hipTensor" />
<project groups="mathlibs" name="hipfort" />
<project groups="mathlibs" name="rccl" />
<project groups="mathlibs" name="rocAL" />
<project groups="mathlibs" name="rocALUTION" />
<project groups="mathlibs" name="rocBLAS" />
<project groups="mathlibs" name="rocDecode" />
<project groups="mathlibs" name="rocJPEG" />
<project groups="mathlibs" name="rocm-libraries">
<linkfile src="projects/hipcub" dest="hipCUB"/>
<linkfile src="projects/rocprim" dest="rocPRIM"/>
<linkfile src="projects/hiprand" dest="hipRAND"/>
<linkfile src="projects/rocrand" dest="rocRAND"/>
<linkfile src="projects/rocthrust" dest="rocThrust"/>
<linkfile src="projects/hipblas-common" dest="hipBLAS-common"/>
<linkfile src="projects/hipblaslt" dest="hipBLASLt"/>
<linkfile src="projects/rocblas" dest="rocBLAS"/>
<linkfile src="projects/hipsparselt" dest="hipSPARSELt"/>
<linkfile src="projects/rocsparse" dest="rocSPARSE"/>
<linkfile src="projects/hipsparse" dest="hipSPARSE"/>
<linkfile src="projects/hipblas" dest="hipBLAS"/>
<linkfile src="projects/miopen" dest="MIOpen"/>
<linkfile src="projects/hipfft" dest="hipFFT"/>
<linkfile src="projects/rocfft" dest="rocFFT"/>
</project>
<project groups="mathlibs" name="rocPyDecode" />
<project groups="mathlibs" name="rocFFT" />
<project groups="mathlibs" name="rocPRIM" />
<project groups="mathlibs" name="rocRAND" />
<project groups="mathlibs" name="rocSHMEM" />
<project groups="mathlibs" name="rocSOLVER" />
<project groups="mathlibs" name="rocSPARSE" />
<project groups="mathlibs" name="rocThrust" />
<project groups="mathlibs" name="rocWMMA" />
<project groups="mathlibs" name="rocm-cmake" />
<project groups="mathlibs" name="rpp" />

View File

@@ -35,6 +35,8 @@ ROCm Version,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6
:doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat]_,N/A,N/A,N/A,2.4.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,
:doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>` [#megablocks_compat]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.7.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`Taichi <../compatibility/ml-compatibility/taichi-compatibility>` [#taichi_compat]_,N/A,N/A,N/A,N/A,N/A,1.8.0b1,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`Ray <../compatibility/ml-compatibility/ray-compatibility>` [#ray_compat]_,N/A,N/A,2.48.0.post0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`llama.cpp <../compatibility/ml-compatibility/llama-cpp-compatibility>` [#llama-cpp_compat]_,N/A,N/A,N/A,b5997,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
`ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.2,1.2,1.2,1.2,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.14.1,1.14.1
,,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,,,
1 ROCm Version 6.4.3 6.4.2 6.4.1 6.4.0 6.3.3 6.3.2 6.3.1 6.3.0 6.2.4 6.2.2 6.2.1 6.2.0 6.1.5 6.1.2 6.1.1 6.1.0 6.0.2 6.0.0
35 :doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat]_ N/A N/A N/A 2.4.0 N/A N/A N/A N/A N/A N/A N/A N/A N/A N/A N/A N/A N/A
36 :doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>` [#megablocks_compat]_ N/A N/A N/A N/A N/A N/A N/A 0.7.0 N/A N/A N/A N/A N/A N/A N/A N/A N/A
37 :doc:`Taichi <../compatibility/ml-compatibility/taichi-compatibility>` [#taichi_compat]_ N/A N/A N/A N/A N/A 1.8.0b1 N/A N/A N/A N/A N/A N/A N/A N/A N/A N/A N/A
38 :doc:`Ray <../compatibility/ml-compatibility/ray-compatibility>` [#ray_compat]_ N/A N/A 2.48.0.post0 N/A N/A N/A N/A N/A N/A N/A N/A N/A N/A N/A N/A N/A N/A
39 :doc:`llama.cpp <../compatibility/ml-compatibility/llama-cpp-compatibility>` [#llama-cpp_compat]_ N/A N/A N/A b5997 N/A N/A N/A N/A N/A N/A N/A N/A N/A N/A N/A N/A N/A
40 `ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_ 1.2 1.2 1.2 1.2 1.17.3 1.17.3 1.17.3 1.17.3 1.17.3 1.17.3 1.17.3 1.17.3 1.17.3 1.17.3 1.17.3 1.17.3 1.14.1 1.14.1
41
42

View File

@@ -246,6 +246,8 @@ Expand for full historical view of:
.. [#dgl_compat] DGL is only supported on ROCm 6.4.0.
.. [#megablocks_compat] Megablocks is only supported on ROCm 6.3.0.
.. [#taichi_compat] Taichi is only supported on ROCm 6.3.2.
.. [#ray_compat] Ray is only supported on ROCm 6.4.1.
.. [#llama-cpp_compat] llama.cpp is only supported on ROCm 6.4.0.
.. [#kfd_support-past-60] As of ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The tested user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and kernel-space support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
.. [#ROCT-rocr-past-60] Starting from ROCm 6.3.0, the ROCT Thunk Interface is included as part of the ROCr runtime package.

View File

@@ -0,0 +1,156 @@
:orphan:
.. meta::
:description: llama.cpp deep learning framework compatibility
:keywords: GPU, GGML, llama.cpp compatibility
.. version-set:: rocm_version latest
********************************************************************************
llama.cpp compatibility
********************************************************************************
`llama.cpp <https://github.com/ggml-org/llama.cpp>`__ is an open-source framework
for Large Language Model (LLM) inference that runs on both central processing units
(CPUs) and graphics processing units (GPUs). It is written in plain C/C++, providing
a simple, dependency-free setup.
The framework supports multiple quantization options, from 1.5-bit to 8-bit integers,
to speed up inference and reduce memory usage. Originally built as a CPU-first library,
llama.cpp is easy to integrate with other programming environments and is widely
adopted across diverse platforms, including consumer devices.
ROCm support for llama.cpp is upstreamed, and you can build the official source code
with ROCm support:
- ROCm support for llama.cpp is hosted in the official `https://github.com/ROCm/llama.cpp
<https://github.com/ROCm/llama.cpp>`_ repository.
- Due to independent compatibility considerations, this location differs from the
`https://github.com/ggml-org/llama.cpp <https://github.com/ggml-org/llama.cpp>`_ upstream repository.
- To install llama.cpp, use the prebuilt :ref:`Docker image <llama-cpp-docker-compat>`,
which includes ROCm, llama.cpp, and all required dependencies.
- See the :doc:`ROCm llama.cpp installation guide <rocm-install-on-linux:install/3rd-party/llama-cpp-install>`
to install and get started.
- See the `Installation guide <https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md#hip>`__
in the upstream llama.cpp documentation.
.. note::
llama.cpp is supported on ROCm 6.4.0.
Supported devices
================================================================================
**Officially Supported**: AMD Instinct™ MI300X, MI210
Use cases and recommendations
================================================================================
llama.cpp can be applied in a variety of scenarios, particularly when you need to meet one or more of the following requirements:
- Plain C/C++ implementation with no external dependencies
- Support for 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory usage
- Custom HIP (Heterogeneous-compute Interface for Portability) kernels for running large language models (LLMs) on AMD GPUs (graphics processing units)
- CPU (central processing unit) + GPU (graphics processing unit) hybrid inference for partially accelerating models larger than the total available VRAM (video random-access memory)
llama.cpp is also used in a range of real-world applications, including:
- Games such as `Lucy's Labyrinth <https://github.com/MorganRO8/Lucys_Labyrinth>`__:
A simple maze game where AI-controlled agents attempt to trick the player.
- Tools such as `Styled Lines <https://marketplace.unity.com/packages/tools/ai-ml-integration/style-text-webgl-ios-stand-alone-llm-llama-cpp-wrapper-292902>`__:
A proprietary, asynchronous inference wrapper for Unity3D game development, including pre-built mobile and web platform wrappers and a model example.
- Various other AI applications use llama.cpp as their inference engine;
for a detailed list, see the `user interfaces (UIs) section <https://github.com/ggml-org/llama.cpp?tab=readme-ov-file#description>`__.
For more use cases and recommendations, refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__,
where you can search for llama.cpp examples and best practices to optimize your workloads on AMD GPUs.
- The `Llama.cpp Meets Instinct: A New Era of Open-Source AI Acceleration <https://rocm.blogs.amd.com/ecosystems-and-partners/llama-cpp/README.html>`__,
blog post outlines how the open-source llama.cpp framework enables efficient LLM inference—including interactive inference with ``llama-cli``,
server deployment with ``llama-server``, GGUF model preparation and quantization, performance benchmarking, and optimizations tailored for
AMD Instinct GPUs within the ROCm ecosystem.
.. _llama-cpp-docker-compat:
Docker image compatibility
================================================================================
.. |docker-icon| raw:: html
<i class="fab fa-docker"></i>
AMD validates and publishes `ROCm llama.cpp Docker images <https://hub.docker.com/r/rocm/llama.cpp>`__
with ROCm backends on Docker Hub. The following Docker image tags and associated
inventories were tested on `ROCm 6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__.
Click |docker-icon| to view the image on Docker Hub.
.. important::
Tag endings of ``_full``, ``_server``, and ``_light`` serve different purposes for entrypoints as follows:
- Full: This image includes both the main executable file and the tools to convert ``LLaMA`` models into ``ggml`` and convert into 4-bit quantization.
- Server: This image only includes the server executable file.
- Light: This image only includes the main executable file.
.. list-table::
:header-rows: 1
:class: docker-image-compatibility
* - Full Docker
- Server Docker
- Light Docker
- llama.cpp
- Ubuntu
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b5997_rocm6.4.0_ubuntu24.04_full/images/sha256-f78f6c81ab2f8e957469415fe2370a1334fe969c381d1fe46050c85effaee9d5"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
- .. raw:: html
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b5997_rocm6.4.0_ubuntu24.04_server/images/sha256-275ad9e18f292c26a00a2de840c37917e98737a88a3520bdc35fd3fc5c9a6a9b"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
- .. raw:: html
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b5997_rocm6.4.0_ubuntu24.04_light/images/sha256-cc324e6faeedf0e400011f07b49d2dc41a16bae257b2b7befa0f4e2e97231320"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
- `b5997 <https://github.com/ROCm/llama.cpp/tree/release/b5997>`__
- 24.04
Key ROCm libraries for llama.cpp
================================================================================
llama.cpp functionality on ROCm is determined by its underlying library
dependencies. These ROCm components affect the capabilities, performance, and
feature set available to developers.
.. list-table::
:header-rows: 1
* - ROCm library
- Version
- Purpose
- Usage
* - `hipBLAS <https://github.com/ROCm/hipBLAS>`__
- :version-ref:`hipBLAS rocm_version`
- Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
matrix and vector operations.
- Supports operations such as matrix multiplication, matrix-vector
products, and tensor contractions. Utilized in both dense and batched
linear algebra operations.
* - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
- :version-ref:`hipBLASLt rocm_version`
- hipBLASLt is an extension of the hipBLAS library, providing additional
features like epilogues fused into the matrix multiplication kernel or
use of integer tensor cores.
- By setting the flag ``ROCBLAS_USE_HIPBLASLT``, you can dispatch hipblasLt
kernels where possible.
* - `rocWMMA <https://github.com/ROCm/rocWMMA>`__
- :version-ref:`rocWMMA rocm_version`
- Accelerates warp-level matrix-multiply and matrix-accumulate to speed up matrix
multiplication (GEMM) and accumulation operations with mixed precision
support.
- Can be used to enhance the flash attention performance on AMD compute, by enabling
the flag during compile time.

View File

@@ -0,0 +1,111 @@
:orphan:
.. meta::
:description: Ray deep learning framework compatibility
:keywords: GPU, Ray compatibility
.. version-set:: rocm_version latest
*******************************************************************************
Ray compatibility
*******************************************************************************
Ray is a unified framework for scaling AI and Python applications from your laptop
to a full cluster, without changing your code. Ray consists of `a core distributed
runtime <https://docs.ray.io/en/latest/ray-core/walkthrough.html>`_ and a set of
`AI libraries <https://docs.ray.io/en/latest/ray-air/getting-started.html>`_ for
simplifying machine learning computations.
Ray is a general-purpose framework that runs many types of workloads efficiently.
Any Python application can be scaled with Ray, without extra infrastructure.
ROCm support for Ray is upstreamed, and you can build the official source code
with ROCm support:
- ROCm support for Ray is hosted in the official `https://github.com/ROCm/ray
<https://github.com/ROCm/ray>`_ repository.
- Due to independent compatibility considerations, this location differs from the
`https://github.com/ray-project/ray <https://github.com/ray-project/ray>`_ upstream repository.
- To install Ray, use the prebuilt :ref:`Docker image <ray-docker-compat>`
which includes ROCm, Ray, and all required dependencies.
- See the :doc:`ROCm Ray installation guide <rocm-install-on-linux:install/3rd-party/ray-install>`
for instructions to get started.
- See the `Installation section <https://docs.ray.io/en/latest/ray-overview/installation.html>`_
in the upstream Ray documentation.
- The Docker image provided is based on the upstream Ray `Daily Release (Nightly) wheels <https://docs.ray.io/en/latest/ray-overview/installation.html#daily-releases-nightlies>`__
corresponding to commit `005c372 <https://github.com/ray-project/ray/commit/005c372262e050d5745f475e22e64305fa07f8b8>`__.
.. note::
Ray is supported on ROCm 6.4.1.
Supported devices
================================================================================
**Officially Supported**: AMD Instinct™ MI300X, MI210
Use cases and recommendations
================================================================================
* The `Reinforcement Learning from Human Feedback on AMD GPUs with verl and ROCm
Integration <https://rocm.blogs.amd.com/artificial-intelligence/verl-large-scale/README.html>`__
blog provides an overview of Volcano Engine Reinforcement Learning (verl)
for large language models (LLMs) and discusses its benefits in large-scale
reinforcement learning from human feedback (RLHF). It uses Ray as part of a
hybrid orchestration engine to schedule and coordinate training and inference
tasks in parallel, enabling optimized resource utilization and potential overlap
between these phases. This dynamic resource allocation strategy significantly
improves overall system efficiency. The blog presents verls performance results,
focusing on throughput and convergence accuracy achieved on AMD Instinct™ MI300X
GPUs. Follow this guide to get started with verl on AMD Instinct GPUs and
accelerate your RLHF training with ROCm-optimized performance.
* The `Exploring Use Cases for Scalable AI: Implementing Ray with ROCm Support for Efficient ML Workflows
<https://rocm.blogs.amd.com/artificial-intelligence/rocm-ray/README.html>`__
blog post describes key use cases such as training and inference for large language models (LLMs),
model serving, hyperparameter tuning, reinforcement learning, and the orchestration of large-scale
workloads using Ray in the ROCm environment.
For more use cases and recommendations, see the AMD GPU tabs in the `Accelerator Support
topic <https://docs.ray.io/en/latest/ray-core/scheduling/accelerators.html#accelerator-support>`__
of the Ray core documentation and refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__,
where you can search for Ray examples and best practices to optimize your workloads on AMD GPUs.
.. _ray-docker-compat:
Docker image compatibility
================================================================================
.. |docker-icon| raw:: html
<i class="fab fa-docker"></i>
AMD validates and publishes ready-made `ROCm Ray Docker images <https://hub.docker.com/r/rocm/ray/tags>`__
with ROCm backends on Docker Hub. The following Docker image tags and
associated inventories represent the latest Ray version from the official Docker Hub and are validated for
`ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`_. Click the |docker-icon|
icon to view the image on Docker Hub.
.. list-table::
:header-rows: 1
:class: docker-image-compatibility
* - Docker image
- Ray
- Pytorch
- Ubuntu
- Python
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/ray/ray-2.48.0.post0_rocm6.4.1_ubuntu24.04_py3.12_pytorch2.6.0/images/sha256-0d166fe6bdced38338c78eedfb96eff92655fb797da3478a62dd636365133cc0"><i class="fab fa-docker fa-lg"></i> rocm/ray</a>
- `2.48.0.post0 <https://github.com/ROCm/ray/tree/release/2.48.0.post0>`_
- 2.6.0+git684f6f2
- 24.04
- `3.12.10 <https://www.python.org/downloads/release/python-31210/>`_

View File

@@ -108,6 +108,8 @@ article_pages = [
{"file": "compatibility/ml-compatibility/dgl-compatibility", "os": ["linux"]},
{"file": "compatibility/ml-compatibility/megablocks-compatibility", "os": ["linux"]},
{"file": "compatibility/ml-compatibility/taichi-compatibility", "os": ["linux"]},
{"file": "compatibility/ml-compatibility/ray-compatibility", "os": ["linux"]},
{"file": "compatibility/ml-compatibility/llama-cpp-compatibility", "os": ["linux"]},
{"file": "how-to/deep-learning-rocm", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/index", "os": ["linux"]},

View File

@@ -78,7 +78,11 @@ vllm_benchmark:
model_repo: Qwen/QwQ-32B
url: https://huggingface.co/Qwen/QwQ-32B
precision: float16
tunableop: true
- model: Qwen3 30B A3B
mad_tag: pyt_vllm_qwen3-30b-a3b
model_repo: Qwen/Qwen3-30B-A3B
url: https://huggingface.co/Qwen/Qwen3-30B-A3B
precision: float16
- group: Microsoft Phi
tag: phi
models:

View File

@@ -0,0 +1,72 @@
dockers:
- pull_tag: rocm/jax-training:maxtext-v25.7
docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025
components:
ROCm: 6.4.1
JAX: 0.5.0
Python: 3.10.12
Transformer Engine: 2.1.0+90d703dd
hipBLASLt: 1.x.x
- pull_tag: rocm/jax-training:maxtext-v25.7-jax060
docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025
components:
ROCm: 6.4.1
JAX: 0.6.0
Python: 3.10.12
Transformer Engine: 2.1.0+90d703dd
hipBLASLt: 1.1.0-499ece1c21
model_groups:
- group: Meta Llama
tag: llama
models:
- model: Llama 3.3 70B
mad_tag: jax_maxtext_train_llama-3.3-70b
model_repo: Llama-3.3-70B
precision: bf16
doc_options: ["single-node"]
- model: Llama 3.1 8B
mad_tag: jax_maxtext_train_llama-3.1-8b
model_repo: Llama-3.1-8B
precision: bf16
doc_options: ["single-node"]
- model: Llama 3.1 70B
mad_tag: jax_maxtext_train_llama-3.1-70b
model_repo: Llama-3.1-70B
precision: bf16
doc_options: ["single-node"]
- model: Llama 3 8B
mad_tag: jax_maxtext_train_llama-3-8b
multinode_training_script: llama3_8b_multinode.sh
doc_options: ["multi-node"]
- model: Llama 3 70B
mad_tag: jax_maxtext_train_llama-3-70b
multinode_training_script: llama3_70b_multinode.sh
doc_options: ["multi-node"]
- model: Llama 2 7B
mad_tag: jax_maxtext_train_llama-2-7b
model_repo: Llama-2-7B
precision: bf16
multinode_training_script: llama2_7b_multinode.sh
doc_options: ["single-node", "multi-node"]
- model: Llama 2 70B
mad_tag: jax_maxtext_train_llama-2-70b
model_repo: Llama-2-70B
precision: bf16
multinode_training_script: llama2_70b_multinode.sh
doc_options: ["single-node", "multi-node"]
- group: DeepSeek
tag: deepseek
models:
- model: DeepSeek-V2-Lite (16B)
mad_tag: jax_maxtext_train_deepseek-v2-lite-16b
model_repo: DeepSeek-V2-lite
precision: bf16
doc_options: ["single-node"]
- group: Mistral AI
tag: mistral
models:
- model: Mixtral 8x7B
mad_tag: jax_maxtext_train_mixtral-8x7b
model_repo: Mixtral-8x7B
precision: bf16
doc_options: ["single-node"]

View File

@@ -110,6 +110,28 @@ The table below summarizes information about ROCm-enabled deep learning framewor
<a href="https://github.com/ROCm/taichi"><i class="fab fa-github fa-lg"></i></a>
* - `Ray <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/ray-compatibility.html>`__
- .. raw:: html
<a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/ray-install.html"><i class="fas fa-link fa-lg"></i></a>
-
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/ray-install.html#using-a-prebuilt-docker-image-with-ray-pre-installed>`__
- `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/ray-install.html#install-ray-on-bare-metal-or-a-custom-container>`__
- `ROCm Base Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/ray-install.html#build-your-own-docker-image>`__
- .. raw:: html
<a href="https://github.com/ROCm/ray"><i class="fab fa-github fa-lg"></i></a>
* - `llama.cpp <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/llama-cpp-compatibility.html>`__
- .. raw:: html
<a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/llama-cpp-install.html"><i class="fas fa-link fa-lg"></i></a>
-
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/llama-cpp-install.html#use-a-prebuilt-docker-image-with-llama-cpp-pre-installed>`__
- .. raw:: html
<a href="https://github.com/ROCm/llama.cpp"><i class="fab fa-github fa-lg"></i></a>
Learn how to use your ROCm deep learning environment for training, fine-tuning, inference, and performance optimization
through the following guides.

View File

@@ -2,9 +2,9 @@
:description: How to train a model using JAX MaxText for ROCm.
:keywords: ROCm, AI, LLM, train, jax, torch, Llama, flux, tutorial, docker
**************************************
Training a model with MaxText for ROCm
**************************************
******************************************
Training a model with JAX MaxText for ROCm
******************************************
MaxText is a high-performance, open-source framework built on the Google JAX
machine learning library to train LLMs at scale. The MaxText framework for
@@ -12,70 +12,108 @@ ROCm is an optimized fork of the upstream
`<https://github.com/AI-Hypercomputer/maxtext>`__ enabling efficient AI workloads
on AMD MI300X series accelerators.
The MaxText for ROCm training Docker (``rocm/jax-training:maxtext-v25.5``) image
The MaxText for ROCm training Docker image
provides a prebuilt environment for training on AMD Instinct MI300X and MI325X accelerators,
including essential components like JAX, XLA, ROCm libraries, and MaxText utilities.
It includes the following software components:
+--------------------------+--------------------------------+
| Software component | Version |
+==========================+================================+
| ROCm | 6.3.4 |
+--------------------------+--------------------------------+
| JAX | 0.4.35 |
+--------------------------+--------------------------------+
| Python | 3.10.12 |
+--------------------------+--------------------------------+
| Transformer Engine | 1.12.0.dev0+b8b92dc |
+--------------------------+--------------------------------+
| hipBLASLt | 0.13.0-ae9c477a |
+--------------------------+--------------------------------+
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
Supported features and models
=============================
{% set dockers = data.dockers %}
.. tab-set::
MaxText provides the following key features to train large language models efficiently:
{% for docker in dockers %}
{% set jax_version = docker.components["JAX"] %}
.. tab-item:: JAX {{ jax_version }}
:sync: {{ docker.pull_tag }}
.. list-table::
:header-rows: 1
* - Software component
- Version
{% for component_name, component_version in docker.components.items() %}
* - {{ component_name }}
- {{ component_version }}
{% endfor %}
{% if jax_version == "0.6.0" %}
.. note::
Shardy is a new config in JAX 0.6.0. You might get related errors if it's
not configured correctly. For now you can turn it off by setting
``shardy=False`` during the training run. You can also follow the `migration
guide <https://docs.jax.dev/en/latest/shardy_jax_migration.html>`__ to enable
it.
The provided multi-node training scripts in this documentation are
not currently supported with JAX 0.6.0. For multi-node training, use the JAX 0.5.0
Docker image.
{% endif %}
{% endfor %}
MaxText with on ROCm provides the following key features to train large language models efficiently:
- Transformer Engine (TE)
- Flash Attention (FA) 3
- Flash Attention (FA) 3 -- with or without sequence input packing
- GEMM tuning
- Multi-node support
.. _amd-maxtext-model-support:
- NANOO FP8 quantization support
The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
.. _amd-maxtext-model-support-v257:
* Llama 3.3 70B
Supported models
================
* Llama 3.1 8B
The following models are pre-optimized for performance on AMD Instinct MI300
series accelerators. Some instructions, commands, and available training
configurations in this documentation might vary by model -- select one to get
started.
* Llama 3.1 70B
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
* Llama 3 8B
{% set model_groups = data.model_groups %}
.. raw:: html
* Llama 3 70B
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
<div class="row">
<div class="col-2 me-2 model-param-head">Model</div>
<div class="row col-10">
{% for model_group in model_groups %}
<div class="col-4 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
{% endfor %}
</div>
</div>
* Llama 2 7B
* Llama 2 70B
* DeepSeek-V2-Lite
<div class="row mt-1">
<div class="col-2 me-2 model-param-head">Model variant</div>
<div class="row col-10">
{% for model_group in model_groups %}
{% set models = model_group.models %}
{% for model in models %}
{% if models|length % 3 == 0 %}
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% else %}
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% endif %}
{% endfor %}
{% endfor %}
</div>
</div>
</div>
.. note::
Some models, such as Llama 3, require an external license agreement through
a third party (for example, Meta).
Unsupported features
--------------------
Currently, MaxText's default packed input format is not supported. Using this format
with the current Docker image results in incorrect attention calculations
across different input sequences. Support for packed input format is planned for a future release.
System validation
=================
@@ -98,14 +136,14 @@ This Docker image is optimized for specific model configurations outlined
as follows. Performance can vary for other training workloads, as AMD
doesnt validate configurations and run conditions outside those described.
.. _amd-maxtext-multi-node-setup:
.. _amd-maxtext-multi-node-setup-v257:
Multi-node setup
----------------
For multi-node environments, ensure you have all the necessary packages for
your network device, such as, RDMA. If you're not using a multi-node setup
with RDMA, skip ahead to :ref:`amd-maxtext-download-docker`.
with RDMA, skip ahead to :ref:`amd-maxtext-get-started-v257`.
1. Install the following packages to build and install the RDMA driver.
@@ -180,196 +218,203 @@ with RDMA, skip ahead to :ref:`amd-maxtext-download-docker`.
# If using Mellanox NIC
export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_8,mlx5_9
.. _amd-maxtext-download-docker:
.. _amd-maxtext-get-started-v257:
Pull the Docker image
---------------------
Benchmarking
============
1. Use the following command to pull the Docker image from Docker Hub.
Once the setup is complete, choose between two options to reproduce the
benchmark results:
.. code-block:: shell
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
docker pull rocm/jax-training:maxtext-v25.5
.. _vllm-benchmark-mad:
2. Use the following command to launch the Docker container. Note that the benchmarking scripts
used in the :ref:`following section <amd-maxtext-get-started>` automatically launch the Docker container
and execute the benchmark.
{% set dockers = data.dockers %}
{% set model_groups = data.model_groups %}
{% for model_group in model_groups %}
{% for model in model_group.models %}
.. code-block:: shell
.. container:: model-doc {{model.mad_tag}}
docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME/.ssh:/root/.ssh --shm-size 128G --name maxtext_training rocm/jax-training:maxtext-v25.5
.. tab-set::
.. _amd-maxtext-get-started:
{% if model.mad_tag and "single-node" in model.doc_options %}
.. tab-item:: MAD-integrated benchmarking
Getting started
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt
2. Use this command to run the performance benchmark test on the {{ model.model }} model
using one GPU with the :literal:`{{model.precision}}` data type on the host machine.
.. code-block:: shell
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
madengine run \
--tags {{model.mad_tag}} \
--keep-model-dir \
--live-output \
--timeout 28800
MAD launches a Docker container with the name
``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
model are collected in the following path: ``~/MAD/perf.csv/``.
{% endif %}
.. tab-item:: Standalone benchmarking
.. rubric:: Download the Docker image and required scripts
Run the JAX MaxText benchmark tool independently by starting the
Docker container as shown in the following snippet.
.. tab-set::
{% for docker in dockers %}
{% set jax_version = docker.components["JAX"] %}
.. tab-item:: JAX {{ jax_version }}
:sync: {{ docker.pull_tag }}
.. code-block:: shell
docker pull {{ docker.pull_tag }}
{% endfor %}
{% if model.model_repo and "single-node" in model.doc_options %}
.. rubric:: Single node training
1. Set up environment variables.
.. code-block:: shell
export MAD_SECRETS_HFTOKEN=<Your Hugging Face token>
export HF_HOME=<Location of saved/cached Hugging Face models>
``MAD_SECRETS_HFTOKEN`` is your Hugging Face access token to access models, tokenizers, and data.
See `User access tokens <https://huggingface.co/docs/hub/en/security-tokens>`__.
``HF_HOME`` is where ``huggingface_hub`` will store local data. See `huggingface_hub CLI <https://huggingface.co/docs/huggingface_hub/main/en/guides/cli#huggingface-cli-download>`__.
If you already have downloaded or cached Hugging Face artifacts, set this variable to that path.
Downloaded files typically get cached to ``~/.cache/huggingface``.
2. Launch the Docker container.
.. tab-set::
{% for docker in dockers %}
{% set jax_version = docker.components["JAX"] %}
.. tab-item:: JAX {{ jax_version }}
:sync: {{ docker.pull_tag }}
.. code-block:: shell
docker run -it \
--device=/dev/dri \
--device=/dev/kfd \
--network host \
--ipc host \
--group-add video \
--cap-add=SYS_PTRACE \
--security-opt seccomp=unconfined \
--privileged \
-v $HOME:$HOME \
-v $HOME/.ssh:/root/.ssh \
-v $HF_HOME:/hf_cache \
-e HF_HOME=/hf_cache \
-e MAD_SECRETS_HFTOKEN=$MAD_SECRETS_HFTOKEN
--shm-size 64G \
--name training_env \
{{ docker.pull_tag }}
{% endfor %}
3. In the Docker container, clone the ROCm MAD repository and navigate to the
benchmark scripts directory at ``MAD/scripts/jax-maxtext``.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD/scripts/jax-maxtext
4. Run the setup scripts to install libraries and datasets needed
for benchmarking.
.. code-block:: shell
./jax-maxtext_benchmark_setup.sh -m {{ model.model_repo }}
5. To run the training benchmark without quantization, use the following command:
.. code-block:: shell
./jax-maxtext_benchmark_report.sh -m {{ model.model_repo }}
For quantized training, use the following command:
.. code-block:: shell
./jax-maxtext_benchmark_report.sh -m {{ model.model_repo }} -q nanoo_fp8
.. important::
Quantized training is not supported with the JAX 0.6.0 Docker image; support
will be added in a future release. For quantized training, use the JAX 0.5.0
Docker image: ``rocm/jax-training:maxtext-v25.7``.
{% endif %}
{% if model.multinode_training_script and "multi-node" in model.doc_options %}
.. rubric:: Multi-node training
The following examples use SLURM to run on multiple nodes.
.. note::
The following scripts will launch the Docker container and run the
benchmark. Run them outside of any Docker container.
1. Make sure ``$HF_HOME`` is set before running the test. See
`ROCm benchmarking <https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/readme.md>`__
for more details on downloading the Llama models before running the
benchmark.
2. To run multi-node training for {{ model.model }},
use the
`multi-node training script <https://github.com/ROCm/MAD/blob/develop/scripts/jax-maxtext/gpu-rocm/{{ model.multinode_training_script }}>`__
under the ``scripts/jax-maxtext/gpu-rocm/`` directory.
3. Run the multi-node training benchmark script.
.. code-block:: shell
sbatch -N <num_nodes> {{ model.multinode_training_script }}
{% else %}
.. rubric:: Multi-node training
For multi-node training examples, choose a model from :ref:`amd-maxtext-model-support-v257`
with an available `multi-node training script <https://github.com/ROCm/MAD/tree/develop/scripts/jax-maxtext/gpu-rocm>`__.
{% endif %}
{% endfor %}
{% endfor %}
Further reading
===============
The following examples demonstrate how to get started with single node
and multi-node training using the benchmarking scripts provided at
`<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/>`__.
- See the ROCm/maxtext benchmarking README at `<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/readme.md>`__.
.. important::
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
The provided scripts launch a Docker container and execute a benchmark. Ensure you run these commands outside of any existing Docker container.
- To learn more about system settings and management practices to configure your system for
AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
Before running any benchmarks, ensure the ``$HF_HOME`` environment variable is
set correctly and points to your Hugging Face cache directory. Refer to the
README at `<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/>`__
for more detailed instructions.
Single node training benchmarking examples
------------------------------------------
* Example 1: Single node training with Llama 2 7B
Download the benchmarking script:
.. code-block:: shell
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_7b.sh
Run the single node training benchmark:
.. code-block:: shell
IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama2_7b.sh
* Example 2: Single node training with Llama 2 70B
Download the benchmarking script:
.. code-block:: shell
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_70b.sh
Run the single node training benchmark:
.. code-block:: shell
IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama2_70b.sh
* Example 3: Single node training with Llama 3 8B
Download the benchmarking script:
.. code-block:: shell
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_8b.sh
Run the single node training benchmark:
.. code-block:: shell
IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3_8b.sh
* Example 4: Single node training with Llama 3 70B
Download the benchmarking script:
.. code-block:: shell
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_70b.sh
Run the single node training benchmark:
.. code-block:: shell
IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3_70b.sh
* Example 5: Single node training with Llama 3.3 70B
Download the benchmarking script:
.. code-block:: shell
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3.3_70b.sh
Run the single node training benchmark:
.. code-block:: shell
IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3.3_70b.sh
* Example 6: Single node training with DeepSeek V2 16B
Download the benchmarking script:
.. code-block:: shell
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/deepseek_v2_16b.sh
Run the single node training benchmark:
.. code-block:: shell
IMAGE="rocm/jax-training:maxtext-v25.5" bash ./deepseek_v2_16b.sh
.. note::
The reported TFLOP/s by MaxText for DeepSeek is not accurate. Use
the tokens/s as a performance indicator.
Multi-node training benchmarking examples
-----------------------------------------
The following examples use SLURM for running on multiple nodes -- the commands might need to be adjusted for your
own cluster setup.
* Example 1: Multi-node training with Llama 2 7B
Download the benchmarking script:
.. code-block:: shell
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_7b_multinode.sh
Run the multi-node training benchmark. For example:
.. code-block:: shell
sbatch -N <num_nodes> llama2_7b_multinode.sh
* Example 2: Multi-node training with Llama 2 70B
Download the benchmarking script:
.. code-block:: shell
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_70b_multinode.sh
Run the multi-node training benchmark. For example:
.. code-block:: shell
sbatch -N <num_nodes> llama2_70b_multinode.sh
* Example 3: Multi-node training with Llama 3 8B model
Download the benchmarking script:
.. code-block:: shell
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_8b_multinode.sh
Run the multi-node training benchmark. For example:
.. code-block:: shell
sbatch -N <num_nodes> llama3_8b_multinode.sh
* Example 4: Multi-node training with Llama 3 70B model
Download the benchmarking script:
.. code-block:: shell
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_70b_multinode.sh
Run the multi-node training benchmark. For example:
.. code-block:: shell
sbatch -N <num_nodes> llama3_70b_multinode.sh
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
Previous versions
=================

View File

@@ -17,12 +17,21 @@ previous releases of the ``ROCm/jax-training`` Docker image on `Docker Hub <http
- Components
- Resources
* - 25.5 (latest)
* - 25.7 (latest)
-
* ROCm 6.4.1
* JAX 0.6.0, 0.5.0
-
* :doc:`Documentation <../jax-maxtext>`
* `Docker Hub (JAX 0.6.0) <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7-jax060/images/sha256-7352212ae033a76dca2b9dceffc23c1b5f1a61a7a560082cf747a9bf1acfc9ce>`__
* `Docker Hub (JAX 0.5.0) <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025>`__
* - 25.5
-
* ROCm 6.3.4
* JAX 0.4.35
-
* :doc:`Documentation <../jax-maxtext>`
* :doc:`Documentation <jax-maxtext-v25.5>`
* `Docker Hub <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.5/images/sha256-4e0516358a227cae8f552fb866ec07e2edcf244756f02e7b40212abfbab5217b>`__
* - 25.4

View File

@@ -51,7 +51,7 @@ MaxText provides the following key features to train large language models effic
- Multi-node support
.. _amd-maxtext-model-support:
.. _amd-maxtext-model-support-v254:
The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.

View File

@@ -0,0 +1,385 @@
:orphan:
.. meta::
:description: How to train a model using JAX MaxText for ROCm.
:keywords: ROCm, AI, LLM, train, jax, torch, Llama, flux, tutorial, docker
**************************************
Training a model with MaxText for ROCm
**************************************
.. caution::
This documentation does not reflect the latest version of ROCm JAX MaxText
training performance documentation. See :doc:`../jax-maxtext` for the latest version.
MaxText is a high-performance, open-source framework built on the Google JAX
machine learning library to train LLMs at scale. The MaxText framework for
ROCm is an optimized fork of the upstream
`<https://github.com/AI-Hypercomputer/maxtext>`__ enabling efficient AI workloads
on AMD MI300X series accelerators.
The MaxText for ROCm training Docker (``rocm/jax-training:maxtext-v25.5``) image
provides a prebuilt environment for training on AMD Instinct MI300X and MI325X accelerators,
including essential components like JAX, XLA, ROCm libraries, and MaxText utilities.
It includes the following software components:
+--------------------------+--------------------------------+
| Software component | Version |
+==========================+================================+
| ROCm | 6.3.4 |
+--------------------------+--------------------------------+
| JAX | 0.4.35 |
+--------------------------+--------------------------------+
| Python | 3.10.12 |
+--------------------------+--------------------------------+
| Transformer Engine | 1.12.0.dev0+b8b92dc |
+--------------------------+--------------------------------+
| hipBLASLt | 0.13.0-ae9c477a |
+--------------------------+--------------------------------+
Supported features and models
=============================
MaxText provides the following key features to train large language models efficiently:
- Transformer Engine (TE)
- Flash Attention (FA) 3
- GEMM tuning
- Multi-node support
.. _amd-maxtext-model-support-v255:
The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
* Llama 3.3 70B
* Llama 3.1 8B
* Llama 3.1 70B
* Llama 3 8B
* Llama 3 70B
* Llama 2 7B
* Llama 2 70B
* DeepSeek-V2-Lite
.. note::
Some models, such as Llama 3, require an external license agreement through
a third party (for example, Meta).
Unsupported features
--------------------
Currently, MaxText's default packed input format is not supported. Using this format
with the current Docker image results in incorrect attention calculations
across different input sequences. Support for packed input format is planned for a future release.
System validation
=================
Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
before starting training.
To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
Environment setup
=================
This Docker image is optimized for specific model configurations outlined
as follows. Performance can vary for other training workloads, as AMD
doesnt validate configurations and run conditions outside those described.
.. _amd-maxtext-multi-node-setup-v255:
Multi-node setup
----------------
For multi-node environments, ensure you have all the necessary packages for
your network device, such as, RDMA. If you're not using a multi-node setup
with RDMA, skip ahead to :ref:`amd-maxtext-download-docker`.
1. Install the following packages to build and install the RDMA driver.
.. code-block:: shell
sudo apt install iproute2 -y
sudo apt install -y linux-headers-"$(uname-r)" libelf-dev
sudo apt install -y gcc make libtool autoconf librdmacm-dev rdmacm-utils infiniband-diags ibverbs-utils perftest ethtool libibverbs-dev rdma-core strace libibmad5 libibnetdisc5 ibverbs-providers libibumad-dev libibumad3 libibverbs1 libnl-3-dev libnl-route-3-dev
Refer to your NIC manufacturer's documentation for further steps on
compiling and installing the RoCE driver. For example, for Broadcom,
see `Compiling Broadcom NIC software from source <https://docs.broadcom.com/doc/957608-AN2XX#G3.484341>`_
in `Ethernet networking guide for AMD Instinct MI300X GPU clusters <https://docs.broadcom.com/doc/957608-AN2XX>`_.
2. Set the following environment variables.
a. Master address
Change ``localhost`` to the master node's resolvable hostname or IP address:
.. code-block:: bash
export MASTER_ADDR="${MASTER_ADDR:-localhost}"
b. Number of nodes
Set the number of nodes you want to train on (for example, ``2``, ``4``, or ``8``):
.. code-block:: bash
export NNODES="${NNODES:-1}"
c. Node ranks
Set the rank of each node (``0`` for master, ``1`` for the first worker node, and so on)
Node ranks should be unique across all nodes in the cluster.
.. code-block:: bash
export NODE_RANK="${NODE_RANK:-0}"
d. Network interface
Update the network interface in the script to match your system's network interface. To
find your network interface, run the following (outside of any Docker container):
.. code-block:: bash
ip a
Look for an active interface with an IP address in the same subnet as
your other nodes. Then, update the following variable in the script, for
example:
.. code-block:: bash
export NCCL_SOCKET_IFNAME=ens50f0np0
This variable specifies which network interface to use for inter-node communication.
Setting this variable to the incorrect interface can result in communication failures
or significantly reduced performance.
e. RDMA interface
Ensure the :ref:`required packages <amd-maxtext-multi-node-setup>` are installed on all nodes.
Then, set the RDMA interfaces to use for communication.
.. code-block:: bash
# If using Broadcom NIC
export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
# If using Mellanox NIC
export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_8,mlx5_9
.. _amd-maxtext-download-docker-v255:
Pull the Docker image
---------------------
1. Use the following command to pull the Docker image from Docker Hub.
.. code-block:: shell
docker pull rocm/jax-training:maxtext-v25.5
2. Use the following command to launch the Docker container. Note that the benchmarking scripts
used in the :ref:`following section <amd-maxtext-get-started>` automatically launch the Docker container
and execute the benchmark.
.. code-block:: shell
docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME/.ssh:/root/.ssh --shm-size 128G --name maxtext_training rocm/jax-training:maxtext-v25.5
.. _amd-maxtext-get-started-v255:
Getting started
===============
The following examples demonstrate how to get started with single node
and multi-node training using the benchmarking scripts provided at
`<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/>`__.
.. important::
The provided scripts launch a Docker container and execute a benchmark. Ensure you run these commands outside of any existing Docker container.
Before running any benchmarks, ensure the ``$HF_HOME`` environment variable is
set correctly and points to your Hugging Face cache directory. Refer to the
README at `<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/>`__
for more detailed instructions.
Single node training benchmarking examples
------------------------------------------
* Example 1: Single node training with Llama 2 7B
Download the benchmarking script:
.. code-block:: shell
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_7b.sh
Run the single node training benchmark:
.. code-block:: shell
IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama2_7b.sh
* Example 2: Single node training with Llama 2 70B
Download the benchmarking script:
.. code-block:: shell
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_70b.sh
Run the single node training benchmark:
.. code-block:: shell
IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama2_70b.sh
* Example 3: Single node training with Llama 3 8B
Download the benchmarking script:
.. code-block:: shell
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_8b.sh
Run the single node training benchmark:
.. code-block:: shell
IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3_8b.sh
* Example 4: Single node training with Llama 3 70B
Download the benchmarking script:
.. code-block:: shell
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_70b.sh
Run the single node training benchmark:
.. code-block:: shell
IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3_70b.sh
* Example 5: Single node training with Llama 3.3 70B
Download the benchmarking script:
.. code-block:: shell
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3.3_70b.sh
Run the single node training benchmark:
.. code-block:: shell
IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3.3_70b.sh
* Example 6: Single node training with DeepSeek V2 16B
Download the benchmarking script:
.. code-block:: shell
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/deepseek_v2_16b.sh
Run the single node training benchmark:
.. code-block:: shell
IMAGE="rocm/jax-training:maxtext-v25.5" bash ./deepseek_v2_16b.sh
.. note::
The reported TFLOP/s by MaxText for DeepSeek is not accurate. Use
the tokens/s as a performance indicator.
Multi-node training benchmarking examples
-----------------------------------------
The following examples use SLURM for running on multiple nodes -- the commands might need to be adjusted for your
own cluster setup.
* Example 1: Multi-node training with Llama 2 7B
Download the benchmarking script:
.. code-block:: shell
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_7b_multinode.sh
Run the multi-node training benchmark. For example:
.. code-block:: shell
sbatch -N <num_nodes> llama2_7b_multinode.sh
* Example 2: Multi-node training with Llama 2 70B
Download the benchmarking script:
.. code-block:: shell
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_70b_multinode.sh
Run the multi-node training benchmark. For example:
.. code-block:: shell
sbatch -N <num_nodes> llama2_70b_multinode.sh
* Example 3: Multi-node training with Llama 3 8B model
Download the benchmarking script:
.. code-block:: shell
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_8b_multinode.sh
Run the multi-node training benchmark. For example:
.. code-block:: shell
sbatch -N <num_nodes> llama3_8b_multinode.sh
* Example 4: Multi-node training with Llama 3 70B model
Download the benchmarking script:
.. code-block:: shell
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_70b_multinode.sh
Run the multi-node training benchmark. For example:
.. code-block:: shell
sbatch -N <num_nodes> llama3_70b_multinode.sh
Previous versions
=================
See :doc:`jax-maxtext-history` to find documentation for previous releases
of the ``ROCm/jax-training`` Docker image.

View File

@@ -32,19 +32,23 @@ subtrees:
- file: compatibility/ml-compatibility/pytorch-compatibility.rst
title: PyTorch compatibility
- file: compatibility/ml-compatibility/tensorflow-compatibility.rst
title: TensorFlow compatibility
title: TensorFlow compatibility
- file: compatibility/ml-compatibility/jax-compatibility.rst
title: JAX compatibility
- file: compatibility/ml-compatibility/verl-compatibility.rst
title: verl compatibility
title: verl compatibility
- file: compatibility/ml-compatibility/stanford-megatron-lm-compatibility.rst
title: Stanford Megatron-LM compatibility
- file: compatibility/ml-compatibility/dgl-compatibility.rst
title: DGL compatibility
title: DGL compatibility
- file: compatibility/ml-compatibility/megablocks-compatibility.rst
title: Megablocks compatibility
- file: compatibility/ml-compatibility/taichi-compatibility.rst
title: Taichi compatibility
title: Taichi compatibility
- file: compatibility/ml-compatibility/ray-compatibility.rst
title: Ray compatibility
- file: compatibility/ml-compatibility/llama-cpp-compatibility.rst
title: llama.cpp compatibility
- file: how-to/build-rocm.rst
title: Build ROCm from source

View File

@@ -0,0 +1,79 @@
<?xml version="1.0" encoding="UTF-8"?>
<manifest>
<remote name="rocm-org" fetch="https://github.com/ROCm/" />
<default revision="refs/tags/20250912-42"
remote="rocm-org"
sync-c="true"
sync-j="4" />
<!--list of projects for ROCm-->
<project name="aqlprofile" />
<project name="ROCR-Runtime" />
<project name="amdsmi" />
<project name="rdc" />
<project name="rocm_bandwidth_test" />
<project name="rocm_smi_lib" />
<project name="rocm-core" />
<project name="rocm-examples" />
<project name="rocminfo" />
<project name="rocprofiler" />
<project name="rocprofiler-register" />
<project name="rocprofiler-sdk" />
<project name="rocprofiler-compute" />
<project name="rocprofiler-systems" />
<project name="roctracer" />
<!--HIP Projects-->
<project name="HIP" />
<project name="hip-tests" />
<project name="HIPIFY" />
<project name="clr" />
<project name="hipother" />
<!-- The following projects are all associated with the AMDGPU LLVM compiler -->
<project name="half" />
<project name="llvm-project" />
<project name="spirv-llvm-translator" />
<!-- gdb projects -->
<project name="ROCdbgapi" />
<project name="ROCgdb" />
<project name="rocr_debug_agent" />
<!-- ROCm Libraries -->
<project groups="mathlibs" name="AMDMIGraphX" />
<project groups="mathlibs" name="MIVisionX" />
<project groups="mathlibs" name="ROCmValidationSuite" />
<project groups="mathlibs" name="composable_kernel" />
<project groups="mathlibs" name="hipSOLVER" />
<project groups="mathlibs" name="hipTensor" />
<project groups="mathlibs" name="hipfort" />
<project groups="mathlibs" name="rccl" />
<project groups="mathlibs" name="rocAL" />
<project groups="mathlibs" name="rocALUTION" />
<project groups="mathlibs" name="rocDecode" />
<project groups="mathlibs" name="rocJPEG" />
<project groups="mathlibs" name="rocm-libraries">
<linkfile src="projects/hipcub" dest="hipCUB"/>
<linkfile src="projects/rocprim" dest="rocPRIM"/>
<linkfile src="projects/hiprand" dest="hipRAND"/>
<linkfile src="projects/rocrand" dest="rocRAND"/>
<linkfile src="projects/rocthrust" dest="rocThrust"/>
<linkfile src="projects/hipblas-common" dest="hipBLAS-common"/>
<linkfile src="projects/hipblaslt" dest="hipBLASLt"/>
<linkfile src="projects/rocblas" dest="rocBLAS"/>
<linkfile src="projects/hipsparselt" dest="hipSPARSELt"/>
<linkfile src="projects/rocsparse" dest="rocSPARSE"/>
<linkfile src="projects/hipsparse" dest="hipSPARSE"/>
<linkfile src="projects/hipblas" dest="hipBLAS"/>
<linkfile src="projects/miopen" dest="MIOpen"/>
<linkfile src="projects/hipfft" dest="hipFFT"/>
<linkfile src="projects/rocfft" dest="rocFFT"/>
</project>
<project groups="mathlibs" name="rocPyDecode" />
<project groups="mathlibs" name="rocSHMEM" />
<project groups="mathlibs" name="rocSOLVER" />
<project groups="mathlibs" name="rocWMMA" />
<project groups="mathlibs" name="rocm-cmake" />
<project groups="mathlibs" name="rpp" />
<project groups="mathlibs" name="TransferBench" />
<!-- Projects for OpenMP-Extras -->
<project name="aomp" path="openmp-extras/aomp" />
<project name="aomp-extras" path="openmp-extras/aomp-extras" />
<project name="flang" path="openmp-extras/flang" />
</manifest>