From 151a4bd7bc7206e4ec2c2cb383e934c54ebddb2a Mon Sep 17 00:00:00 2001 From: Daniel Su Date: Wed, 27 Aug 2025 13:05:26 -0400 Subject: [PATCH] [Ex CI] add retries to potentially flaky steps (#5175) --- .azuredevops/components/MIOpen.yml | 2 + .azuredevops/components/Tensile.yml | 1 + .azuredevops/components/rocPyDecode.yml | 1 + .azuredevops/nightly/pytorch.yml | 1 + .azuredevops/nightly/rocm-nightly.yml | 3 +- .../templates/steps/artifact-download.yml | 1 + .../templates/steps/dependencies-apt.yml | 19 ++---- .../steps/dependencies-aqlprofile.yml | 67 ++++++------------- .../templates/steps/dependencies-dnf.yml | 19 ++++-- .../templates/steps/dependencies-other.yml | 1 + .../templates/steps/dependencies-vendor.yml | 1 + .../steps/local-artifact-download.yml | 1 + .../templates/steps/miopen-get-ck-build.yml | 1 + 13 files changed, 52 insertions(+), 66 deletions(-) diff --git a/.azuredevops/components/MIOpen.yml b/.azuredevops/components/MIOpen.yml index b606005c7..cb2bd8c60 100644 --- a/.azuredevops/components/MIOpen.yml +++ b/.azuredevops/components/MIOpen.yml @@ -150,6 +150,7 @@ jobs: downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }} - task: Bash@3 displayName: Build and install other dependencies + retryCountOnTaskFailure: 3 inputs: targetType: inline workingDirectory: $(Agent.BuildDirectory)/s @@ -230,6 +231,7 @@ jobs: downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }} - task: Bash@3 displayName: Build and install other dependencies + retryCountOnTaskFailure: 3 inputs: targetType: inline workingDirectory: $(Agent.BuildDirectory)/s diff --git a/.azuredevops/components/Tensile.yml b/.azuredevops/components/Tensile.yml index f74cdc56d..3b31727ce 100644 --- a/.azuredevops/components/Tensile.yml +++ b/.azuredevops/components/Tensile.yml @@ -171,6 +171,7 @@ jobs: - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml - task: DownloadPipelineArtifact@2 displayName: 'Download Pipeline Wheel Files' + retryCountOnTaskFailure: 3 inputs: itemPattern: '**/*${{ job.os }}*.whl' targetPath: $(Agent.BuildDirectory) diff --git a/.azuredevops/components/rocPyDecode.yml b/.azuredevops/components/rocPyDecode.yml index 885b5b51c..6e85a43ef 100644 --- a/.azuredevops/components/rocPyDecode.yml +++ b/.azuredevops/components/rocPyDecode.yml @@ -190,6 +190,7 @@ jobs: - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml - task: DownloadPipelineArtifact@2 displayName: 'Download Pipeline Wheel Files' + retryCountOnTaskFailure: 3 inputs: itemPattern: '**/*.whl' targetPath: $(Agent.BuildDirectory) diff --git a/.azuredevops/nightly/pytorch.yml b/.azuredevops/nightly/pytorch.yml index 19daf1d8c..995206f7d 100644 --- a/.azuredevops/nightly/pytorch.yml +++ b/.azuredevops/nightly/pytorch.yml @@ -397,6 +397,7 @@ jobs: - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml - task: DownloadPipelineArtifact@2 displayName: 'Download Pipeline Wheel Files' + retryCountOnTaskFailure: 3 inputs: itemPattern: '**/*.whl' targetPath: $(Agent.BuildDirectory) diff --git a/.azuredevops/nightly/rocm-nightly.yml b/.azuredevops/nightly/rocm-nightly.yml index 75f64304b..5b28e12ae 100644 --- a/.azuredevops/nightly/rocm-nightly.yml +++ b/.azuredevops/nightly/rocm-nightly.yml @@ -93,7 +93,7 @@ schedules: jobs: - ${{ each job in parameters.jobList }}: - job: nightly_${{ job.os }}_${{ job.target }} - timeoutInMinutes: 90 + timeoutInMinutes: 120 variables: - group: common - template: /.azuredevops/variables-global.yml @@ -226,6 +226,7 @@ jobs: cat Dockerfile - task: Docker@2 displayName: Build and upload Docker image + retryCountOnTaskFailure: 3 inputs: containerRegistry: ContainerService3 repository: 'nightly-${{ job.os }}-${{ job.target }}' diff --git a/.azuredevops/templates/steps/artifact-download.yml b/.azuredevops/templates/steps/artifact-download.yml index e5445fe4e..03855af49 100644 --- a/.azuredevops/templates/steps/artifact-download.yml +++ b/.azuredevops/templates/steps/artifact-download.yml @@ -24,6 +24,7 @@ parameters: steps: - task: DownloadPipelineArtifact@2 displayName: Download ${{ parameters.componentName }} + retryCountOnTaskFailure: 3 inputs: ${{ if eq(parameters.componentName, 'clr') }}: itemPattern: '**/*${{ parameters.componentName }}*${{ parameters.fileFilter }}*amd*' # filter out nvidia clr artifacts diff --git a/.azuredevops/templates/steps/dependencies-apt.yml b/.azuredevops/templates/steps/dependencies-apt.yml index a73dc3faa..7a35dcd32 100644 --- a/.azuredevops/templates/steps/dependencies-apt.yml +++ b/.azuredevops/templates/steps/dependencies-apt.yml @@ -10,6 +10,7 @@ steps: - ${{ if eq(parameters.registerROCmPackages, true) }}: - task: Bash@3 displayName: 'Register AMDGPU & ROCm repos (apt)' + retryCountOnTaskFailure: 3 inputs: targetType: inline script: | @@ -20,7 +21,8 @@ steps: echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600 sudo apt update - task: Bash@3 - displayName: 'sudo apt-get update' + displayName: 'APT update and install packages' + retryCountOnTaskFailure: 3 inputs: targetType: inline script: | @@ -28,15 +30,6 @@ steps: echo "deb http://archive.ubuntu.com/ubuntu/ jammy-updates main restricted universe multiverse" | sudo tee -a /etc/apt/sources.list.d/default.list echo "deb http://archive.ubuntu.com/ubuntu/ jammy-backports main restricted universe multiverse" | sudo tee -a /etc/apt/sources.list.d/default.list echo "deb http://archive.ubuntu.com/ubuntu/ jammy-security main restricted universe multiverse" | sudo tee -a /etc/apt/sources.list.d/default.list - sudo DEBIAN_FRONTEND=noninteractive apt-get --yes update -- task: Bash@3 - displayName: 'sudo apt-get fix' - inputs: - targetType: inline - script: sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --fix-broken install -- ${{ if gt(length(parameters.aptPackages), 0) }}: - - task: Bash@3 - displayName: 'sudo apt-get install ...' - inputs: - targetType: inline - script: sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --fix-missing install ${{ join(' ', parameters.aptPackages) }} + sudo DEBIAN_FRONTEND=noninteractive apt-get --yes update && \ + sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --fix-broken install && \ + sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --fix-missing install ${{ join(' ', parameters.aptPackages) }} diff --git a/.azuredevops/templates/steps/dependencies-aqlprofile.yml b/.azuredevops/templates/steps/dependencies-aqlprofile.yml index 9c3cf836c..4ff4675d1 100644 --- a/.azuredevops/templates/steps/dependencies-aqlprofile.yml +++ b/.azuredevops/templates/steps/dependencies-aqlprofile.yml @@ -5,51 +5,28 @@ parameters: steps: - task: Bash@3 - displayName: Get aqlprofile package name - inputs: - targetType: inline - ${{ if eq(parameters.os, 'ubuntu2204') }}: - script: | - export packageName=$(curl -s https://repo.radeon.com/rocm/apt/$(REPO_RADEON_VERSION)/pool/main/h/hsa-amd-aqlprofile/ | grep -oP "href=\"\K[^\"]*$(lsb_release -rs)[^\"]*\.deb") - echo "##vso[task.setvariable variable=packageName;isreadonly=true]$packageName" - ${{ if eq(parameters.os, 'almalinux8') }}: - script: | - export packageName=$(curl -s https://repo.radeon.com/rocm/rhel8/$(REPO_RADEON_VERSION)/main/ | grep -oP "hsa-amd-aqlprofile-[^\"]+\.rpm" | head -n1) - echo "##vso[task.setvariable variable=packageName;isreadonly=true]$packageName" -- task: Bash@3 - displayName: 'Download aqlprofile' - inputs: - targetType: inline - workingDirectory: '$(Pipeline.Workspace)' - ${{ if eq(parameters.os, 'ubuntu2204') }}: - script: wget -nv https://repo.radeon.com/rocm/apt/$(REPO_RADEON_VERSION)/pool/main/h/hsa-amd-aqlprofile/$(packageName) - ${{ if eq(parameters.os, 'almalinux8') }}: - script: wget -nv https://repo.radeon.com/rocm/rhel8/$(REPO_RADEON_VERSION)/main/$(packageName) -- task: Bash@3 - displayName: 'Extract aqlprofile' - inputs: - targetType: inline - workingDirectory: '$(Pipeline.Workspace)' - ${{ if eq(parameters.os, 'ubuntu2204') }}: - script: | - mkdir hsa-amd-aqlprofile - dpkg-deb -R $(packageName) hsa-amd-aqlprofile - ${{ if eq(parameters.os, 'almalinux8') }}: - script: | - mkdir hsa-amd-aqlprofile - sudo dnf -y install rpm-build cpio - rpm2cpio $(packageName) | (cd hsa-amd-aqlprofile && cpio -idmv) -- task: Bash@3 - displayName: 'Copy aqlprofile files' + displayName: Download and install aqlprofile + retryCountOnTaskFailure: 3 inputs: targetType: inline + workingDirectory: $(Agent.BuildDirectory) script: | - mkdir -p $(Agent.BuildDirectory)/rocm - cp -R hsa-amd-aqlprofile/opt/rocm-*/* $(Agent.BuildDirectory)/rocm - workingDirectory: '$(Pipeline.Workspace)' -- task: Bash@3 - displayName: 'Clean up aqlprofile' - inputs: - targetType: inline - script: rm -rf hsa-amd-aqlprofile $(packageName) - workingDirectory: '$(Pipeline.Workspace)' + set -e + if [ "${{ parameters.os }}" = "ubuntu2204" ]; then + packageName=$(curl -s https://repo.radeon.com/rocm/apt/$(REPO_RADEON_VERSION)/pool/main/h/hsa-amd-aqlprofile/ | grep -oP "href=\"\K[^\"]*$(lsb_release -rs)[^\"]*\.deb") && \ + wget -nv https://repo.radeon.com/rocm/apt/$(REPO_RADEON_VERSION)/pool/main/h/hsa-amd-aqlprofile/$packageName && \ + mkdir -p hsa-amd-aqlprofile && \ + dpkg-deb -R $packageName hsa-amd-aqlprofile + elif [ "${{ parameters.os }}" = "almalinux8" ]; then + sudo dnf -y install rpm-build cpio && \ + packageName=$(curl -s https://repo.radeon.com/rocm/rhel8/$(REPO_RADEON_VERSION)/main/ | grep -oP "hsa-amd-aqlprofile-[^\"]+\.rpm" | head -n1) && \ + wget -nv https://repo.radeon.com/rocm/rhel8/$(REPO_RADEON_VERSION)/main/$packageName && \ + mkdir -p hsa-amd-aqlprofile && \ + rpm2cpio $packageName | (cd hsa-amd-aqlprofile && cpio -idmv) + else + echo "Unsupported OS: ${{ parameters.os }}" + exit 1 + fi && \ + mkdir -p $(Agent.BuildDirectory)/rocm && \ + cp -R hsa-amd-aqlprofile/opt/rocm-*/* $(Agent.BuildDirectory)/rocm && \ + rm -rf hsa-amd-aqlprofile $packageName diff --git a/.azuredevops/templates/steps/dependencies-dnf.yml b/.azuredevops/templates/steps/dependencies-dnf.yml index 432408cf7..81d2a045e 100644 --- a/.azuredevops/templates/steps/dependencies-dnf.yml +++ b/.azuredevops/templates/steps/dependencies-dnf.yml @@ -89,6 +89,7 @@ steps: - ${{ if eq(parameters.registerROCmPackages, true) }}: - task: Bash@3 displayName: 'Register AMDGPU & ROCm repos (dnf)' + retryCountOnTaskFailure: 3 inputs: targetType: inline script: | @@ -109,12 +110,13 @@ steps: sudo dnf makecache - task: Bash@3 displayName: 'Install base dnf packages' + retryCountOnTaskFailure: 3 inputs: targetType: inline script: | - sudo dnf config-manager --set-enabled powertools # rpm fusion free repo for some dependencies - sudo dnf -y install https://download1.rpmfusion.org/free/el/rpmfusion-free-release-8.noarch.rpm + sudo dnf config-manager --set-enabled powertools && \ + sudo dnf -y install https://download1.rpmfusion.org/free/el/rpmfusion-free-release-8.noarch.rpm && \ sudo dnf -y install ${{ join(' ', parameters.basePackages) }} - task: Bash@3 displayName: 'Check gcc environment' @@ -128,6 +130,7 @@ steps: g++ -print-file-name=libstdc++.so - task: Bash@3 displayName: 'Set python 3.11 as default' + retryCountOnTaskFailure: 3 inputs: targetType: inline script: | @@ -142,18 +145,20 @@ steps: - ${{ if eq(pkg, 'ninja-build') }}: - task: Bash@3 displayName: 'Install ninja 1.11.1' + retryCountOnTaskFailure: 3 inputs: targetType: inline script: | - curl -LO https://github.com/ninja-build/ninja/releases/download/v1.11.1/ninja-linux.zip - sudo dnf -y install unzip - unzip ninja-linux.zip - sudo mv ninja /usr/local/bin/ninja - sudo chmod +x /usr/local/bin/ninja + sudo dnf -y install unzip && \ + curl -LO https://github.com/ninja-build/ninja/releases/download/v1.11.1/ninja-linux.zip && \ + unzip ninja-linux.zip && \ + sudo mv ninja /usr/local/bin/ninja && \ + sudo chmod +x /usr/local/bin/ninja && \ echo "##vso[task.prependpath]/usr/local/bin" - ${{ if ne(parameters.aptToDnfMap[pkg], '') }}: - task: Bash@3 displayName: 'dnf install ${{ parameters.aptToDnfMap[pkg] }}' + retryCountOnTaskFailure: 3 inputs: targetType: inline script: | diff --git a/.azuredevops/templates/steps/dependencies-other.yml b/.azuredevops/templates/steps/dependencies-other.yml index 177bbac8a..b39b32945 100644 --- a/.azuredevops/templates/steps/dependencies-other.yml +++ b/.azuredevops/templates/steps/dependencies-other.yml @@ -27,6 +27,7 @@ steps: - ${{ if gt(length(parameters.pipModules), 0) }}: - task: Bash@3 displayName: 'pip install ...' + retryCountOnTaskFailure: 3 inputs: targetType: inline script: python3 -m pip install -v --force-reinstall ${{ join(' ', parameters.pipModules) }} diff --git a/.azuredevops/templates/steps/dependencies-vendor.yml b/.azuredevops/templates/steps/dependencies-vendor.yml index 571877d1e..8d885b553 100644 --- a/.azuredevops/templates/steps/dependencies-vendor.yml +++ b/.azuredevops/templates/steps/dependencies-vendor.yml @@ -17,6 +17,7 @@ steps: - ${{ each dependency in parameters.dependencyList }}: - task: DownloadPipelineArtifact@2 displayName: Download ${{ dependency }} + retryCountOnTaskFailure: 3 inputs: project: ROCm-CI buildType: specific diff --git a/.azuredevops/templates/steps/local-artifact-download.yml b/.azuredevops/templates/steps/local-artifact-download.yml index 24d00ce0e..d9c9fe328 100644 --- a/.azuredevops/templates/steps/local-artifact-download.yml +++ b/.azuredevops/templates/steps/local-artifact-download.yml @@ -33,6 +33,7 @@ parameters: steps: - task: DownloadPipelineArtifact@2 displayName: Download ${{ parameters.preTargetFilter}}*${{ parameters.os }}_${{ parameters.gpuTarget}}*${{ parameters.postTargetFilter}} + retryCountOnTaskFailure: 3 inputs: ${{ if eq(parameters.buildType, 'specific') }}: buildType: specific diff --git a/.azuredevops/templates/steps/miopen-get-ck-build.yml b/.azuredevops/templates/steps/miopen-get-ck-build.yml index 6c6d44407..03803e3ee 100644 --- a/.azuredevops/templates/steps/miopen-get-ck-build.yml +++ b/.azuredevops/templates/steps/miopen-get-ck-build.yml @@ -7,6 +7,7 @@ steps: - task: Bash@3 name: downloadCKBuild displayName: Download specific CK build + retryCountOnTaskFailure: 3 env: CXX: $(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++ CC: $(Agent.BuildDirectory)/rocm/llvm/bin/amdclang