* try AM in CI

* no sudo

* temp

* run more AM tests

* run half on AM

* insert amdgpu

* other machines as well
This commit is contained in:
nimlgen
2025-01-13 19:55:17 +03:00
committed by GitHub
parent d224d0ed7f
commit 74b83c4c41

View File

@@ -324,6 +324,8 @@ jobs:
steps:
- name: Checkout Code
uses: actions/checkout@v4
- name: Insert amdgpu
run: sudo modprobe amdgpu
- name: Symlink models and datasets
run: |
mkdir -p weights
@@ -369,6 +371,8 @@ jobs:
# TODO: AMD compiler bug causes this to fail
#- name: Fuzz Padded Tensor Core GEMM
# run: HSA=1 M_START=12 M_STOP=20 M_STEP=1 N_START=12 N_STOP=20 N_STEP=1 K_START=28 K_STOP=36 K_STEP=1 HALF=1 TC_OPT=2 DEBUG=2 python3 ./extra/gemm/fuzz_matmul.py
- name: Remove amdgpu
run: sudo rmmod amdgpu
- name: Run Stable Diffusion
run: AMD=1 python3 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt
- name: Run SDXL
@@ -389,6 +393,8 @@ jobs:
run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt
- name: Run LLaMA-3 8B on 6 GPUs
run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 6 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_six_gpu.txt
- name: Restore amdgpu
run: sudo modprobe amdgpu
- name: Run LLaMA-2 70B
run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 2 --size 70B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_2_70B.txt
- name: Run Mixtral 8x7B
@@ -437,6 +443,8 @@ jobs:
steps:
- name: Checkout Code
uses: actions/checkout@v4
- name: Insert amdgpu
run: sudo modprobe amdgpu
- name: Symlink models and datasets
run: |
mkdir -p weights