mmapeak: gfx1103 support

This commit is contained in:
George Hotz
2025-10-19 16:57:28 +08:00
parent 617614beb7
commit 89e7f2fa00
2 changed files with 5 additions and 4 deletions

View File

@@ -9,7 +9,7 @@ WAVE_SIZE = 32
NUM_WAVES = 2
FLOPS_PER_MATMUL = 16*16*16*2
INTERNAL_LOOP = 1_000_00
-INSTRUCTIONS_PER_LOOP = 1000
+INSTRUCTIONS_PER_LOOP = 200
DIRECTIVE = ".amdhsa_wavefront_size32 1"
assemblyTemplate = (pathlib.Path(__file__).parent / "template.s").read_text()
@@ -33,7 +33,7 @@ def launchBenchmark(instruction, vgprIndices, dense=True, accum=False, extra="")
lib = COMPILER.compile(src)
fxn = AMDProgram(DEV, "matmul", lib)
start = time.perf_counter()
-# TODO: why?
+# TODO: why is this elapsed wrong?
elapsed = fxn(global_size=(NUM_WORKGROUPS,1,1), local_size=(WAVE_SIZE*NUM_WAVES,1,1), wait=True) #For some reason the returned time is very small after the first kernel execution
end = time.perf_counter()
elapsed = end-start
@@ -48,7 +48,9 @@ if __name__=="__main__":
raise RuntimeError("Error while initiating AMD device")
COMPILER = HIPCompiler(DEV.arch)
-if DEV.arch == 'gfx1100':
+if DEV.arch in {'gfx1100', 'gfx1103'}:
+if DEV.arch == 'gfx1103':
+NUM_WORKGROUPS = 8
launchBenchmark("v_wmma_bf16_16x16x16_bf16", (7,8,15))
launchBenchmark("v_wmma_f16_16x16x16_f16", (7,8,15))
launchBenchmark("v_wmma_f32_16x16x16_bf16", (7,8,15))

View File

@@ -36,6 +36,5 @@ amdhsa.kernels:
.sgpr_count: 8
.vgpr_count: 32
.max_flat_workgroup_size: 1024
.args:
...
.end_amdgpu_metadata