diff --git a/extra/helpers.py b/extra/helpers.py index 052ce1a132..5ed3f9cf35 100644 --- a/extra/helpers.py +++ b/extra/helpers.py @@ -6,3 +6,18 @@ class Timing(object): def __exit__(self, exc_type, exc_val, exc_tb): self.et = time.perf_counter_ns() - self.st if self.enabled: print(f"{self.prefix}{self.et*1e-6:.2f} ms"+(self.on_exit(self.et) if self.on_exit else "")) + +def enable_early_exec(): + import subprocess, multiprocessing + qin, qout = multiprocessing.Queue(), multiprocessing.Queue() + def _early_exec_process(qin, qout): + while 1: + path, inp = qin.get() + qout.put(subprocess.check_output(path, input=inp)) + p = multiprocessing.Process(target=_early_exec_process, args=(qin, qout)) + p.daemon = True + p.start() + def early_exec(x): + qin.put(x) + return qout.get() + return early_exec diff --git a/extra/rocm/sniffer/sniff.cc b/extra/rocm/sniffer/sniff.cc index a594ecd456..392932956d 100644 --- a/extra/rocm/sniffer/sniff.cc +++ b/extra/rocm/sniffer/sniff.cc @@ -72,8 +72,14 @@ static void handler(int sig, siginfo_t *si, void *unused) { D("HSA_PACKET_TYPE_KERNEL_DISPATCH -- setup:%d workgroup[%d, %d, %d] grid[%d, %d, %d] kernel_object:0x%lx kernarg_address:%p\n", pkt->setup, pkt->workgroup_size_x, pkt->workgroup_size_y, pkt->workgroup_size_z, pkt->grid_size_x, pkt->grid_size_y, pkt->grid_size_z, pkt->kernel_object, pkt->kernarg_address); amd_kernel_code_t *code = (amd_kernel_code_t *)pkt->kernel_object; D("kernel_code_entry_byte_offset:%lx\n", code->kernel_code_entry_byte_offset); - hexdump((void*)(pkt->kernel_object + code->kernel_code_entry_byte_offset), 0x200); - //hexdump((void*)pkt->kernel_object, sizeof(amd_kernel_code_t)); + uint32_t *kernel_code = (uint32_t*)(pkt->kernel_object + code->kernel_code_entry_byte_offset); + int code_len = 0; + while (kernel_code[code_len] != 0xbf9f0000 && kernel_code[code_len] != 0) code_len++; + hexdump(kernel_code, code_len*4); + /*FILE *f = fopen("/tmp/kernel_code", "wb"); + fwrite(kernel_code, 4, code_len, f); + fclose(f); + system("python -c 'print(\" \".join([(\"0x%02X\"%x) for x in open(\"/tmp/kernel_code\", \"rb\").read()]))' | ../build/llvm-project/bin/llvm-mc --disassemble --arch=amdgcn --mcpu=gfx1100 --show-encoding");*/ } else if ((pkt->header&0xFF) == HSA_PACKET_TYPE_BARRIER_AND) { D("HSA_PACKET_TYPE_BARRIER_AND\n"); } diff --git a/tinygrad/runtime/ops_gpu.py b/tinygrad/runtime/ops_gpu.py index 4dfda730f2..9cb9aa76b1 100644 --- a/tinygrad/runtime/ops_gpu.py +++ b/tinygrad/runtime/ops_gpu.py @@ -1,5 +1,5 @@ from __future__ import annotations -import platform +import platform, pathlib import numpy as np import pyopencl as cl # type: ignore from typing import Optional, List @@ -12,6 +12,11 @@ OSX = platform.system() == "Darwin" OSX_TIMING_RATIO = (125/3) if OSX else 1.0 # see test/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something FLOAT16 = getenv("FLOAT16", 0) +# TODO: if you fork and exit the child process after creating anything with cl on AMD, it hangs on e.wait() +if DEBUG >= 5: + from extra.helpers import enable_early_exec + early_exec = enable_early_exec() + class _CL: def __init__(self): platforms: List[List[cl.Device]] = [y for y in ([x.get_devices(device_type=cl.device_type.GPU) for x in cl.get_platforms()] + [x.get_devices(device_type=cl.device_type.CPU) for x in cl.get_platforms()]) if len(y)] @@ -54,6 +59,9 @@ class CLProgram: if 'Adreno' in CL.cl_ctx.devices[0].name: from disassemblers.adreno import disasm disasm(self.binary()) + elif 'gfx1100' in CL.cl_ctx.devices[0].name: + asm = early_exec(([pathlib.Path(__file__).parent.parent.parent / "extra/rocm/build/llvm-project/bin/llvm-objdump", '-d', '-'], self.binary())) + print('\n'.join([x for x in asm.decode('utf-8').split("\n") if 's_code_end' not in x])) else: # print the PTX for NVIDIA. TODO: probably broken for everything else print(self.binary().decode('utf-8'))