amd: apply relocs for kernel_code_entry_byte_offset for AMD_LLVM (#11816)

* amd: apply relocs for kernel_code_entry_byte_offset for AMD_LLVM * fix
2026-04-07 03:00:26 -04:00 · 2025-08-24 17:48:40 +03:00
parent 44bc7dc73d
commit d71444857e
1 changed files with 8 additions and 4 deletions
--- a/tinygrad/runtime/ops_amd.py
+++ b/tinygrad/runtime/ops_amd.py
@@ -427,13 +427,18 @@ class AMDProgram(HCQProgram):
    self.dev, self.name, self.lib = dev, name, lib

    image, sections, _ = elf_loader(self.lib)
-    self.lib_gpu = self.dev.allocator.alloc(round_up(image.nbytes, 0x1000), buf_spec:=BufferSpec(cpu_access=True, nolru=True))
-    self.dev.allocator._copyin(self.lib_gpu, image)
-    self.dev.synchronize()

    rodata_entry = next((sh.header.sh_addr for sh in sections if sh.name == ".rodata"), -1)
    text_entry = next((sh.header.sh_addr for sh in sections if sh.name == ".text"), -1)
    assert rodata_entry >= 0 and text_entry >= 0, ".text or .rodata section not found"
+
+    # Relo for kernel_code_entry_byte_offset for AMD_LLVM. Comgr doesn't need that, but keep shared code path.
+    image[rodata_entry+0x10:rodata_entry+0x10+8] = struct.pack('<q', text_entry - rodata_entry)
+
+    self.lib_gpu = self.dev.allocator.alloc(round_up(image.nbytes, 0x1000), buf_spec:=BufferSpec(cpu_access=True, nolru=True))
+    self.dev.allocator._copyin(self.lib_gpu, image)
+    self.dev.synchronize()
+
    self.group_segment_size = image[rodata_entry:rodata_entry+4].cast("I")[0]
    self.private_segment_size = image[rodata_entry+4:rodata_entry+8].cast("I")[0]
    self.kernargs_segment_size = image[rodata_entry+8:rodata_entry+12].cast("I")[0]
@@ -452,7 +457,6 @@ class AMDProgram(HCQProgram):
    self.rsrc2: int = code.compute_pgm_rsrc2 | (lds_size << 15)
    self.rsrc3: int = image[rodata_entry+44:rodata_entry+48].cast("I")[0] # NOTE: kernel descriptor, not in amd_kernel_code_t struct
    self.prog_addr: int = self.lib_gpu.va_addr + rodata_entry + code.kernel_code_entry_byte_offset
-    if code.kernel_code_entry_byte_offset == 0: self.prog_addr = self.lib_gpu.va_addr + text_entry
    # Some programs use hsa_kernel_dispatch_packet_t to read workgroup sizes during execution.
    # The packet is represented as a pointer and set up in SGPRs. Space for the packet is allocated as part of the kernel arguments.
    self.enable_dispatch_ptr: int = code.kernel_code_properties & hsa.AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR