diff --git a/extra/rocm/rdna3/asm.py b/extra/rocm/rdna3/asm.py index aa58c87a48..f980e10fa9 100644 --- a/extra/rocm/rdna3/asm.py +++ b/extra/rocm/rdna3/asm.py @@ -24,8 +24,10 @@ code = open(pathlib.Path(__file__).parent / "prog.s", "r").read() gen = [] FLOPS = 0 -for j in range(4): - for i in range(0, 251, 6): +#MAX_REG = 251 +MAX_REG = 32 +for j in range(1): + for i in range(0, MAX_REG, 6): #gen.append(f"v_dual_fmac_f32 v{i+0}, v{i+1}, v{i+2} :: v_dual_fmac_f32 v{i+3}, v{i+4}, v{i+5}") #FLOPS += 4 gen.append(f"v_dual_dot2acc_f32_f16 v{i+0}, v{i+1}, v{i+2} :: v_dual_dot2acc_f32_f16 v{i+3}, v{i+4}, v{i+5}") @@ -48,9 +50,10 @@ print(colored("creating CLProgram", "green")) prg = CLProgram("code", asm, binary=True) print(colored("running program", "green")) -FLOPS *= 100000*1024*1024 # loop * global_size +G = 256 +FLOPS *= 100000*G*G # loop * global_size for i in range(3): - tm = prg([1024, 1024], [256, 1], buf, wait=True) + tm = prg([G, G], [256, 1], buf, wait=True) print(f"ran in {tm*1e3:.2f} ms, {FLOPS/(tm*1e9):.2f} GFLOPS") print(colored("transferring buffer", "green")) diff --git a/extra/rocm/rdna3/codegen.py b/extra/rocm/rdna3/codegen.py new file mode 100644 index 0000000000..d6efd61b3b --- /dev/null +++ b/extra/rocm/rdna3/codegen.py @@ -0,0 +1,20 @@ +import numpy as np + +from tinygrad.runtime.ops_gpu import CLCodegen +from tinygrad.codegen.assembly import AssemblyCodegen + +from tinygrad.helpers import LazyNumpyArray, dtypes +from tinygrad.ops import LazyOp, BinaryOps +from tinygrad.lazy import LazyBuffer +from tinygrad.shape.shapetracker import ShapeTracker + +ones = LazyNumpyArray.from_np(np.ones((3,), np.float32)) + +#target = "GPU" +target = "RDNA" + +b1 = LazyBuffer.fromCPU(ones, target) +b2 = LazyBuffer.fromCPU(ones, target) + +out = LazyBuffer(target, ShapeTracker((3,)), BinaryOps, LazyOp(BinaryOps.ADD, (b1, b2)), dtypes.float32) +print(out.toCPU()) diff --git a/extra/rocm/rdna3/prog.s b/extra/rocm/rdna3/prog.s index 38efca5407..7a8eec67a2 100644 --- a/extra/rocm/rdna3/prog.s +++ b/extra/rocm/rdna3/prog.s @@ -21,9 +21,6 @@ code.kd: # kernel_code_properties |= AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR = 1 # kernel_code_properties |= AMD_KERNEL_CODE_PROPERTIES_RESERVED1 = 1 .text -.global code -.type code,STT_FUNC -code: # https://llvm.org/docs/AMDGPUUsage.html#initial-kernel-execution-state # s[0:1] contains the kernarg_address # TODO: can we use s[2:3] if this was really a wave since we only alloced 2 SGPRs? diff --git a/tinygrad/codegen/assembly.py b/tinygrad/codegen/assembly.py new file mode 100644 index 0000000000..92aff7ac09 --- /dev/null +++ b/tinygrad/codegen/assembly.py @@ -0,0 +1,103 @@ +from tinygrad.codegen.linearizer import Linearizer +from tinygrad.ops import ASTRunner +from tinygrad.runtime.ops_gpu import ROCM_LLVM_PATH + +# ugh, is this really needed? +from extra.helpers import enable_early_exec +early_exec = enable_early_exec() + +# https://github.com/ROCm-Developer-Tools/ROCm-ComputeABI-Doc/blob/master/AMDGPU-ABI.md#initial-kernel-register-state +# enable_sgpr_kernarg_segment_ptr +# enable_sgpr_grid_workgroup_count_X + +# amd_kernel_..., amd_machine_... +# kernel_code_entry_byte_offset, kernel_code_prefetch_byte_offset +# kernel_code_prefetch_byte_size, max_scratch_backing_memory_byte_size +# compute_pgm_rsrc1, compute_pgm_rsrc2, kernel_code_properties, workitem_private_segment_byte_size + +# TODO: generate this struct +boilerplate_start = """ +.global _start +_start: +.rodata +.align 0x10 +.global code.kd +.type code.kd,STT_OBJECT +code.kd: +.long 0,0,0,0 +.long 0x00000bc0,0x00000000,0x00000000,0x00000000 +.long 0,0,0,0 +.long 0x60af0000,0x0000009e,0x00000408,0x00000000 +.text +""" + +# TODO: generate this yaml +boilerplate_end = """ +.amdgpu_metadata +amdhsa.kernels: + - .args: + - .address_space: global + .name: a + .offset: 0 + .size: 8 + .type_name: 'float*' + .value_kind: global_buffer + - .address_space: global + .name: b + .offset: 0 + .size: 8 + .type_name: 'float*' + .value_kind: global_buffer + - .address_space: global + .name: c + .offset: 0 + .size: 8 + .type_name: 'float*' + .value_kind: global_buffer + .group_segment_fixed_size: 0 + .kernarg_segment_align: 8 + .kernarg_segment_size: 8 + .language: OpenCL C + .language_version: + - 1 + - 2 + .max_flat_workgroup_size: 256 + .name: code + .private_segment_fixed_size: 0 + .sgpr_count: 2 + .sgpr_spill_count: 0 + .symbol: code.kd + .uses_dynamic_stack: false + .vgpr_count: 256 + .vgpr_spill_count: 0 + .wavefront_size: 32 +amdhsa.target: amdgcn-amd-amdhsa--gfx1100 +amdhsa.version: + - 1 + - 2 +.end_amdgpu_metadata +""" + +class AssemblyCodegen(Linearizer): + supports_float4: bool = True + + # s registers are the addresses and non local indexes + def codegen(self): + self.process() + self.hand_coded_optimizations() + self.linearize() + + instructions = [] + + # exit asm + instructions += ['s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)', 's_endpgm', 's_code_end'] + + code = boilerplate_start + '\n'.join(instructions) + boilerplate_end + object = early_exec(([ROCM_LLVM_PATH / "llvm-mc", '--arch=amdgcn', '--mcpu=gfx1100', '--triple=amdgcn-amd-amdhsa', '--filetype=obj', '-'], code.encode("utf-8"))) + asm = early_exec(([ROCM_LLVM_PATH / "ld.lld", "/dev/stdin", "-o", "/dev/stdout", "--pie"], object)) + + global_size = [] + local_size = [] + return ASTRunner('code', asm, + global_size[::-1] if len(global_size) else [1], local_size[::-1] if len(local_size) else None, + op_estimate=self.info.flops, mem_estimate=self.mem_estimate, display_name=self.function_name, runtime_args={"binary": True}) diff --git a/tinygrad/helpers.py b/tinygrad/helpers.py index 3b10e512f9..2b7023fd54 100644 --- a/tinygrad/helpers.py +++ b/tinygrad/helpers.py @@ -46,7 +46,8 @@ class LazyNumpyArray: def reshape(self, new_shape): return LazyNumpyArray(self.fxn, new_shape, self.dtype) def copy(self): return self if callable(self.fxn) else LazyNumpyArray(self.fxn, self.shape, self.dtype) def astype(self, typ): return LazyNumpyArray(self.fxn, self.shape, typ) - + @staticmethod + def from_np(data): return LazyNumpyArray(data, data.shape, data.dtype) @dataclass class dtypes: diff --git a/tinygrad/ops.py b/tinygrad/ops.py index 4199c99f11..bde25bfc33 100644 --- a/tinygrad/ops.py +++ b/tinygrad/ops.py @@ -78,12 +78,12 @@ def get_lazyop_info(ast:LazyOp) -> FlopCounter: return InterpretedFlopCounter.ex # **************** for Compiled Buffers **************** class ASTRunner: - def __init__(self, name, prg, global_size:Optional[List[int]]=None, local_size:Optional[List[int]]=None, op_estimate=0, mem_estimate=0, display_name:Optional[str]=None): - if DEBUG >= 4: print(prg) - self.name, self.prg, self.global_size, self.local_size, self.op_estimate, self.mem_estimate, self.display_name = name, prg, global_size, local_size, op_estimate, mem_estimate, display_name + def __init__(self, name, prg, global_size:Optional[List[int]]=None, local_size:Optional[List[int]]=None, op_estimate=0, mem_estimate=0, display_name:Optional[str]=None, runtime_args={}): + if DEBUG >= 4 and 'binary' not in runtime_args: print(prg) + self.name, self.prg, self.global_size, self.local_size, self.op_estimate, self.mem_estimate, self.display_name, self.runtime_args = name, prg, global_size, local_size, op_estimate, mem_estimate, display_name, runtime_args def build(self, runtime): - self.clprg = runtime(self.name, self.prg) + self.clprg = runtime(self.name, self.prg, **self.runtime_args) return self def exec(self, bufs) -> Optional[float]: diff --git a/tinygrad/runtime/ops_gpu.py b/tinygrad/runtime/ops_gpu.py index c4ac38ece7..3f8ca45566 100644 --- a/tinygrad/runtime/ops_gpu.py +++ b/tinygrad/runtime/ops_gpu.py @@ -62,6 +62,8 @@ class CLProgram: from disassemblers.adreno import disasm disasm(self.binary()) elif 'gfx1100' in CL.cl_ctx.devices[0].name: + # NOTE: this can move, you have to read the ELF + #print(','.join([hex(x) for x in struct.unpack("I"*0x10, self.binary()[0x800:0x840])])) asm = early_exec(([ROCM_LLVM_PATH / "llvm-objdump", '-d', '-'], self.binary())) print('\n'.join([x for x in asm.decode('utf-8').split("\n") if 's_code_end' not in x])) else: diff --git a/tinygrad/runtime/ops_rdna.py b/tinygrad/runtime/ops_rdna.py new file mode 100644 index 0000000000..04faa755b2 --- /dev/null +++ b/tinygrad/runtime/ops_rdna.py @@ -0,0 +1,5 @@ +from tinygrad.ops import Compiled +from tinygrad.codegen.assembly import AssemblyCodegen +from tinygrad.runtime.ops_gpu import CLBuffer, CLProgram, CL + +RDNABuffer = Compiled(CLBuffer, AssemblyCodegen, CLProgram, CL.synchronize) diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py index d49c1ab06d..8723d48d92 100644 --- a/tinygrad/tensor.py +++ b/tinygrad/tensor.py @@ -42,7 +42,7 @@ class Tensor: data = data.realize().toCPU() # all ndarrays are lazy now - if isinstance(data, np.ndarray): data = LazyNumpyArray(data, data.shape, data.dtype) + if isinstance(data, np.ndarray): data = LazyNumpyArray.from_np(data) # by here, it's either LazyNumpyArray or LazyBuffer # TODO: it should all be LazyBuffer I think