Mirror of https://github.com/tinygrad/tinygrad.git (synced 2026-01-10 23:48:01 -05:00)
add support for "shared_mem" for LLVM (#10093)

* init llvm shared
* add test_tensor_cores_emulation run for llvm

.github/workflows/benchmark.yml
@@ -63,11 +63,10 @@ jobs:
       run: BIG=2 MPS=1 python3.11 test/test_speed_v_torch.py | tee torch_speed.txt
     - name: Test tensor cores
       run: METAL=1 python3.11 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_emulation TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops
-      # TODO: add TestLinearizer.test_tensor_cores_emulation for llvm (#10093)
     - name: Test AMX tensor cores
       run: |
         DEBUG=2 CPU=1 AMX=1 python3.11 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_emulation TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops
-        DEBUG=2 LLVM=1 AMX=1 python3.11 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops
+        DEBUG=2 LLVM=1 AMX=1 python3.11 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_emulation TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops
     - name: Run Tensor Core GEMM (float)
       run: DEBUG=2 SHOULD_USE_TC=1 python3.11 extra/gemm/simple_matmul.py | tee matmul.txt
     - name: Run Tensor Core GEMM (half)
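
The added coverage can be reproduced locally with the same command the workflow uses, narrowed to the one test, e.g. DEBUG=2 LLVM=1 AMX=1 python3.11 test/test_linearizer.py TestLinearizer.test_tensor_cores_emulation.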

tinygrad/renderer/llvmir.py
@@ -8,7 +8,7 @@ from tinygrad.helpers import prod, AMX
 
 def ldt(dt:DType):
   if dt.vcount > 1: return f"<{dt.vcount} x {ldt(dt.scalar())}>"
-  if isinstance(dt, PtrDType): return ldt(dt.base) + (" addrspace(3)*" if dt.local else "*")
+  if isinstance(dt, PtrDType): return ldt(dt.base) + "*"
   return {dtypes.void: "void", dtypes.bool: "i1", dtypes.int8: "i8", dtypes.int16: "i16", dtypes.int32: "i32", dtypes.int64: "i64",
           dtypes.uint8: "i8", dtypes.uint16: "i16", dtypes.uint32: "i32", dtypes.uint64: "i64",
           dtypes.float16: "half", dtypes.bfloat16: "bfloat", dtypes.float32: "float", dtypes.float64: "double"}[dt]
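
As a quick illustration (not part of the commit), the type strings ldt now yields for a few dtypes; the import path and the ptr(local=True) constructor are assumptions about the surrounding tinygrad API:

    # Illustrative sketch only: assumes ldt is importable from tinygrad.renderer.llvmir
    # and that DType.ptr(local=True) builds a local PtrDType (as dt.local above suggests).
    from tinygrad.dtype import dtypes
    from tinygrad.renderer.llvmir import ldt

    print(ldt(dtypes.float32))                  # float
    print(ldt(dtypes.float32.vec(4)))           # <4 x float>
    print(ldt(dtypes.float32.ptr()))            # float*
    print(ldt(dtypes.float32.ptr(local=True)))  # float* (was "float addrspace(3)*" before this change)
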
@@ -107,6 +107,8 @@ base_rewrite = PatternMatcher([
   # if
   (UPat(Ops.IF, name="x"), lambda ctx,x: f" br i1 {ctx[x.src[0]]}, label %ifbody_{ctx[x][1:]}, label %ifskip_{ctx[x][1:]}\nifbody_{ctx[x][1:]}:"),
   (UPat(Ops.ENDIF, name="x"), lambda ctx,x: f" br label %ifskip_{ctx[x.src[0]][1:]}\nifskip_{ctx[x.src[0]][1:]}:"),
+  (UPat(Ops.BARRIER), lambda ctx: "")
 ])
 
 def llvm_bf16_cast(buf:UOp, idx:UOp, root:UOp):
@@ -118,7 +120,6 @@ class LLVMRenderer(Renderer):
   abi = 'win64cc' if sys.platform == 'win32' else None
   supports_float4 = True
   has_local = False
-  has_shared = False
   global_max: tuple[int, ...] | None = None
   string_rewrite = base_rewrite + PatternMatcher([(UPat(Ops.WMMA, name="wmma"), render_wmma_amx)])
   if AMX: tensor_cores = ClangRenderer.amx_tc
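
A quick way to see the effect of dropping the override (assumption: the Renderer base class defaults has_shared to True, which is what makes removing the line meaningful):

    # Hypothetical check: with `has_shared = False` gone, LLVMRenderer should report
    # shared-memory support inherited from the Renderer base class.
    from tinygrad.renderer.llvmir import LLVMRenderer
    print(LLVMRenderer.has_shared)  # expected: True after this change
    print(LLVMRenderer.has_local)   # still False: the CPU LLVM backend has no local work-item dims
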
@@ -172,9 +173,12 @@ class LLVMRenderer(Renderer):
         # NOTE: MallocAllocator promises 0x20 alignment
         args.append(f"{ldt(u.dtype)}{' noalias align 32' if isinstance(u.dtype, PtrDType) else ''} {r[u]}")
       elif u.op == Ops.DEFINE_LOCAL:
-        r[u] = f"@local_{u.arg}"
+        r[u] = f"%local_{u.arg}"
         assert isinstance(u.dtype, PtrDType)
-        local_args.append(f"{r[u]} = internal unnamed_addr addrspace(3) global [{u.dtype.size} x {ldt(u.dtype)}] undef, align 16")
+        if self.device == "LLVM": kernel.append(f" {r[u]} = alloca [{u.dtype.size} x {ldt(u.dtype)}], align 16")
+        else:
+          local_args.append(f"@{r[u][1:]} = internal unnamed_addr addrspace(3) global [{u.dtype.size} x {ldt(u.dtype)}] undef, align 16")
+          kernel.append(f" {r[u]} = addrspacecast [{u.dtype.size} x {ldt(u.dtype)}] addrspace(3)* @{r[u][1:]} to [{u.dtype.size} x {ldt(u.dtype)}]*")
       elif u.op is Ops.ASSIGN: pass # assign is already handled by the first pass
       elif u.op is Ops.DEFINE_ACC: r[u] = r[u.src[0]] # a define acc can be used and never be assigned to
       elif u.op is Ops.CONST: r[u] = lconst(u.arg, u.dtype)
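
For concreteness, a small sketch (not from the commit) of the IR text the two branches above emit, using hypothetical example values: a local buffer with arg "temp", 256 elements, and an element type that renders as float:

    # Illustrative only: mirrors the f-strings above with concrete example values.
    name, size, elem = "local_temp", 256, "float"

    # device == "LLVM" (CPU): the buffer is a plain stack allocation inside the kernel body
    print(f" %{name} = alloca [{size} x {elem}], align 16")

    # any other device (e.g. AMD): an addrspace(3) (LDS) module-level global, cast to a generic pointer
    print(f"@{name} = internal unnamed_addr addrspace(3) global [{size} x {elem}] undef, align 16")
    print(f" %{name} = addrspacecast [{size} x {elem}] addrspace(3)* @{name} to [{size} x {elem}]*")
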
@@ -215,7 +219,6 @@ code_for_workitem = {"g": lambda x: f"tail call i32 @llvm.amdgcn.workgroup.id.{c
 class AMDLLVMRenderer(LLVMRenderer):
   device = "AMD"
   has_local = True
-  has_shared = True
   shared_max = AMDRenderer.shared_max
   global_max = AMDRenderer.global_max
   abi = "amdgpu_kernel"
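
Dropping the explicit has_shared = True here mirrors the base-class change above: once LLVMRenderer stops forcing has_shared = False, AMDLLVMRenderer presumably just inherits shared-memory support from the Renderer default, so the override is redundant.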