Mirror of https://github.com/tinygrad/tinygrad.git (synced 2026-01-10 23:48:01 -05:00)
add support for "shared_mem" for LLVM (#10093)

* init llvm shared
* add test_tensor_cores_emulation run for llvm

.github/workflows/benchmark.yml
@@ -63,11 +63,10 @@ jobs:
       run: BIG=2 MPS=1 python3.11 test/test_speed_v_torch.py | tee torch_speed.txt
     - name: Test tensor cores
       run: METAL=1 python3.11 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_emulation TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops
-      # TODO: add TestLinearizer.test_tensor_cores_emulation for llvm (#10093)
     - name: Test AMX tensor cores
       run: |
         DEBUG=2 CPU=1 AMX=1 python3.11 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_emulation TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops
-        DEBUG=2 LLVM=1 AMX=1 python3.11 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops
+        DEBUG=2 LLVM=1 AMX=1 python3.11 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_emulation TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops
     - name: Run Tensor Core GEMM (float)
       run: DEBUG=2 SHOULD_USE_TC=1 python3.11 extra/gemm/simple_matmul.py | tee matmul.txt
     - name: Run Tensor Core GEMM (half)
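
The added coverage can be reproduced locally with the same command the workflow uses, narrowed to the one test, e.g. DEBUG=2 LLVM=1 AMX=1 python3.11 test/test_linearizer.py TestLinearizer.test_tensor_cores_emulation.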

tinygrad/renderer/llvmir.py
@@ -8,7 +8,7 @@ from tinygrad.helpers import prod, AMX
 
 def ldt(dt:DType):
   if dt.vcount > 1: return f"<{dt.vcount} x {ldt(dt.scalar())}>"
-  if isinstance(dt, PtrDType): return ldt(dt.base) + (" addrspace(3)*" if dt.local else "*")
+  if isinstance(dt, PtrDType): return ldt(dt.base) + "*"
   return {dtypes.void: "void", dtypes.bool: "i1", dtypes.int8: "i8", dtypes.int16: "i16", dtypes.int32: "i32", dtypes.int64: "i64",
           dtypes.uint8: "i8", dtypes.uint16: "i16", dtypes.uint32: "i32", dtypes.uint64: "i64",
           dtypes.float16: "half", dtypes.bfloat16: "bfloat", dtypes.float32: "float", dtypes.float64: "double"}[dt]
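
As a quick illustration (not part of the commit), the type strings ldt now yields for a few dtypes; the import path and the ptr(local=True) constructor are assumptions about the surrounding tinygrad API:

    # Illustrative sketch only: assumes ldt is importable from tinygrad.renderer.llvmir
    # and that DType.ptr(local=True) builds a local PtrDType (as dt.local above suggests).
    from tinygrad.dtype import dtypes
    from tinygrad.renderer.llvmir import ldt

    print(ldt(dtypes.float32))                  # float
    print(ldt(dtypes.float32.vec(4)))           # <4 x float>
    print(ldt(dtypes.float32.ptr()))            # float*
    print(ldt(dtypes.float32.ptr(local=True)))  # float* (was "float addrspace(3)*" before this change)
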
@@ -107,6 +107,8 @@ base_rewrite = PatternMatcher([
   # if
   (UPat(Ops.IF, name="x"), lambda ctx,x: f" br i1 {ctx[x.src[0]]}, label %ifbody_{ctx[x][1:]}, label %ifskip_{ctx[x][1:]}\nifbody_{ctx[x][1:]}:"),
   (UPat(Ops.ENDIF, name="x"), lambda ctx,x: f" br label %ifskip_{ctx[x.src[0]][1:]}\nifskip_{ctx[x.src[0]][1:]}:"),
+  (UPat(Ops.BARRIER), lambda ctx: "")
 ])
 
 def llvm_bf16_cast(buf:UOp, idx:UOp, root:UOp):
@@ -118,7 +120,6 @@ class LLVMRenderer(Renderer):
   abi = 'win64cc' if sys.platform == 'win32' else None
   supports_float4 = True
   has_local = False
-  has_shared = False
   global_max: tuple[int, ...] | None = None
   string_rewrite = base_rewrite + PatternMatcher([(UPat(Ops.WMMA, name="wmma"), render_wmma_amx)])
   if AMX: tensor_cores = ClangRenderer.amx_tc
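
A quick way to see the effect of dropping the override (assumption: the Renderer base class defaults has_shared to True, which is what makes removing the line meaningful):

    # Hypothetical check: with `has_shared = False` gone, LLVMRenderer should report
    # shared-memory support inherited from the Renderer base class.
    from tinygrad.renderer.llvmir import LLVMRenderer
    print(LLVMRenderer.has_shared)  # expected: True after this change
    print(LLVMRenderer.has_local)   # still False: the CPU LLVM backend has no local work-item dims
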
@@ -172,9 +173,12 @@ class LLVMRenderer(Renderer):
         # NOTE: MallocAllocator promises 0x20 alignment
         args.append(f"{ldt(u.dtype)}{' noalias align 32' if isinstance(u.dtype, PtrDType) else ''} {r[u]}")
       elif u.op == Ops.DEFINE_LOCAL:
-        r[u] = f"@local_{u.arg}"
+        r[u] = f"%local_{u.arg}"
         assert isinstance(u.dtype, PtrDType)
-        local_args.append(f"{r[u]} = internal unnamed_addr addrspace(3) global [{u.dtype.size} x {ldt(u.dtype)}] undef, align 16")
+        if self.device == "LLVM": kernel.append(f" {r[u]} = alloca [{u.dtype.size} x {ldt(u.dtype)}], align 16")
+        else:
+          local_args.append(f"@{r[u][1:]} = internal unnamed_addr addrspace(3) global [{u.dtype.size} x {ldt(u.dtype)}] undef, align 16")
+          kernel.append(f" {r[u]} = addrspacecast [{u.dtype.size} x {ldt(u.dtype)}] addrspace(3)* @{r[u][1:]} to [{u.dtype.size} x {ldt(u.dtype)}]*")
       elif u.op is Ops.ASSIGN: pass # assign is already handled by the first pass
       elif u.op is Ops.DEFINE_ACC: r[u] = r[u.src[0]] # a define acc can be used and never be assigned to
       elif u.op is Ops.CONST: r[u] = lconst(u.arg, u.dtype)
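
For concreteness, a small sketch (not from the commit) of the IR text the two branches above emit, using hypothetical example values: a local buffer with arg "temp", 256 elements, and an element type that renders as float:

    # Illustrative only: mirrors the f-strings above with concrete example values.
    name, size, elem = "local_temp", 256, "float"

    # device == "LLVM" (CPU): the buffer is a plain stack allocation inside the kernel body
    print(f" %{name} = alloca [{size} x {elem}], align 16")

    # any other device (e.g. AMD): an addrspace(3) (LDS) module-level global, cast to a generic pointer
    print(f"@{name} = internal unnamed_addr addrspace(3) global [{size} x {elem}] undef, align 16")
    print(f" %{name} = addrspacecast [{size} x {elem}] addrspace(3)* @{name} to [{size} x {elem}]*")
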
@@ -215,7 +219,6 @@ code_for_workitem = {"g": lambda x: f"tail call i32 @llvm.amdgcn.workgroup.id.{c
 class AMDLLVMRenderer(LLVMRenderer):
   device = "AMD"
   has_local = True
-  has_shared = True
   shared_max = AMDRenderer.shared_max
   global_max = AMDRenderer.global_max
   abi = "amdgpu_kernel"
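
Dropping the explicit has_shared = True here mirrors the base-class change above: once LLVMRenderer stops forcing has_shared = False, AMDLLVMRenderer presumably just inherits shared-memory support from the Renderer default, so the override is redundant.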