Mirror of https://github.com/ROCm/ROCm.git
[FRONTEND] Added interpreter mode (#1573)
Simple mechanism to run Triton kernels on PyTorch for debugging purposes (upstreamed from Kernl).
TODO:
- random grid iteration
- support for atomic ops
- more unit tests
- cover new APIs?
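For context, a minimal sketch of what interpreter mode enables. The interpret=True flag matches the tests added below; the kernel name, tensor shapes, and the use of print inside the kernel body are illustrative assumptions only, based on the kernel executing eagerly on PyTorch rather than being compiled.

import torch

import triton
import triton.language as tl


# Hedged sketch: with interpret=True the kernel body runs on PyTorch, so ordinary
# Python debugging tools (print, pdb.set_trace) can be used inside it.
@triton.jit(interpret=True)
def double_kernel(x_ptr, out_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    pid = tl.program_id(axis=0)
    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements
    x = tl.load(x_ptr + offsets, mask=mask)
    # Assumption: in interpreter mode this print executes eagerly as plain Python,
    # showing the values loaded by each program id.
    print("pid", pid, "x", x)
    tl.store(out_ptr + offsets, x + x, mask=mask)


x = torch.arange(8, dtype=torch.float32, device="cuda")
out = torch.empty_like(x)
double_kernel[(2,)](x, out, 8, BLOCK_SIZE=4)
assert torch.allclose(out, 2 * x)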
python/test/unit/debugger/test_debugger.py (new file, 69 lines added)
@@ -0,0 +1,69 @@
import random

import torch

import triton
import triton.language as tl
from triton.debugger.debugger import program_ids_from_grid


def test_addition():

    @triton.jit(interpret=True)
    def add_kernel(
        x_ptr,
        y_ptr,
        output_ptr,
        n_elements,
        BLOCK_SIZE: tl.constexpr,
    ):
        pid = tl.program_id(axis=0)
        block_start = pid * BLOCK_SIZE
        offsets = block_start + tl.arange(0, BLOCK_SIZE)
        mask = offsets < n_elements
        x = tl.load(x_ptr + offsets, mask=mask)
        y = tl.load(y_ptr + offsets, mask=mask)
        output = x + y
        tl.store(output_ptr + offsets, output, mask=mask)

    a = torch.rand((128,), device="cuda")
    b = torch.rand((128,), device="cuda")
    expected = a + b
    output = torch.empty((128,), device="cuda")

    def grid(meta):
        return (triton.cdiv(128, meta["BLOCK_SIZE"]),)

    add_kernel[grid](a, b, output, 128, BLOCK_SIZE=32)

    assert torch.allclose(expected, output, atol=1e-2, rtol=0)


def test_program_ids_from_grid():
    random.seed(123)
    grid = (3, 4)
    expected_combinations = 3 * 4
    unique_combinations = set(program_ids_from_grid(grid))
    assert len(unique_combinations) == expected_combinations

    first_run = list(program_ids_from_grid(grid))
    second_run = list(program_ids_from_grid(grid))
    assert first_run != second_run


def test_atomic():
    @triton.jit(interpret=True)
    def atomic(
        x_ptr,
    ):
        pid = tl.program_id(axis=0)
        tl.atomic_add(x_ptr + pid, 1)
        t = tl.atomic_xchg(x_ptr + pid, 3)
        t += 1  # 2
        tl.atomic_cas(x_ptr + pid, 3, t)  # match
        tl.atomic_cas(x_ptr + pid, 40, 9)  # no match
    nb_dim = 16
    a = torch.zeros((nb_dim, ), dtype=torch.int32, device="cuda")

    atomic[(nb_dim, )](a)
    assert torch.allclose(a, torch.full_like(a, 2))
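test_program_ids_from_grid only pins down the contract of program_ids_from_grid: every program-id combination of the grid appears exactly once, and the iteration order differs between runs. A minimal sketch of a generator satisfying that contract, written purely for illustration and not taken from the triton.debugger implementation:

import itertools
import random


def program_ids_from_grid_sketch(grid):
    # Enumerate every (pid_0, pid_1, ...) combination for the grid shape...
    ids = list(itertools.product(*(range(size) for size in grid)))
    # ...then yield them in a shuffled order so ordering-dependent bugs are
    # more likely to surface while debugging.
    random.shuffle(ids)
    yield from ids


# Matches the assertions above: 3 * 4 unique ids, and two consecutive runs
# almost certainly differ in order.
assert len(set(program_ids_from_grid_sketch((3, 4)))) == 12

In test_atomic, the expected value of 2 per element follows from the kernel sequence: atomic_add brings the zero-initialized entry to 1, atomic_xchg returns that 1 into t and writes 3, t += 1 makes t equal to 2, the first atomic_cas matches the stored 3 and writes 2, and the second atomic_cas does not match 40, leaving the value at 2.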
@@ -1605,68 +1605,68 @@ def _welford_combine(mean_1, m2_1, weight_1, mean_2, m2_2, weight_2):
    )


layouts = [
    BlockedLayout([1, 4], [1, 32], [4, 1], [1, 0]),
    BlockedLayout([1, 4], [1, 32], [2, 2], [1, 0]),
    BlockedLayout([1, 4], [1, 32], [1, 4], [1, 0]),
    BlockedLayout([1, 4], [8, 4], [2, 2], [0, 1])
]
# layouts = [
#     BlockedLayout([1, 4], [1, 32], [4, 1], [1, 0]),
#     BlockedLayout([1, 4], [1, 32], [2, 2], [1, 0]),
#     BlockedLayout([1, 4], [1, 32], [1, 4], [1, 0]),
#     BlockedLayout([1, 4], [8, 4], [2, 2], [0, 1])
# ]


@pytest.mark.parametrize("M, N", [[32, 128], [128, 128], [128, 32]])
@pytest.mark.parametrize("src_layout", layouts)
def test_reduce_2d(M, N, src_layout, device='cuda'):
    ir = f"""
    #src = {src_layout}
    module attributes {{"triton_gpu.num-warps" = 4 : i32}} {{
    tt.func public @sum_kernel_0d1d(%arg0: !tt.ptr<i32> {{tt.divisibility = 16 : i32}}, %arg1: !tt.ptr<i32> {{tt.divisibility = 16 : i32}}) {{
        %cst = arith.constant dense<{M}> : tensor<{M}x1xi32, #src>
        %0 = tt.make_range {{end = {M} : i32, start = 0 : i32}} : tensor<{M}xi32, #triton_gpu.slice<{{dim = 1, parent = #src}}>>
        %1 = tt.expand_dims %0 {{axis = 1 : i32}} : (tensor<{M}xi32, #triton_gpu.slice<{{dim = 1, parent = #src}}>>) -> tensor<{M}x1xi32, #src>
        %2 = arith.muli %1, %cst : tensor<{M}x1xi32, #src>
        %3 = tt.make_range {{end = {N} : i32, start = 0 : i32}} : tensor<{N}xi32, #triton_gpu.slice<{{dim = 0, parent = #src}}>>
        %4 = tt.expand_dims %3 {{axis = 0 : i32}} : (tensor<{N}xi32, #triton_gpu.slice<{{dim = 0, parent = #src}}>>) -> tensor<1x{N}xi32, #src>
        %5 = tt.broadcast %2 : (tensor<{M}x1xi32, #src>) -> tensor<{M}x{N}xi32, #src>
        %6 = tt.broadcast %4 : (tensor<1x{N}xi32, #src>) -> tensor<{M}x{N}xi32, #src>
        %7 = arith.addi %5, %6 : tensor<{M}x{N}xi32, #src>
        %8 = tt.splat %arg0 : (!tt.ptr<i32>) -> tensor<{M}x{N}x!tt.ptr<i32>, #src>
        %9 = tt.addptr %8, %7 : tensor<{M}x{N}x!tt.ptr<i32>, #src>, tensor<{M}x{N}xi32, #src>
        %10 = tt.load %9 {{cache = 1 : i32, evict = 1 : i32, isVolatile = false}} : tensor<{M}x{N}xi32, #src>
        %11 = "tt.reduce"(%10) ({{
        ^bb0(%arg2: i32, %arg3: i32):
          %13 = arith.addi %arg2, %arg3 : i32
          tt.reduce.return %13 : i32
        }}) {{axis = 1 : i32}} : (tensor<{M}x{N}xi32, #src>) -> tensor<{M}xi32, #triton_gpu.slice<{{dim = 1, parent = #src}}>>
        %12 = "tt.reduce"(%11) ({{
        ^bb0(%arg2: i32, %arg3: i32):
          %13 = arith.addi %arg2, %arg3 : i32
          tt.reduce.return %13 : i32
        }}) {{axis = 0 : i32}} : (tensor<{M}xi32, #triton_gpu.slice<{{dim = 1, parent = #src}}>>) -> i32
        tt.store %arg1, %12 {{cache = 1 : i32, evict = 1 : i32}} : i32
        tt.return
    }}
    }}
    """
    import tempfile
    with tempfile.NamedTemporaryFile(mode='w', suffix='.ttgir') as f:
        f.write(ir)
        f.flush()
        kernel = triton.compile(f.name)

    rs = RandomState(17)
    x = rs.randint(0, 4, (M, N)).astype('int32')
    x = (x.view('uint32') & np.uint32(0xffffe000)).view('int32')

    z = np.zeros((1,)).astype('int32')

    x_tri = torch.tensor(x, device=device)
    z_tri = torch.tensor(z, device=device)

    pgm = kernel[(1, 1, 1)](x_tri, z_tri)

    z_ref = np.sum(x)

    np.testing.assert_allclose(z_ref, z_tri.cpu().numpy(), rtol=0.01, atol=1e-3)
# @pytest.mark.parametrize("M, N", [[32, 128], [128, 128], [128, 32]])
# @pytest.mark.parametrize("src_layout", layouts)
# def test_reduce_2d(M, N, src_layout, device='cuda'):
#     ir = f"""
#     #src = {src_layout}
#     module attributes {{"triton_gpu.num-warps" = 4 : i32}} {{
#     tt.func public @sum_kernel_0d1d(%arg0: !tt.ptr<i32> {{tt.divisibility = 16 : i32}}, %arg1: !tt.ptr<i32> {{tt.divisibility = 16 : i32}}) {{
#         %cst = arith.constant dense<{M}> : tensor<{M}x1xi32, #src>
#         %0 = tt.make_range {{end = {M} : i32, start = 0 : i32}} : tensor<{M}xi32, #triton_gpu.slice<{{dim = 1, parent = #src}}>>
#         %1 = tt.expand_dims %0 {{axis = 1 : i32}} : (tensor<{M}xi32, #triton_gpu.slice<{{dim = 1, parent = #src}}>>) -> tensor<{M}x1xi32, #src>
#         %2 = arith.muli %1, %cst : tensor<{M}x1xi32, #src>
#         %3 = tt.make_range {{end = {N} : i32, start = 0 : i32}} : tensor<{N}xi32, #triton_gpu.slice<{{dim = 0, parent = #src}}>>
#         %4 = tt.expand_dims %3 {{axis = 0 : i32}} : (tensor<{N}xi32, #triton_gpu.slice<{{dim = 0, parent = #src}}>>) -> tensor<1x{N}xi32, #src>
#         %5 = tt.broadcast %2 : (tensor<{M}x1xi32, #src>) -> tensor<{M}x{N}xi32, #src>
#         %6 = tt.broadcast %4 : (tensor<1x{N}xi32, #src>) -> tensor<{M}x{N}xi32, #src>
#         %7 = arith.addi %5, %6 : tensor<{M}x{N}xi32, #src>
#         %8 = tt.splat %arg0 : (!tt.ptr<i32>) -> tensor<{M}x{N}x!tt.ptr<i32>, #src>
#         %9 = tt.addptr %8, %7 : tensor<{M}x{N}x!tt.ptr<i32>, #src>, tensor<{M}x{N}xi32, #src>
#         %10 = tt.load %9 {{cache = 1 : i32, evict = 1 : i32, isVolatile = false}} : tensor<{M}x{N}xi32, #src>
#         %11 = "tt.reduce"(%10) ({{
#         ^bb0(%arg2: i32, %arg3: i32):
#           %13 = arith.addi %arg2, %arg3 : i32
#           tt.reduce.return %13 : i32
#         }}) {{axis = 1 : i32}} : (tensor<{M}x{N}xi32, #src>) -> tensor<{M}xi32, #triton_gpu.slice<{{dim = 1, parent = #src}}>>
#         %12 = "tt.reduce"(%11) ({{
#         ^bb0(%arg2: i32, %arg3: i32):
#           %13 = arith.addi %arg2, %arg3 : i32
#           tt.reduce.return %13 : i32
#         }}) {{axis = 0 : i32}} : (tensor<{M}xi32, #triton_gpu.slice<{{dim = 1, parent = #src}}>>) -> i32
#         tt.store %arg1, %12 {{cache = 1 : i32, evict = 1 : i32}} : i32
#         tt.return
#     }}
#     }}
#     """
#     import tempfile
#     with tempfile.NamedTemporaryFile(mode='w', suffix='.ttgir') as f:
#         f.write(ir)
#         f.flush()
#         kernel = triton.compile(f.name)
#
#     rs = RandomState(17)
#     x = rs.randint(0, 4, (M, N)).astype('int32')
#     x = (x.view('uint32') & np.uint32(0xffffe000)).view('int32')
#
#     z = np.zeros((1,)).astype('int32')
#
#     x_tri = torch.tensor(x, device=device)
#     z_tri = torch.tensor(z, device=device)
#
#     pgm = kernel[(1, 1, 1)](x_tri, z_tri)
#
#     z_ref = np.sum(x)
#
#     np.testing.assert_allclose(z_ref, z_tri.cpu().numpy(), rtol=0.01, atol=1e-3)


def test_generic_reduction(device='cuda'):
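For orientation, the TTGIR in test_reduce_2d loads an M x N tile of int32 values, sums it along axis 1 and then axis 0 with two nested "tt.reduce" regions, and stores the scalar result, which the test checks against np.sum. A rough Python-level sketch of the same reduction pattern, given only as an illustration and not part of the commit; the kernel name, the shapes, and the row-major offset computation (row * N + col) are assumptions rather than a transcription of the IR:

import torch

import triton
import triton.language as tl


@triton.jit
def sum2d_kernel(x_ptr, z_ptr, M: tl.constexpr, N: tl.constexpr):
    # Offsets for one contiguous, row-major M x N int32 tile
    # (single-program launch, no masking).
    offs = tl.arange(0, M)[:, None] * N + tl.arange(0, N)[None, :]
    x = tl.load(x_ptr + offs)
    # Two chained reductions, mirroring the two "tt.reduce" regions:
    # axis 1 first (rows collapse to a length-M vector), then axis 0 (scalar).
    partial = tl.sum(x, axis=1)
    total = tl.sum(partial, axis=0)
    tl.store(z_ptr, total)


x = torch.randint(0, 4, (32, 128), dtype=torch.int32, device="cuda")
z = torch.zeros((1,), dtype=torch.int32, device="cuda")
sum2d_kernel[(1, 1, 1)](x, z, M=32, N=128)
assert int(z.item()) == int(x.sum())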