[FRONTEND][BACKEND][OPTIMIZER] Loops now use 64-bit indices when necessary (#1261)

* Frontend: - `int` kernel arguments are always signed - Loop induction variable is now determine by integer promotion on lb/ub/step * Optimizer: - Added new ExtractSliceOp that enforces 32-bit offsets * Backend: - Use 64-bit indices when lowering functions and control flow - Removed `idx_val` macro and replaced it with `i32_val` - Cleaned up comments - Added new ArithToIndex pass to make sure operations on indices are done with the `index` dialect, that gets converted to LLVM separately using a 64-bit target
2026-04-05 03:01:17 -04:00 · 2023-03-01 23:09:48 -08:00
parent 90fcb38c7b
commit fa0fbc937f
32 changed files with 441 additions and 248 deletions
--- a/test/Analysis/test-alias.mlir
+++ b/test/Analysis/test-alias.mlir
@@ -93,9 +93,9 @@ func.func @insert_slice(%A : !tt.ptr<f16>, %i1 : i1) {
 func.func @extract_slice(%A : !tt.ptr<f16>) {
  // CHECK: %cst -> %cst
  %cst0 = arith.constant dense<0.000000e+00> : tensor<1x16x16xf16, #A_SHARED>
-  %index = arith.constant 0 : index
-  // CHECK-NEXT: %extracted_slice -> %cst
-  %cst1 = tensor.extract_slice %cst0[%index, 0, 0][1, 16, 16][1, 1, 1] : tensor<1x16x16xf16, #A_SHARED> to tensor<16x16xf16, #A_SHARED>
+  %index = arith.constant 0 : i32
+  // CHECK-NEXT: %0 -> %cst
+  %cst1 = triton_gpu.extract_slice %cst0[%index, 0, 0][1, 16, 16][1, 1, 1] : tensor<1x16x16xf16, #A_SHARED> to tensor<16x16xf16, #A_SHARED>
  return
 }

@@ -169,9 +169,9 @@ func.func @for_if(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B
  // CHECK-NEXT: %0#2 -> %cst,%cst_0
  %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>) {
    scf.if %i1 {
-      %index = arith.constant 8 : index
-      // CHECK-NEXT: %extracted_slice -> %cst,%cst_0
-      %cst0 = tensor.extract_slice %a_shared[%index, 0][1, 32][1, 1] : tensor<128x32xf16, #A_SHARED> to tensor<32xf16, #A_SHARED>
+      %index = arith.constant 8 : i32
+      // CHECK-NEXT: %1 -> %cst,%cst_0
+      %cst0 = triton_gpu.extract_slice %a_shared[%index, 0][1, 32][1, 1] : tensor<128x32xf16, #A_SHARED> to tensor<32xf16, #A_SHARED>
      scf.yield
    }
    scf.yield %b_shared, %a_shared, %a_shared : tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>
--- a/test/Analysis/test-allocation.mlir
+++ b/test/Analysis/test-allocation.mlir
@@ -200,8 +200,8 @@ func.func @insert_slice_async(%A : !tt.ptr<f16>, %i1 : i1) {
 func.func @extract_slice(%A : !tt.ptr<f16>) {
  // CHECK: offset = 0, size = 512
  %cst0 = arith.constant dense<0.000000e+00> : tensor<1x16x16xf16, #A_SHARED>
-  %index = arith.constant 0 : index
-  %cst1 = tensor.extract_slice %cst0[%index, 0, 0][1, 16, 16][1,1,1] : tensor<1x16x16xf16, #A_SHARED> to tensor<16x16xf16, #A_SHARED>
+  %index = arith.constant 0 : i32
+  %cst1 = triton_gpu.extract_slice %cst0[%index, 0, 0][1, 16, 16][1,1,1] : tensor<1x16x16xf16, #A_SHARED> to tensor<16x16xf16, #A_SHARED>
  return
  // CHECK-NEXT: size = 512
 }
@@ -284,8 +284,8 @@ func.func @for_if_slice(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f1
  %c_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED>
  %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>) {
    scf.if %i1 {
-      %index = arith.constant 8 : index
-      %cst0 = tensor.extract_slice %a_shared[%index, 0][1, 32][1, 1] : tensor<128x32xf16, #A_SHARED> to tensor<32xf16, #A_SHARED>
+      %index = arith.constant 8 : i32
+      %cst0 = triton_gpu.extract_slice %a_shared[%index, 0][1, 32][1, 1] : tensor<128x32xf16, #A_SHARED> to tensor<32xf16, #A_SHARED>
      scf.yield
    }
    scf.yield %b_shared, %a_shared, %a_shared : tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>
--- a/test/Analysis/test-membar.mlir
+++ b/test/Analysis/test-membar.mlir
@@ -109,8 +109,8 @@ func.func @alloc() {
 // CHECK-LABEL: extract_slice
 func.func @extract_slice() {
  %cst0 = arith.constant dense<0.000000e+00> : tensor<1x16x16xf16, #A_SHARED>
-  %index = arith.constant 0 : index
-  %0 = tensor.extract_slice %cst0[%index, 0, 0][1, 16, 16][1, 1, 1] : tensor<1x16x16xf16, #A_SHARED> to tensor<16x16xf16, #A_SHARED>
+  %index = arith.constant 0 : i32
+  %0 = triton_gpu.extract_slice %cst0[%index, 0, 0][1, 16, 16][1, 1, 1] : tensor<1x16x16xf16, #A_SHARED> to tensor<16x16xf16, #A_SHARED>
  // CHECK: gpu.barrier
  // CHECK-NEXT: triton_gpu.convert_layout
  %1 = triton_gpu.convert_layout %0 : (tensor<16x16xf16, #A_SHARED>) -> tensor<16x16xf16, #AL>