[FRONTEND][BACKEND][OPTIMIZER] Loops now use 64-bit indices when necessary (#1261)

* Frontend:
  - `int` kernel arguments are always signed
- Loop induction variable is now determine by integer promotion on
lb/ub/step
* Optimizer:
  -  Added new ExtractSliceOp that enforces 32-bit offsets
* Backend:
    - Use 64-bit indices when lowering functions and control flow
    - Removed `idx_val` macro and replaced it with `i32_val`
    - Cleaned up comments
- Added new ArithToIndex pass to make sure operations on indices are
done with the `index` dialect, that gets converted to LLVM separately
using a 64-bit target
This commit is contained in:
Philippe Tillet
2023-03-01 23:09:48 -08:00
committed by GitHub
parent 90fcb38c7b
commit fa0fbc937f
32 changed files with 441 additions and 248 deletions

View File

@@ -93,9 +93,9 @@ func.func @insert_slice(%A : !tt.ptr<f16>, %i1 : i1) {
func.func @extract_slice(%A : !tt.ptr<f16>) {
// CHECK: %cst -> %cst
%cst0 = arith.constant dense<0.000000e+00> : tensor<1x16x16xf16, #A_SHARED>
%index = arith.constant 0 : index
// CHECK-NEXT: %extracted_slice -> %cst
%cst1 = tensor.extract_slice %cst0[%index, 0, 0][1, 16, 16][1, 1, 1] : tensor<1x16x16xf16, #A_SHARED> to tensor<16x16xf16, #A_SHARED>
%index = arith.constant 0 : i32
// CHECK-NEXT: %0 -> %cst
%cst1 = triton_gpu.extract_slice %cst0[%index, 0, 0][1, 16, 16][1, 1, 1] : tensor<1x16x16xf16, #A_SHARED> to tensor<16x16xf16, #A_SHARED>
return
}
@@ -169,9 +169,9 @@ func.func @for_if(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B
// CHECK-NEXT: %0#2 -> %cst,%cst_0
%a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>) {
scf.if %i1 {
%index = arith.constant 8 : index
// CHECK-NEXT: %extracted_slice -> %cst,%cst_0
%cst0 = tensor.extract_slice %a_shared[%index, 0][1, 32][1, 1] : tensor<128x32xf16, #A_SHARED> to tensor<32xf16, #A_SHARED>
%index = arith.constant 8 : i32
// CHECK-NEXT: %1 -> %cst,%cst_0
%cst0 = triton_gpu.extract_slice %a_shared[%index, 0][1, 32][1, 1] : tensor<128x32xf16, #A_SHARED> to tensor<32xf16, #A_SHARED>
scf.yield
}
scf.yield %b_shared, %a_shared, %a_shared : tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>

View File

@@ -200,8 +200,8 @@ func.func @insert_slice_async(%A : !tt.ptr<f16>, %i1 : i1) {
func.func @extract_slice(%A : !tt.ptr<f16>) {
// CHECK: offset = 0, size = 512
%cst0 = arith.constant dense<0.000000e+00> : tensor<1x16x16xf16, #A_SHARED>
%index = arith.constant 0 : index
%cst1 = tensor.extract_slice %cst0[%index, 0, 0][1, 16, 16][1,1,1] : tensor<1x16x16xf16, #A_SHARED> to tensor<16x16xf16, #A_SHARED>
%index = arith.constant 0 : i32
%cst1 = triton_gpu.extract_slice %cst0[%index, 0, 0][1, 16, 16][1,1,1] : tensor<1x16x16xf16, #A_SHARED> to tensor<16x16xf16, #A_SHARED>
return
// CHECK-NEXT: size = 512
}
@@ -284,8 +284,8 @@ func.func @for_if_slice(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f1
%c_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED>
%a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>) {
scf.if %i1 {
%index = arith.constant 8 : index
%cst0 = tensor.extract_slice %a_shared[%index, 0][1, 32][1, 1] : tensor<128x32xf16, #A_SHARED> to tensor<32xf16, #A_SHARED>
%index = arith.constant 8 : i32
%cst0 = triton_gpu.extract_slice %a_shared[%index, 0][1, 32][1, 1] : tensor<128x32xf16, #A_SHARED> to tensor<32xf16, #A_SHARED>
scf.yield
}
scf.yield %b_shared, %a_shared, %a_shared : tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>

View File

@@ -109,8 +109,8 @@ func.func @alloc() {
// CHECK-LABEL: extract_slice
func.func @extract_slice() {
%cst0 = arith.constant dense<0.000000e+00> : tensor<1x16x16xf16, #A_SHARED>
%index = arith.constant 0 : index
%0 = tensor.extract_slice %cst0[%index, 0, 0][1, 16, 16][1, 1, 1] : tensor<1x16x16xf16, #A_SHARED> to tensor<16x16xf16, #A_SHARED>
%index = arith.constant 0 : i32
%0 = triton_gpu.extract_slice %cst0[%index, 0, 0][1, 16, 16][1, 1, 1] : tensor<1x16x16xf16, #A_SHARED> to tensor<16x16xf16, #A_SHARED>
// CHECK: gpu.barrier
// CHECK-NEXT: triton_gpu.convert_layout
%1 = triton_gpu.convert_layout %0 : (tensor<16x16xf16, #A_SHARED>) -> tensor<16x16xf16, #AL>