ROCm IFU (integrate from upstream): lit test fixes

This commit is contained in:
Michael Melesse
2023-12-11 17:00:35 -06:00
parent 64a0924381
commit 50a6db3afd
5 changed files with 43 additions and 36 deletions

View File

@@ -1,15 +1,17 @@
// RUN: triton-opt %s -split-input-file --convert-triton-gpu-to-llvm="target=nvvm" --llvm-optimize-for-nvvm-target | FileCheck %s
// RUN: triton-opt %s -split-input-file --convert-triton-gpu-to-llvm=target=rocdl | FileCheck %s --check-prefixes=CHECK,GCN
// CHECK-LABEL: dedup_by_constancy_full
// CHECK-COUNT-5: llvm.add
// GCN-COUNT-26: llvm.add
// PTX-COUNT-5: llvm.add
// CHECK-NOT: llvm.add
// CHECK: llvm.icmp "slt"
// CHECK-NOT: llvm.icmp "slt"
// PTX-NOT: llvm.icmp "slt"
// CHECK: llvm.sdiv
// CHECK-NOT: llvm.sdiv
// CHECK: llvm.getelementptr %arg0[[[REGISTER:%[0-9]+]]]
// CHECK-COUNT-7: llvm.getelementptr %arg0[[[REGISTER]]]
// CHECK-NOT: llvm.getelementptr %arg0[[[REGISTER]]]
// PTX-NOT: llvm.sdiv
// PTX: llvm.getelementptr %arg0[[[REGISTER:%[0-9]+]]]
// PTX-COUNT-7: llvm.getelementptr %arg0[[[REGISTER]]]
// GCN-COUNT-16: llvm.getelementptr
// PTX-NOT: llvm.getelementptr %arg0[[[REGISTER]]]
#blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
module attributes {"triton_gpu.compute-capability" = 80 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
tt.func public @dedup_by_constancy_full(%arg0: !tt.ptr<f16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
@@ -36,18 +38,21 @@ module attributes {"triton_gpu.compute-capability" = 80 : i32, "triton_gpu.num-c
// -----
// CHECK-LABEL: dedup_by_constancy_partial
// CHECK-COUNT-8: llvm.add
// CHECK-NOT: llvm.add
// PTX-COUNT-8: llvm.add
// GCN-COUNT-26: llvm.add
// PTX-NOT: llvm.add
// CHECK: llvm.icmp "slt"
// CHECK-NOT: llvm.icmp "slt"
// CHECK-COUNT-2: llvm.sdiv
// CHECK-NOT: llvm.sdiv
// CHECK: llvm.getelementptr %arg0[[[REGISTER1:%[0-9]+]]]
// CHECK-COUNT-3: llvm.getelementptr %arg0[[[REGISTER1]]]
// CHECK-NOT: llvm.getelementptr %arg0[[[REGISTER1]]]
// CHECK: llvm.getelementptr %arg0[[[REGISTER2:%[0-9]+]]]
// CHECK-COUNT-3: llvm.getelementptr %arg0[[[REGISTER2]]]
// CHECK-NOT: llvm.getelementptr %arg0[[[REGISTER2]]]
// PTX-NOT: llvm.icmp "slt"
// PTX-COUNT-2: llvm.sdiv
// GCN-COUNT-8: llvm.sdiv
// PTX-NOT: llvm.sdiv
// PTX: llvm.getelementptr %arg0[[[REGISTER1:%[0-9]+]]]
// PTX-COUNT-3: llvm.getelementptr %arg0[[[REGISTER1]]]
// GCN-COUNT-16: llvm.getelementptr
// PTX-NOT: llvm.getelementptr %arg0[[[REGISTER1]]]
// PTX: llvm.getelementptr %arg0[[[REGISTER2:%[0-9]+]]]
// PTX-COUNT-3: llvm.getelementptr %arg0[[[REGISTER2]]]
// PTX-NOT: llvm.getelementptr %arg0[[[REGISTER2]]]
#blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
module attributes {"triton_gpu.compute-capability" = 80 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
tt.func public @dedup_by_constancy_partial(%arg0: !tt.ptr<f16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 8 : i32}) attributes {noinline = false} {

View File

@@ -26,7 +26,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
%6 = arith.divsi %0, %5 : i32
%7 = arith.muli %6, %c4_i32 : i32
%8 = arith.subi %2, %7 : i32
%9 = "triton_gpu.cmpi"(%8, %c4_i32) <{predicate = 2 : i64}> : (i32, i32) -> i1
%9 = arith.cmpi "slt", %8, %c4_i32: i32
%10 = arith.select %9, %8, %c4_i32 : i32
%11 = arith.remsi %0, %10 : i32
%12 = arith.addi %7, %11 : i32
@@ -104,9 +104,9 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
%84 = tt.broadcast %43 : (tensor<1x64xi32, #blocked1>) -> tensor<64x64xi32, #blocked1>
%85 = tt.addptr %83, %84 : tensor<64x64x!tt.ptr<f16>, #blocked1>, tensor<64x64xi32, #blocked1>
%86 = tt.splat %arg3 : (i32) -> tensor<64x1xi32, #blocked1>
%87 = "triton_gpu.cmpi"(%27, %86) <{predicate = 2 : i64}> : (tensor<64x1xi32, #blocked1>, tensor<64x1xi32, #blocked1>) -> tensor<64x1xi1, #blocked1>
%87 = arith.cmpi "slt", %27, %86 : tensor<64x1xi32, #blocked1>
%88 = tt.splat %arg4 : (i32) -> tensor<1x64xi32, #blocked1>
%89 = "triton_gpu.cmpi"(%43, %88) <{predicate = 2 : i64}> : (tensor<1x64xi32, #blocked1>, tensor<1x64xi32, #blocked1>) -> tensor<1x64xi1, #blocked1>
%89 = arith.cmpi "slt", %43, %88 : tensor<1x64xi32, #blocked1>
%90 = tt.broadcast %87 : (tensor<64x1xi1, #blocked1>) -> tensor<64x64xi1, #blocked1>
%91 = tt.broadcast %89 : (tensor<1x64xi1, #blocked1>) -> tensor<64x64xi1, #blocked1>
%92 = arith.andi %90, %91 : tensor<64x64xi1, #blocked1>

View File

@@ -1,4 +1,4 @@
// RUN: triton-opt %s -split-input-file --convert-triton-gpu-to-llvm="target=rocdl" 2>/dev/null | FileCheck --check-prefixes=CHECK,GCN %s
// RUN: triton-opt %s -split-input-file --convert-triton-gpu-to-llvm="target=rocdl" | FileCheck --check-prefixes=CHECK,GCN %s
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} {
// CHECK: llvm.func @test_empty_kernel(%arg0: i64, %arg1: !llvm.ptr<f16, 1>)
@@ -1654,7 +1654,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
// PTX-SAME: @$3 atom.global.gpu.relaxed.add.f32
// PTX: llvm.inline_asm
// PTX-SAME: @$3 atom.global.gpu.relaxed.add.f32
%0 = "tt.atomic_rmw" (%arg0, %arg2, %arg1) {atomic_rmw_op = 5 : i32, sem = 1, scope = 1 : i32} : (tensor<256x!tt.ptr<f32>, #blocked0>, tensor<256xf32, #blocked0>, tensor<256xi1, #blocked0>) -> tensor<256xf32, #blocked0>
%0 = "tt.atomic_rmw" (%arg0, %arg2, %arg1) {atomic_rmw_op = 5 : i32, sem = 1 : i32, scope = 1 : i32} : (tensor<256x!tt.ptr<f32>, #blocked0>, tensor<256xf32, #blocked0>, tensor<256xi1, #blocked0>) -> tensor<256xf32, #blocked0>
tt.return
}
}
@@ -1669,7 +1669,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
// PTX: llvm.icmp "eq"
// PTX: llvm.inline_asm
// PTX-SAME: @$3 atom.global.gpu.relaxed.add.f32
%0 = "tt.atomic_rmw" (%arg0, %arg2, %arg1) {atomic_rmw_op = 5 : i32, sem = 1, scope = 1: i32} : (!tt.ptr<f32>, f32, i1) -> f32
%0 = "tt.atomic_rmw" (%arg0, %arg2, %arg1) {atomic_rmw_op = 5 : i32, sem = 1: i32 , scope = 1 : i32} : (!tt.ptr<f32>, f32, i1) -> f32
tt.return
}
}
@@ -2115,7 +2115,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 :
// PTX: llvm.inline_asm
// PTX-SAME: @$3 atom.global.gpu.add.noftz.f16x2
// GCN-COUNT-8: llvm.atomicrmw fadd {{.*}} monotonic : !llvm.ptr<f16, 1>, f16
%8 = "tt.atomic_rmw"(%5, %6, %7) {atomic_rmw_op = 5 : i32, sem = 1: i32} : (tensor<32x32x!tt.ptr<f16>, #blocked>, tensor<32x32xf16, #blocked>, tensor<32x32xi1, #blocked>) -> tensor<32x32xf16, #blocked>
%8 = "tt.atomic_rmw"(%5, %6, %7) {atomic_rmw_op = 5 : i32, sem = 1: i32, scope = 1: i32} : (tensor<32x32x!tt.ptr<f16>, #blocked>, tensor<32x32xf16, #blocked>, tensor<32x32xi1, #blocked>) -> tensor<32x32xf16, #blocked>
tt.return
}
}