mirror of
https://github.com/ROCm/ROCm.git
synced 2026-04-05 03:01:17 -04:00
ROCM IFU: Lit test fixes
This commit is contained in:
@@ -1,15 +1,17 @@
|
||||
// RUN: triton-opt %s -split-input-file --convert-triton-gpu-to-llvm="target=nvvm" --llvm-optimize-for-nvvm-target | FileCheck %s
|
||||
// RUN: triton-opt %s -split-input-file --convert-triton-gpu-to-llvm=target=rocdl | FileCheck %s --check-prefixes=CHECK,GCN
|
||||
|
||||
// CHECK-LABEL: dedup_by_constancy_full
|
||||
// CHECK-COUNT-5: llvm.add
|
||||
// GCN-COUNT-26: llvm.add
|
||||
// PTX-COUNT-5: llvm.add
|
||||
// CHECK-NOT: llvm.add
|
||||
// CHECK: llvm.icmp "slt"
|
||||
// CHECK-NOT: llvm.icmp "slt"
|
||||
// PTX-NOT: llvm.icmp "slt"
|
||||
// CHECK: llvm.sdiv
|
||||
// CHECK-NOT: llvm.sdiv
|
||||
// CHECK: llvm.getelementptr %arg0[[[REGISTER:%[0-9]+]]]
|
||||
// CHECK-COUNT-7: llvm.getelementptr %arg0[[[REGISTER]]]
|
||||
// CHECK-NOT: llvm.getelementptr %arg0[[[REGISTER]]]
|
||||
// PTX-NOT: llvm.sdiv
|
||||
// PTX: llvm.getelementptr %arg0[[[REGISTER:%[0-9]+]]]
|
||||
// PTX-COUNT-7: llvm.getelementptr %arg0[[[REGISTER]]]
|
||||
// GCN-COUNT-16: llvm.getelementptr
|
||||
// PTX-NOT: llvm.getelementptr %arg0[[[REGISTER]]]
|
||||
#blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
||||
module attributes {"triton_gpu.compute-capability" = 80 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
||||
tt.func public @dedup_by_constancy_full(%arg0: !tt.ptr<f16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
|
||||
@@ -36,18 +38,21 @@ module attributes {"triton_gpu.compute-capability" = 80 : i32, "triton_gpu.num-c
|
||||
// -----
|
||||
|
||||
// CHECK-LABEL: dedup_by_constancy_partial
|
||||
// CHECK-COUNT-8: llvm.add
|
||||
// CHECK-NOT: llvm.add
|
||||
// PTX-COUNT-8: llvm.add
|
||||
// GCN-COUNT-26: llvm.add
|
||||
// PTX-NOT: llvm.add
|
||||
// CHECK: llvm.icmp "slt"
|
||||
// CHECK-NOT: llvm.icmp "slt"
|
||||
// CHECK-COUNT-2: llvm.sdiv
|
||||
// CHECK-NOT: llvm.sdiv
|
||||
// CHECK: llvm.getelementptr %arg0[[[REGISTER1:%[0-9]+]]]
|
||||
// CHECK-COUNT-3: llvm.getelementptr %arg0[[[REGISTER1]]]
|
||||
// CHECK-NOT: llvm.getelementptr %arg0[[[REGISTER1]]]
|
||||
// CHECK: llvm.getelementptr %arg0[[[REGISTER2:%[0-9]+]]]
|
||||
// CHECK-COUNT-3: llvm.getelementptr %arg0[[[REGISTER2]]]
|
||||
// CHECK-NOT: llvm.getelementptr %arg0[[[REGISTER2]]]
|
||||
// PTX-NOT: llvm.icmp "slt"
|
||||
// PTX-COUNT-2: llvm.sdiv
|
||||
// GCN-COUNT-8: llvm.sdiv
|
||||
// PTX-NOT: llvm.sdiv
|
||||
// PTX: llvm.getelementptr %arg0[[[REGISTER1:%[0-9]+]]]
|
||||
// PTX-COUNT-3: llvm.getelementptr %arg0[[[REGISTER1]]]
|
||||
// GCN-COUNT-16: llvm.getelementptr
|
||||
// PTX-NOT: llvm.getelementptr %arg0[[[REGISTER1]]]
|
||||
// PTX: llvm.getelementptr %arg0[[[REGISTER2:%[0-9]+]]]
|
||||
// PTX-COUNT-3: llvm.getelementptr %arg0[[[REGISTER2]]]
|
||||
// PTX-NOT: llvm.getelementptr %arg0[[[REGISTER2]]]
|
||||
#blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
||||
module attributes {"triton_gpu.compute-capability" = 80 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
||||
tt.func public @dedup_by_constancy_partial(%arg0: !tt.ptr<f16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
|
||||
|
||||
@@ -26,7 +26,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
|
||||
%6 = arith.divsi %0, %5 : i32
|
||||
%7 = arith.muli %6, %c4_i32 : i32
|
||||
%8 = arith.subi %2, %7 : i32
|
||||
%9 = "triton_gpu.cmpi"(%8, %c4_i32) <{predicate = 2 : i64}> : (i32, i32) -> i1
|
||||
%9 = arith.cmpi "slt", %8, %c4_i32: i32
|
||||
%10 = arith.select %9, %8, %c4_i32 : i32
|
||||
%11 = arith.remsi %0, %10 : i32
|
||||
%12 = arith.addi %7, %11 : i32
|
||||
@@ -104,9 +104,9 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
|
||||
%84 = tt.broadcast %43 : (tensor<1x64xi32, #blocked1>) -> tensor<64x64xi32, #blocked1>
|
||||
%85 = tt.addptr %83, %84 : tensor<64x64x!tt.ptr<f16>, #blocked1>, tensor<64x64xi32, #blocked1>
|
||||
%86 = tt.splat %arg3 : (i32) -> tensor<64x1xi32, #blocked1>
|
||||
%87 = "triton_gpu.cmpi"(%27, %86) <{predicate = 2 : i64}> : (tensor<64x1xi32, #blocked1>, tensor<64x1xi32, #blocked1>) -> tensor<64x1xi1, #blocked1>
|
||||
%87 = arith.cmpi "slt", %27, %86 : tensor<64x1xi32, #blocked1>
|
||||
%88 = tt.splat %arg4 : (i32) -> tensor<1x64xi32, #blocked1>
|
||||
%89 = "triton_gpu.cmpi"(%43, %88) <{predicate = 2 : i64}> : (tensor<1x64xi32, #blocked1>, tensor<1x64xi32, #blocked1>) -> tensor<1x64xi1, #blocked1>
|
||||
%89 = arith.cmpi "slt", %43, %88 : tensor<1x64xi32, #blocked1>
|
||||
%90 = tt.broadcast %87 : (tensor<64x1xi1, #blocked1>) -> tensor<64x64xi1, #blocked1>
|
||||
%91 = tt.broadcast %89 : (tensor<1x64xi1, #blocked1>) -> tensor<64x64xi1, #blocked1>
|
||||
%92 = arith.andi %90, %91 : tensor<64x64xi1, #blocked1>
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
// RUN: triton-opt %s -split-input-file --convert-triton-gpu-to-llvm="target=rocdl" 2>/dev/null | FileCheck --check-prefixes=CHECK,GCN %s
|
||||
// RUN: triton-opt %s -split-input-file --convert-triton-gpu-to-llvm="target=rocdl" | FileCheck --check-prefixes=CHECK,GCN %s
|
||||
|
||||
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} {
|
||||
// CHECK: llvm.func @test_empty_kernel(%arg0: i64, %arg1: !llvm.ptr<f16, 1>)
|
||||
@@ -1654,7 +1654,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
|
||||
// PTX-SAME: @$3 atom.global.gpu.relaxed.add.f32
|
||||
// PTX: llvm.inline_asm
|
||||
// PTX-SAME: @$3 atom.global.gpu.relaxed.add.f32
|
||||
%0 = "tt.atomic_rmw" (%arg0, %arg2, %arg1) {atomic_rmw_op = 5 : i32, sem = 1, scope = 1 : i32} : (tensor<256x!tt.ptr<f32>, #blocked0>, tensor<256xf32, #blocked0>, tensor<256xi1, #blocked0>) -> tensor<256xf32, #blocked0>
|
||||
%0 = "tt.atomic_rmw" (%arg0, %arg2, %arg1) {atomic_rmw_op = 5 : i32, sem = 1 : i32, scope = 1 : i32} : (tensor<256x!tt.ptr<f32>, #blocked0>, tensor<256xf32, #blocked0>, tensor<256xi1, #blocked0>) -> tensor<256xf32, #blocked0>
|
||||
tt.return
|
||||
}
|
||||
}
|
||||
@@ -1669,7 +1669,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
|
||||
// PTX: llvm.icmp "eq"
|
||||
// PTX: llvm.inline_asm
|
||||
// PTX-SAME: @$3 atom.global.gpu.relaxed.add.f32
|
||||
%0 = "tt.atomic_rmw" (%arg0, %arg2, %arg1) {atomic_rmw_op = 5 : i32, sem = 1, scope = 1: i32} : (!tt.ptr<f32>, f32, i1) -> f32
|
||||
%0 = "tt.atomic_rmw" (%arg0, %arg2, %arg1) {atomic_rmw_op = 5 : i32, sem = 1: i32 , scope = 1 : i32} : (!tt.ptr<f32>, f32, i1) -> f32
|
||||
tt.return
|
||||
}
|
||||
}
|
||||
@@ -2115,7 +2115,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 :
|
||||
// PTX: llvm.inline_asm
|
||||
// PTX-SAME: @$3 atom.global.gpu.add.noftz.f16x2
|
||||
// GCN-COUNT-8: llvm.atomicrmw fadd {{.*}} monotonic : !llvm.ptr<f16, 1>, f16
|
||||
%8 = "tt.atomic_rmw"(%5, %6, %7) {atomic_rmw_op = 5 : i32, sem = 1: i32} : (tensor<32x32x!tt.ptr<f16>, #blocked>, tensor<32x32xf16, #blocked>, tensor<32x32xi1, #blocked>) -> tensor<32x32xf16, #blocked>
|
||||
%8 = "tt.atomic_rmw"(%5, %6, %7) {atomic_rmw_op = 5 : i32, sem = 1: i32, scope = 1: i32} : (tensor<32x32x!tt.ptr<f16>, #blocked>, tensor<32x32xf16, #blocked>, tensor<32x32xi1, #blocked>) -> tensor<32x32xf16, #blocked>
|
||||
tt.return
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,8 +1,10 @@
|
||||
// RUN: triton-opt %s -split-input-file -convert-triton-to-tritongpu 2>&1 | FileCheck %s --check-prefix=GPU
|
||||
// RUN: triton-opt %s -split-input-file -convert-triton-to-tritongpu -convert-triton-gpu-to-llvm 2>&1 | FileCheck %s --check-prefix=LLVM
|
||||
// RUN: triton-opt %s -split-input-file -convert-triton-to-tritongpu -convert-triton-gpu-to-llvm=target=rocdl | FileCheck %s --check-prefix=GCN
|
||||
// There is an issue with the op
|
||||
// XFAIL: *
|
||||
|
||||
// GPU: %9 = "tt.atomic_cas"(%8, %cst_0, %cst) <{scope = 2 : i32, sem = 4 : i32}> : (tensor<2x!tt.ptr<i64, 1>, #blocked>, tensor<2xi64, #blocked>, tensor<2xi64, #blocked>) -> tensor<2xi64, #blocked>
|
||||
// LLVM: llvm.inline_asm {{.*}} "mov.u64 $0, 0x0;\0A\09@$4 atom.global.acq_rel.cta.cas.b64 $0, [ $1 + 0 ], $2, $3;", "=l,l,l,l,b"
|
||||
// PTX: llvm.inline_asm {{.*}} "mov.u64 $0, 0x0;\0A\09@$4 atom.global.acq_rel.cta.cas.b64 $0, [ $1 + 0 ], $2, $3;", "=l,l,l,l,b"
|
||||
|
||||
module {
|
||||
tt.func public @atomic_cas_kernel_0d1d2e(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
|
||||
|
||||
@@ -337,11 +337,11 @@ tt.func @post_load_inv(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32},
|
||||
%107 = arith.muli %130, %c32_i32 : i32
|
||||
%108 = arith.subi %arg5, %107 : i32
|
||||
%109 = tt.splat %108 : (i32) -> tensor<1x32xi32, #AL>
|
||||
%110 = "triton_gpu.cmpi"(%50, %109) <{predicate = 2 : i64}> : (tensor<1x32xi32, #AL>, tensor<1x32xi32, #AL>) -> tensor<1x32xi1, #AL>
|
||||
%110 = arith.cmpi "slt", %50, %109 : tensor<1x32xi32, #AL>
|
||||
%111 = tt.broadcast %110 : (tensor<1x32xi1, #AL>) -> tensor<32x32xi1, #AL>
|
||||
%112 = tt.load %arg11, %111, %cst_0 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<32x32xf32, #AL>
|
||||
%113 = tt.splat %108 : (i32) -> tensor<32x1xi32, #AL>
|
||||
%114 = "triton_gpu.cmpi"(%66, %113) <{predicate = 2 : i64}> : (tensor<32x1xi32, #AL>, tensor<32x1xi32, #AL>) -> tensor<32x1xi1, #AL>
|
||||
%114 = arith.cmpi "slt", %66, %113 : tensor<32x1xi32, #AL>
|
||||
%115 = tt.broadcast %114 : (tensor<32x1xi1, #AL>) -> tensor<32x32xi1, #AL>
|
||||
%116 = tt.load %arg12, %115, %cst_0 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<32x32xf32, #AL>
|
||||
%117 = triton_gpu.convert_layout %112 : (tensor<32x32xf32, #AL>) -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth = 1}>>
|
||||
@@ -395,11 +395,11 @@ tt.func @cross_iter_dep(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32},
|
||||
%141 = arith.muli %161, %c32_i32 : i32
|
||||
%142 = arith.subi %arg5, %141 : i32
|
||||
%143 = tt.splat %142 : (i32) -> tensor<1x32xi32, #AL>
|
||||
%144 = "triton_gpu.cmpi"(%65, %143) <{predicate = 2 : i64}> : (tensor<1x32xi32, #AL>, tensor<1x32xi32, #AL>) -> tensor<1x32xi1, #AL>
|
||||
%144 = arith.cmpi "slt", %65, %143 : tensor<1x32xi32, #AL>
|
||||
%145 = tt.broadcast %144 : (tensor<1x32xi1, #AL>) -> tensor<32x32xi1, #AL>
|
||||
%146 = tt.load %arg11, %145, %cst_1 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<32x32xf32, #AL>
|
||||
%147 = tt.splat %142 : (i32) -> tensor<32x1xi32, #AL>
|
||||
%148 = "triton_gpu.cmpi"(%88, %147) <{predicate = 2 : i64}> : (tensor<32x1xi32, #AL>, tensor<32x1xi32, #AL>) -> tensor<32x1xi1, #AL>
|
||||
%148 = arith.cmpi "slt", %88, %147: tensor<32x1xi32, #AL>
|
||||
%149 = tt.broadcast %148 : (tensor<32x1xi1, #AL>) -> tensor<32x32xi1, #AL>
|
||||
%150 = tt.load %arg12, %149, %cst_1 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<32x32xf32, #AL>
|
||||
%151 = triton_gpu.convert_layout %146 : (tensor<32x32xf32, #AL>) -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth = 1}>>
|
||||
@@ -456,7 +456,7 @@ tt.func @matmul_mixed_kernel(%arg0: !tt.ptr<f8E4M3FNUZ> {tt.divisibility = 16 :
|
||||
%4 = arith.divsi %3, %c32_i32 : i32
|
||||
%5 = arith.divsi %0, %4 : i32
|
||||
%6 = arith.subi %2, %5 : i32
|
||||
%7 = "triton_gpu.cmpi"(%6, %c1_i32) <{predicate = 2 : i64}> : (i32, i32) -> i1
|
||||
%7 = arith.cmpi "slt", %6, %c1_i32: i32
|
||||
%8 = arith.select %7, %6, %c1_i32 : i32
|
||||
%9 = arith.remsi %0, %8 : i32
|
||||
%10 = arith.addi %5, %9 : i32
|
||||
@@ -519,11 +519,11 @@ tt.func @matmul_mixed_kernel(%arg0: !tt.ptr<f8E4M3FNUZ> {tt.divisibility = 16 :
|
||||
%86 = arith.muli %arg9, %c32_i32 : i32
|
||||
%87 = arith.subi %arg5, %86 : i32
|
||||
%88 = tt.splat %87 : (i32) -> tensor<1x32xi32, #blocked1>
|
||||
%89 = "triton_gpu.cmpi"(%43, %88) <{predicate = 2 : i64}> : (tensor<1x32xi32, #blocked1>, tensor<1x32xi32, #blocked1>) -> tensor<1x32xi1, #blocked1>
|
||||
%89 = arith.cmpi "slt", %43, %88 : tensor<1x32xi32, #blocked1>
|
||||
%90 = tt.broadcast %89 : (tensor<1x32xi1, #blocked1>) -> tensor<64x32xi1, #blocked1>
|
||||
%91 = tt.load %arg11, %90, %63 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<64x32xf8E4M3FNUZ, #blocked1>
|
||||
%92 = tt.splat %87 : (i32) -> tensor<32x1xi32, #blocked2>
|
||||
%93 = "triton_gpu.cmpi"(%52, %92) <{predicate = 2 : i64}> : (tensor<32x1xi32, #blocked2>, tensor<32x1xi32, #blocked2>) -> tensor<32x1xi1, #blocked2>
|
||||
%93 = arith.cmpi "slt", %52, %92: tensor<32x1xi32, #blocked2>
|
||||
%94 = tt.broadcast %93 : (tensor<32x1xi1, #blocked2>) -> tensor<32x32xi1, #blocked2>
|
||||
%95 = tt.load %arg12, %94, %cst_2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<32x32xf16, #blocked2>
|
||||
%96 = tt.fp_to_fp %91 : tensor<64x32xf8E4M3FNUZ, #blocked1> -> tensor<64x32xf16, #blocked1>
|
||||
@@ -546,9 +546,9 @@ tt.func @matmul_mixed_kernel(%arg0: !tt.ptr<f8E4M3FNUZ> {tt.divisibility = 16 :
|
||||
%76 = tt.broadcast %73 : (tensor<1x32xi32, #blocked2>) -> tensor<64x32xi32, #blocked2>
|
||||
%77 = tt.addptr %75, %76 : tensor<64x32x!tt.ptr<f16>, #blocked2>, tensor<64x32xi32, #blocked2>
|
||||
%78 = tt.splat %arg3 : (i32) -> tensor<64x1xi32, #blocked2>
|
||||
%79 = "triton_gpu.cmpi"(%68, %78) <{predicate = 2 : i64}> : (tensor<64x1xi32, #blocked2>, tensor<64x1xi32, #blocked2>) -> tensor<64x1xi1, #blocked2>
|
||||
%79 = arith.cmpi "slt", %68, %78 : tensor<64x1xi32, #blocked2>
|
||||
%80 = tt.splat %arg4 : (i32) -> tensor<1x32xi32, #blocked2>
|
||||
%81 = "triton_gpu.cmpi"(%74, %80) <{predicate = 2 : i64}> : (tensor<1x32xi32, #blocked2>, tensor<1x32xi32, #blocked2>) -> tensor<1x32xi1, #blocked2>
|
||||
%81 = arith.cmpi "slt", %74, %80 : tensor<1x32xi32, #blocked2>
|
||||
%82 = tt.broadcast %79 : (tensor<64x1xi1, #blocked2>) -> tensor<64x32xi1, #blocked2>
|
||||
%83 = tt.broadcast %81 : (tensor<1x32xi1, #blocked2>) -> tensor<64x32xi1, #blocked2>
|
||||
%84 = arith.andi %82, %83 : tensor<64x32xi1, #blocked2>
|
||||
|
||||
Reference in New Issue
Block a user