IFU 230517 Resolve merge conflicts

Author: Jason Furmanek
Date:   2023-05-17 17:36:44 +00:00
Parent: 4c4e42e524
Commit: 78c60742fc

4 changed files with 8 additions and 61 deletions


@@ -152,16 +152,6 @@ struct FuncOpConversion : public FuncOpConversionBase {
auto ctx = funcOp->getContext();
<<<<<<< HEAD
// Set an attribute to indicate this function is a kernel entry.
newFuncOp->setAttr("nvvm.kernel",
rewriter.getIntegerAttr(type::u1Ty(ctx), 1));
#ifndef USE_ROCM
// Set an attribute for maxntidx; it can be used later in LLVM codegen
// for `nvvm.annotation` metadata.
newFuncOp->setAttr("nvvm.maxntid", rewriter.getI32ArrayAttr(32 * numWarps));
#endif
=======
if (allocation.isRoot(funcOp)) {
// Set an attribute to indicate this function is a kernel entry.
newFuncOp->setAttr("nvvm.kernel",
@@ -174,12 +164,13 @@ struct FuncOpConversion : public FuncOpConversionBase {
ArrayAttr::get(ctx, rewriter.getStringAttr("noinline")));
rewriter.eraseOp(amendedFuncOp);
}
#ifndef USE_ROCM
// Set an attribute for maxntidx; it can be used later in LLVM codegen
// for `nvvm.annotation` metadata.
newFuncOp->setAttr("nvvm.maxntid", rewriter.getI32ArrayAttr(32 * numWarps));
#endif
// The call graph is updated by mapping the old function to the new one.
allocation.mapFuncOp(funcOp, newFuncOp);
>>>>>>> openai/main
rewriter.eraseOp(funcOp);
return success();
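
For context, here is a minimal sketch of what the resolved body of this conflict could look like, assuming the ROCm-specific #ifndef USE_ROCM guard from HEAD is folded into the allocation.isRoot(funcOp) branch from openai/main. The names allocation, numWarps, type::u1Ty, and newFuncOp are taken from the surrounding diff; the overall shape is illustrative, not the actual resolution recorded by this commit.

// Hypothetical post-merge shape of the FuncOpConversion logic (sketch only).
if (allocation.isRoot(funcOp)) {
  // Mark the function as a kernel entry point.
  newFuncOp->setAttr("nvvm.kernel",
                     rewriter.getIntegerAttr(type::u1Ty(ctx), 1));
#ifndef USE_ROCM
  // maxntid is an NVIDIA-only launch-bound hint; ROCm builds skip it.
  newFuncOp->setAttr("nvvm.maxntid", rewriter.getI32ArrayAttr(32 * numWarps));
#endif
}
// Keep the call graph consistent by mapping the old function to the new one.
allocation.mapFuncOp(funcOp, newFuncOp);
rewriter.eraseOp(funcOp);
return success();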


@@ -25,15 +25,12 @@
#include "llvm/IRReader/IRReader.h"
#include "llvm/Linker/Linker.h"
#include "llvm/Support/SourceMgr.h"
<<<<<<< HEAD
#include <iostream>
=======
#ifdef _WIN32
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#else
>>>>>>> openai/main
#include <dlfcn.h>
#endif
#include <filesystem>


@@ -41,12 +41,8 @@ def test_assert(func: str):
x = torch.arange(0, shape[0], dtype=torch.int32, device='cuda')
y = torch.zeros(shape, dtype=x.dtype, device="cuda")
if func == "device_assert":
<<<<<<< HEAD
kernel_device_assert[(1,)](x, y, num_warps=2, BLOCK=shape[0])
=======
kernel_device_assert[(1,)](x, y, BLOCK=shape[0])
kernel_device_assert_scalar[(1,)](x, y, BLOCK=shape[0])
>>>>>>> openai/main
kernel_device_assert_scalar[(1,)](x, y, num_warps=2, BLOCK=shape[0])
elif func == "assert":
kernel_assert[(1,)](x, y, num_warps=2, BLOCK=shape[0])
elif func == "static_assert":


@@ -1366,21 +1366,12 @@ module attributes {"triton_gpu.num-warps" = 4 : i32} {
tt.func @matmul_tf32dot(%ptr:!tt.ptr<f32> {tt.divisibility = 16 : i32},
%a:tensor<32x16xf32, #shared>, %b:tensor<16x32xf32, #shared>) {
%cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma>
<<<<<<< HEAD
// PTX: llvm.inline_asm
// PTX-SAME: ldmatrix.sync.aligned.m8n8.x4.shared.b16
// PTX-SAME: (vector<1xf32>, vector<1xf32>, vector<1xf32>, vector<1xf32>)
// PTX-SAME: (i32, i32, i32, i32)
// PTX: llvm.inline_asm
// PTX-SAME: ldmatrix.sync.aligned.m8n8.x4.shared.b16
// PTX-SAME: (vector<1xf32>, vector<1xf32>, vector<1xf32>, vector<1xf32>)
=======
// CHECK: llvm.inline_asm
// CHECK-SAME: ldmatrix.sync.aligned.m8n8.x4.shared.b16
// CHECK-SAME: (i32, i32, i32, i32)
// CHECK: llvm.inline_asm
// CHECK-SAME: ldmatrix.sync.aligned.m8n8.x4.shared.b16
// CHECK-SAME: (i32, i32, i32, i32)
>>>>>>> openai/main
// PTX-SAME: (i32, i32, i32, i32)
%a_mat = triton_gpu.convert_layout %a : (tensor<32x16xf32, #shared>) -> tensor<32x16xf32, #dot_operand_a>
%b_mat = triton_gpu.convert_layout %b : (tensor<16x32xf32, #shared>) -> tensor<16x32xf32, #dot_operand_b>
@@ -1408,20 +1399,12 @@ module attributes {"triton_gpu.num-warps" = 4 : i32} {
module attributes {"triton_gpu.num-warps" = 4 : i32} {
// CHECK-LABEL: atomic_add_f32
tt.func @atomic_add_f32(%arg0 : tensor<256x!tt.ptr<f32>, #blocked0>, %arg1 : tensor<256xi1, #blocked0>, %arg2 : tensor<256xf32, #blocked0>) {
<<<<<<< HEAD
// GCN-NOT: llvm.inline_asm
// GCN: llvm.atomicrmw fadd {{.*}} monotonic : !llvm.ptr<f32, 1>, f32
// PTX: llvm.icmp "slt"
// PTX: llvm.inline_asm
// PTX: llvm.inline_asm
// PTX-SAME: @$3 atom.global.gpu.add.f32
// PTX: llvm.inline_asm
// PTX-SAME: @$3 atom.global.gpu.add.f32
=======
// CHECK: llvm.inline_asm
// CHECK-SAME: @$3 atom.global.gpu.add.f32
// CHECK: llvm.inline_asm
// CHECK-SAME: @$3 atom.global.gpu.add.f32
>>>>>>> openai/main
%0 = "tt.atomic_rmw" (%arg0, %arg2, %arg1) {atomic_rmw_op = 5 : i32} : (tensor<256x!tt.ptr<f32>, #blocked0>, tensor<256xf32, #blocked0>, tensor<256xi1, #blocked0>) -> tensor<256xf32, #blocked0>
tt.return
}
@@ -1432,18 +1415,12 @@ module attributes {"triton_gpu.num-warps" = 4 : i32} {
module attributes {"triton_gpu.num-warps" = 4 : i32} {
// CHECK-LABEL: atomic_add_f32_scalar
tt.func @atomic_add_f32_scalar(%arg0 : !tt.ptr<f32>, %arg1 : i1, %arg2 : f32) {
<<<<<<< HEAD
// GCN-NOT: llvm.inline_asm
// GCN: llvm.atomicrmw fadd {{.*}} monotonic : !llvm.ptr<f32, 1>, f32
// PTX: llvm.icmp "eq"
// PTX: llvm.inline_asm
// PTX: llvm.inline_asm
// PTX-SAME: @$3 atom.global.gpu.add.f32
=======
// CHECK: llvm.icmp "eq"
// CHECK: llvm.inline_asm
// CHECK: llvm.inline_asm
// CHECK-SAME: @$3 atom.global.gpu.add.f32
>>>>>>> openai/main
%0 = "tt.atomic_rmw" (%arg0, %arg2, %arg1) {atomic_rmw_op = 5 : i32} : (!tt.ptr<f32>, f32, i1) -> f32
tt.return
}
@@ -1455,21 +1432,13 @@ module attributes {"triton_gpu.num-warps" = 4 : i32} {
module attributes {"triton_gpu.num-warps" = 4 : i32} {
// CHECK-LABEL: store_f32
tt.func @store_f32(%arg0 : tensor<256x!tt.ptr<f32>, #blocked0>, %arg1 : tensor<256xf32, #blocked0>) {
<<<<<<< HEAD
// GCN-NOT: llvm.inline_asm
// GCN: llvm.store {{.*}} : !llvm.ptr<f32, 1>
// GCN: llvm.store {{.*}} : !llvm.ptr<f32, 1>
// PTX: llvm.icmp "slt"
// PTX: llvm.inline_asm
// PTX-SAME: @$2 st.global.b32
// PTX: llvm.inline_asm
// PTX-SAME: @$2 st.global.b32
=======
// CHECK: llvm.inline_asm
// CHECK-SAME: @$2 st.global.b32
// CHECK: llvm.inline_asm
// CHECK-SAME: @$2 st.global.b32
>>>>>>> openai/main
tt.store %arg0, %arg1 : tensor<256xf32, #blocked0>
tt.return
}
@@ -1480,17 +1449,11 @@ module attributes {"triton_gpu.num-warps" = 4 : i32} {
module attributes {"triton_gpu.num-warps" = 4 : i32} {
// CHECK-LABEL: store_f32_scalar
tt.func @store_f32_scalar(%arg0 : !tt.ptr<f32>, %arg1 : f32) {
<<<<<<< HEAD
// GCN-NOT: llvm.inline_asm
// GCN: llvm.store {{.*}} : !llvm.ptr<f32, 1>
// PTX: llvm.icmp "slt"
// PTX: llvm.icmp "eq"
// PTX: llvm.inline_asm
// PTX-SAME: @$2 st.global.b32
=======
// CHECK: llvm.icmp "eq"
// CHECK: llvm.inline_asm
// CHECK-SAME: @$2 st.global.b32
>>>>>>> openai/main
tt.store %arg0, %arg1 : f32
tt.return
}