IFU 230517 Resolve merge conflicts

2026-04-05 03:01:17 -04:00 · 2023-05-17 17:36:44 +00:00
parent 4c4e42e524
commit 78c60742fc
4 changed files with 8 additions and 61 deletions
--- a/lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVMPass.cpp
+++ b/lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVMPass.cpp
@@ -152,16 +152,6 @@ struct FuncOpConversion : public FuncOpConversionBase {

    auto ctx = funcOp->getContext();

-<<<<<<< HEAD
-    // Set an attribute to indicate this function is a kernel entry.
-    newFuncOp->setAttr("nvvm.kernel",
-                       rewriter.getIntegerAttr(type::u1Ty(ctx), 1));
-#ifndef USE_ROCM
-    // Set an attribute for maxntidx, it could be used in latter LLVM codegen
-    // for `nvvm.annotation` metadata.
-    newFuncOp->setAttr("nvvm.maxntid", rewriter.getI32ArrayAttr(32 * numWarps));
-#endif
-=======
    if (allocation.isRoot(funcOp)) {
      // Set an attribute to indicate this function is a kernel entry.
      newFuncOp->setAttr("nvvm.kernel",
@@ -174,12 +164,13 @@ struct FuncOpConversion : public FuncOpConversionBase {
          ArrayAttr::get(ctx, rewriter.getStringAttr("noinline")));
      rewriter.eraseOp(amendedFuncOp);
    }
+#ifndef USE_ROCM
    // Set an attribute for maxntidx, it could be used in latter LLVM codegen
    // for `nvvm.annotation` metadata.
    newFuncOp->setAttr("nvvm.maxntid", rewriter.getI32ArrayAttr(32 * numWarps));
+#endif
    // The call graph is updated by mapping the old function to the new one.
    allocation.mapFuncOp(funcOp, newFuncOp);
->>>>>>> openai/main

    rewriter.eraseOp(funcOp);
    return success();
--- a/lib/Target/LLVMIR/LLVMIRTranslation.cpp
+++ b/lib/Target/LLVMIR/LLVMIRTranslation.cpp
@@ -25,15 +25,12 @@
 #include "llvm/IRReader/IRReader.h"
 #include "llvm/Linker/Linker.h"
 #include "llvm/Support/SourceMgr.h"
-<<<<<<< HEAD

 #include <iostream>
-=======
 #ifdef _WIN32
 #define WIN32_LEAN_AND_MEAN
 #include <windows.h>
 #else
->>>>>>> openai/main
 #include <dlfcn.h>
 #endif
 #include <filesystem>
--- a/python/test/unit/language/assert_helper.py
+++ b/python/test/unit/language/assert_helper.py
@@ -41,12 +41,8 @@ def test_assert(func: str):
    x = torch.arange(0, shape[0], dtype=torch.int32, device='cuda')
    y = torch.zeros(shape, dtype=x.dtype, device="cuda")
    if func == "device_assert":
-<<<<<<< HEAD
        kernel_device_assert[(1,)](x, y, num_warps=2, BLOCK=shape[0])
-=======
-        kernel_device_assert[(1,)](x, y, BLOCK=shape[0])
-        kernel_device_assert_scalar[(1,)](x, y, BLOCK=shape[0])
->>>>>>> openai/main
+        kernel_device_assert_scalar[(1,)](x, y, num_warps=2, BLOCK=shape[0])
    elif func == "assert":
        kernel_assert[(1,)](x, y, num_warps=2, BLOCK=shape[0])
    elif func == "static_assert":
--- a/test/Conversion/tritongpu_to_llvm.mlir
+++ b/test/Conversion/tritongpu_to_llvm.mlir
@@ -1366,21 +1366,12 @@ module attributes {"triton_gpu.num-warps" = 4 : i32} {
  tt.func @matmul_tf32dot(%ptr:!tt.ptr<f32> {tt.divisibility = 16 : i32},
  %a:tensor<32x16xf32, #shared>, %b:tensor<16x32xf32, #shared>) {
    %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma>
-<<<<<<< HEAD
    // PTX: llvm.inline_asm
    // PTX-SAME: ldmatrix.sync.aligned.m8n8.x4.shared.b16
-    // PTX-SAME: (vector<1xf32>, vector<1xf32>, vector<1xf32>, vector<1xf32>)
+    // PTX-SAME: (i32, i32, i32, i32)
    // PTX: llvm.inline_asm
    // PTX-SAME: ldmatrix.sync.aligned.m8n8.x4.shared.b16
-    // PTX-SAME: (vector<1xf32>, vector<1xf32>, vector<1xf32>, vector<1xf32>)
-=======
-    // CHECK: llvm.inline_asm
-    // CHECK-SAME: ldmatrix.sync.aligned.m8n8.x4.shared.b16
-    // CHECK-SAME: (i32, i32, i32, i32)
-    // CHECK: llvm.inline_asm
-    // CHECK-SAME: ldmatrix.sync.aligned.m8n8.x4.shared.b16
-    // CHECK-SAME: (i32, i32, i32, i32)
->>>>>>> openai/main
+    // PTX-SAME: (i32, i32, i32, i32)
    %a_mat = triton_gpu.convert_layout %a : (tensor<32x16xf32, #shared>) -> tensor<32x16xf32, #dot_operand_a>
    %b_mat = triton_gpu.convert_layout %b : (tensor<16x32xf32, #shared>) -> tensor<16x32xf32, #dot_operand_b>

@@ -1408,20 +1399,12 @@ module attributes {"triton_gpu.num-warps" = 4 : i32} {
 module attributes {"triton_gpu.num-warps" = 4 : i32} {
  // CHECK-LABEL: atomic_add_f32
  tt.func @atomic_add_f32(%arg0 : tensor<256x!tt.ptr<f32>, #blocked0>, %arg1 : tensor<256xi1, #blocked0>, %arg2 : tensor<256xf32, #blocked0>) {
-<<<<<<< HEAD
    // GCN-NOT: llvm.inline_asm
    // GCN: llvm.atomicrmw fadd {{.*}}  monotonic  : !llvm.ptr<f32, 1>, f32
-    // PTX: llvm.icmp "slt"
-    // PTX: llvm.inline_asm
+    // PTX: llvm.inline_asm
    // PTX-SAME: @$3 atom.global.gpu.add.f32
    // PTX: llvm.inline_asm
    // PTX-SAME: @$3 atom.global.gpu.add.f32
-=======
-    // CHECK: llvm.inline_asm
-    // CHECK-SAME: @$3 atom.global.gpu.add.f32
-    // CHECK: llvm.inline_asm
-    // CHECK-SAME: @$3 atom.global.gpu.add.f32
->>>>>>> openai/main
    %0 = "tt.atomic_rmw" (%arg0, %arg2, %arg1) {atomic_rmw_op = 5 : i32} : (tensor<256x!tt.ptr<f32>, #blocked0>, tensor<256xf32, #blocked0>, tensor<256xi1, #blocked0>) -> tensor<256xf32, #blocked0>
    tt.return
  }
@@ -1432,18 +1415,12 @@ module attributes {"triton_gpu.num-warps" = 4 : i32} {
 module attributes {"triton_gpu.num-warps" = 4 : i32} {
  // CHECK-LABEL: atomic_add_f32_scalar
  tt.func @atomic_add_f32_scalar(%arg0 : !tt.ptr<f32>, %arg1 : i1, %arg2 : f32) {
-<<<<<<< HEAD
    // GCN-NOT: llvm.inline_asm
    // GCN: llvm.atomicrmw fadd {{.*}}  monotonic  : !llvm.ptr<f32, 1>, f32
    // PTX: llvm.icmp "eq"
    // PTX: llvm.inline_asm
+    // PTX: llvm.inline_asm
    // PTX-SAME: @$3 atom.global.gpu.add.f32
-=======
-    // CHECK: llvm.icmp "eq"
-    // CHECK: llvm.inline_asm
-    // CHECK: llvm.inline_asm
-    // CHECK-SAME: @$3 atom.global.gpu.add.f32
->>>>>>> openai/main
    %0 = "tt.atomic_rmw" (%arg0, %arg2, %arg1) {atomic_rmw_op = 5 : i32} : (!tt.ptr<f32>, f32, i1) -> f32
    tt.return
  }
@@ -1455,21 +1432,13 @@ module attributes {"triton_gpu.num-warps" = 4 : i32} {
 module attributes {"triton_gpu.num-warps" = 4 : i32} {
  // CHECK-LABEL: store_f32
  tt.func @store_f32(%arg0 : tensor<256x!tt.ptr<f32>, #blocked0>, %arg1 : tensor<256xf32, #blocked0>) {
-<<<<<<< HEAD
    // GCN-NOT: llvm.inline_asm
    // GCN: llvm.store {{.*}} : !llvm.ptr<f32, 1>
    // GCN: llvm.store {{.*}} : !llvm.ptr<f32, 1>
-    // PTX: llvm.icmp "slt"
    // PTX: llvm.inline_asm
    // PTX-SAME: @$2 st.global.b32
    // PTX: llvm.inline_asm
    // PTX-SAME: @$2 st.global.b32
-=======
-    // CHECK: llvm.inline_asm
-    // CHECK-SAME: @$2 st.global.b32
-    // CHECK: llvm.inline_asm
-    // CHECK-SAME: @$2 st.global.b32
->>>>>>> openai/main
    tt.store %arg0, %arg1 : tensor<256xf32, #blocked0>
    tt.return
  }
@@ -1480,17 +1449,11 @@ module attributes {"triton_gpu.num-warps" = 4 : i32} {
 module attributes {"triton_gpu.num-warps" = 4 : i32} {
  // CHECK-LABEL: store_f32_scalar
  tt.func @store_f32_scalar(%arg0 : !tt.ptr<f32>, %arg1 : f32) {
-<<<<<<< HEAD
    // GCN-NOT: llvm.inline_asm
    // GCN: llvm.store {{.*}} : !llvm.ptr<f32, 1>
-    // PTX: llvm.icmp "slt"
+    // PTX: llvm.icmp "eq"
    // PTX: llvm.inline_asm
    // PTX-SAME: @$2 st.global.b32
-=======
-    // CHECK: llvm.icmp "eq"
-    // CHECK: llvm.inline_asm
-    // CHECK-SAME: @$2 st.global.b32
->>>>>>> openai/main
    tt.store %arg0, %arg1 : f32
    tt.return
  }