do not emit nvidia inline asm

2026-04-05 03:01:17 -04:00 · 2023-11-02 14:44:41 +00:00
parent 39e8901d7a
commit 8bc417b9b7
2 changed files with 10 additions and 4 deletions
--- a/lib/Conversion/TritonGPUToLLVM/Utility.cpp
+++ b/lib/Conversion/TritonGPUToLLVM/Utility.cpp
@@ -258,6 +258,9 @@ Value storeShared(ConversionPatternRewriter &rewriter, Location loc, Value ptr,

 Value loadShared(ConversionPatternRewriter &rewriter, Location loc, Value ptr,
                 Value pred) {
+#if USE_ROCM
+  return load(ptr);
+#else
  MLIRContext *ctx = rewriter.getContext();
  auto ptrTy = ptr.getType().cast<LLVMPointerType>();
  assert(ptrTy.getAddressSpace() == 3 && "Invalid addr space for loadShared");
@@ -272,6 +275,7 @@ Value loadShared(ConversionPatternRewriter &rewriter, Location loc, Value ptr,
  auto &ld = builder.create<>("ld")->shared().b(bitwidth);
  ld(dOpr, ptrOpr).predicate(pred, "b");
  return builder.launch(rewriter, loc, elemTy);
+#endif
 }

 static Value commonShflSync(Location loc, ConversionPatternRewriter &rewriter,
--- a/test/Conversion/tritongpu_to_llvm.mlir
+++ b/test/Conversion/tritongpu_to_llvm.mlir
@@ -2013,10 +2013,12 @@ module attributes {"triton_gpu.compute-capability" = 80 : i32, "triton_gpu.num-c
 // -----

 //  CHECK-LABEL: copyitem
-//  CHECK: st.shared.b8
-//  CHECK: ld.shared.b8
-//  CHECK-NOT: st.shared.b1
-//  CHECK-NOT: ld.shared.b1
+//  GCN: llvm.store
+//  GCN: llvm.load
+//  PTX: st.shared.b8
+//  PTX: ld.shared.b8
+//  PTX-NOT: st.shared.b1
+//  PTX-NOT: ld.shared.b1
 #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 8], warpsPerCTA = [1, 4], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
 module attributes {"triton_gpu.compute-capability" = 80 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
  tt.func public @copyitem() attributes {noinline = false} {