Merge commit 'ac9fa68d18c777e421bd3f6fb1ddcfd60b6fda33' into ifu-rebase-again

Conflicts:
	.gitignore
	.gitmodules
	README.md
	bin/triton-translate.cpp
	include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
	include/triton/Target/AMDGCN/AMDGCNTranslation.h
	include/triton/Target/HSACO/HSACOTranslation.h
	lib/Analysis/Allocation.cpp
	lib/Analysis/Utility.cpp
	lib/Conversion/TritonGPUToLLVM/CMakeLists.txt
	lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp
	lib/Conversion/TritonGPUToLLVM/ReduceOpToLLVM.cpp
	lib/Conversion/TritonGPUToLLVM/ScanOpToLLVM.cpp
	lib/Conversion/TritonGPUToLLVM/Utility.cpp
	lib/Conversion/TritonGPUToLLVM/Utility.h
	lib/Dialect/TritonGPU/IR/Dialect.cpp
	lib/Dialect/TritonGPU/Transforms/RemoveLayoutConversions.cpp
	lib/Target/HSACO/CMakeLists.txt
	lib/Target/HSACO/HSACOTranslation.cpp
	lib/Target/LLVMIR/LLVMIRTranslation.cpp
	python/src/triton.cc
	python/test/unit/language/test_core.py
	python/test/unit/operators/test_flash_attention.py
	python/triton/compiler/compiler.py
	python/triton/compiler/make_launcher.py
	python/triton/language/semantic.py
	python/triton/runtime/jit.py
	python/tutorials/06-fused-attention.py
	python/tutorials/11-grouped-gemm.py
	test/Conversion/tritongpu_to_llvm.mlir
This commit is contained in:
Jason Furmanek
2023-11-06 23:10:10 +00:00
161 changed files with 6530 additions and 3905 deletions

View File

@@ -21,6 +21,7 @@ class AllocationAnalysis;
SmallVector<unsigned>
getScratchConfigForCvtLayout(triton::gpu::ConvertLayoutOp op, unsigned &inVec,
unsigned &outVec);
SmallVector<unsigned> getRepShapeForCvtLayout(triton::gpu::ConvertLayoutOp op);
} // namespace triton

View File

@@ -36,7 +36,9 @@ public:
triton::ReduceOp getOperation() { return op; }
bool isFastReduction();
bool isReductionOnLayoutFastAxis();
unsigned getThreadOffsetOnReductionAxis();
bool isWarpSynchronous();
@@ -50,14 +52,16 @@ public:
unsigned getThreadsReductionAxis();
SmallVector<unsigned> getScratchConfigBasic();
SmallVector<SmallVector<unsigned>> getScratchConfigsFast();
SmallVector<unsigned> getScratchConfig();
unsigned getScratchSizeInBytes();
bool isSupportedLayout();
bool isReduceWithinCTA();
unsigned getAxis() { return axis; }
private:
triton::ReduceOp op;
ArrayRef<int64_t> srcShape;
@@ -84,8 +88,12 @@ public:
unsigned getNonAxisNumThreadsPerCTA();
// Return the number of warps per CTA along axis dim.
unsigned getAxisNumWarps();
// Return the number of warps per CTA along axis dim with unique data.
unsigned getAxisNumWarpsWithUniqueData();
// Return the number of threads per warp along axis dim.
unsigned getAxisNumThreadsPerWarp();
// Return the number of threads per warp along axis dim with unique data.
unsigned getAxisNumThreadsPerWarpWithUniqueData();
// Return the number of blocks along axis dim.
unsigned getAxisNumBlocks();
// Return the number of blocks along non axis dim.
@@ -103,6 +111,7 @@ public:
Location getLoc() { return scanOp.getLoc(); }
unsigned getAxis() { return scanOp.getAxis(); }
triton::gpu::BlockedEncodingAttr getEncoding();
llvm::ArrayRef<int64_t> getShape();
Region &getCombineOp();
private:
@@ -128,6 +137,10 @@ bool isMmaToDotShortcut(RankedTensorType &srcTy, RankedTensorType &dstTy);
bool isMmaToMmaShortcut(RankedTensorType &srcTy, RankedTensorType &dstTy);
// Return true if the src and dst layout match.
bool matchMmaV3AndDotOperandLayout(RankedTensorType srcTy,
RankedTensorType dstTy);
// TODO: Move utility functions that belong to ConvertLayoutOp to class
// ConvertLayoutOpHelper in the future
bool shouldUseDistSmem(Attribute srcLayout, Attribute dstLayout);

View File

@@ -27,9 +27,6 @@ def ConvertTritonGPUToLLVM : Pass<"convert-triton-gpu-to-llvm", "mlir::ModuleOp"
Option<"computeCapability", "compute-capability",
"int32_t", /*default*/"80",
"device compute capability">,
Option<"tmaMetadata", "tma-metadata",
"mlir::triton::gpu::TMAMetadataTy*", /*default*/"nullptr",
"tma metadata to the runtime">,
Option<"target", "target", "enum Target", "mlir::triton::Target::Default",
"compile for target compatible LLVM",
"llvm::cl::values("

View File

@@ -21,7 +21,8 @@ enum Target { NVVM, ROCDL, Default = NVVM };
std::unique_ptr<OperationPass<ModuleOp>> createConvertTritonGPUToLLVMPass();
std::unique_ptr<OperationPass<ModuleOp>>
createConvertTritonGPUToLLVMPass(const ConvertTritonGPUToLLVMOptions &options);
createConvertTritonGPUToLLVMPass(int32_t computeCapability, Target target,
mlir::triton::gpu::TMAMetadataTy *tmaMetadata);
} // namespace triton

View File

@@ -1,3 +1,5 @@
set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR})
set(LLVM_TARGET_DEFINITIONS NVGPUOps.td)
mlir_tablegen(Dialect.h.inc -gen-dialect-decls -dialect=nvgpu)
mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs -dialect=nvgpu)
@@ -6,6 +8,8 @@ mlir_tablegen(Ops.h.inc -gen-op-decls)
mlir_tablegen(Ops.cpp.inc -gen-op-defs)
mlir_tablegen(OpsEnums.h.inc -gen-enum-decls)
mlir_tablegen(OpsEnums.cpp.inc -gen-enum-defs)
add_mlir_doc(NVGPUDialect NVGPUDialect dialects/ -gen-dialect-doc)
add_mlir_doc(NVGPUOps NVGPUOps dialects/ -gen-op-doc)
add_public_tablegen_target(NVGPUTableGen)
set(LLVM_TARGET_DEFINITIONS NVGPUAttrDefs.td)

View File

@@ -97,7 +97,15 @@ def WGMMADesc_ModeAttr : I32EnumAttr<"WGMMADescMode",
}
def NVGPU_WGMMADescCreateOp : NVGPU_Op<"wgmma_desc_create", []> {
let arguments = (ins LLVM_AnyPointer:$buffer, I32:$height, WGMMADesc_ModeAttr:$mode);
let arguments = (ins LLVM_AnyPointer:$buffer, I32:$height, WGMMADesc_ModeAttr:$mode, I64Attr:$swizzling);
let builders = [
OpBuilder<(ins "Value":$buffer,
"Value":$height,
"WGMMADescMode":$mode), [{
uint32_t mode_ = static_cast<uint32_t>(mode);
uint64_t swizzling = (mode_ == 1 ? 128 : mode_ == 2 ? 64 : 32);
build($_builder, $_state, $_builder.getIntegerType(64), buffer, height, WGMMADescModeAttr::get($_builder.getContext(), mode), $_builder.getI64IntegerAttr(swizzling));
}]>];
let results = (outs I64:$res);
let assemblyFormat = "$buffer `,` $height attr-dict `:` functional-type(operands, results)";
}
@@ -140,12 +148,12 @@ def WGMMA_EltTypeAttr : I32EnumAttr<"WGMMAEltType",
def WGMMA_OperandType : AnyTypeOf<[LLVM_AnyStruct, I64], "wgmma operand A/B type">;
def NVGPU_WGMMAOp : NVGPU_Op<"wgmma", []> {
let arguments = (ins WGMMA_OperandType:$opA, WGMMA_OperandType:$opB, LLVM_AnyStruct:$opC,
let arguments = (ins WGMMA_OperandType:$opA, WGMMA_OperandType:$opB, Optional<LLVM_AnyStruct>:$opC,
I32Attr:$m, I32Attr:$n, I32Attr:$k,
WGMMA_EltTypeAttr:$eltTypeC, WGMMA_EltTypeAttr:$eltTypeA, WGMMA_EltTypeAttr:$eltTypeB,
WGMMA_LayoutAttr:$layoutA, WGMMA_LayoutAttr:$layoutB);
let results = (outs LLVM_AnyStruct:$res);
let assemblyFormat = "$opA `,` $opB `,` $opC attr-dict `:` functional-type(operands, $res)";
let assemblyFormat = "$opA `,` $opB (`,` $opC^)? attr-dict `:` functional-type(operands, $res)";
}
def NVGPU_CGABarrierSyncOp : NVGPU_Op<"cga_barrier_sync", []> {

View File

@@ -1,12 +1,16 @@
set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR})
set(LLVM_TARGET_DEFINITIONS TritonOps.td)
mlir_tablegen(Ops.h.inc -gen-op-decls)
mlir_tablegen(Ops.cpp.inc -gen-op-defs)
mlir_tablegen(OpsEnums.h.inc -gen-enum-decls)
mlir_tablegen(OpsEnums.cpp.inc -gen-enum-defs)
add_mlir_doc(TritonOps TritonOps dialects/ -gen-op-doc)
set(LLVM_TARGET_DEFINITIONS TritonDialect.td)
mlir_tablegen(Dialect.h.inc -gen-dialect-decls)
mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs)
add_mlir_doc(TritonDialect TritonDialect dialects/ -gen-dialect-doc)
set(LLVM_TARGET_DEFINITIONS TritonTypes.td)
mlir_tablegen(Types.h.inc -gen-typedef-decls)

View File

@@ -394,7 +394,12 @@ def TT_DotOp : TT_Op<"dot", [Pure,
$d = matrix_multiply($a, $b) + $c
}];
let arguments = (ins TT_FpIntTensor:$a, TT_FpIntTensor:$b, TT_FpIntTensor:$c, BoolAttr:$allowTF32);
let arguments = (ins
TT_FpIntTensor:$a,
TT_FpIntTensor:$b,
TT_FpIntTensor:$c,
BoolAttr:$allowTF32,
I32Attr:$maxNumImpreciseAcc);
let results = (outs TT_FpIntTensor:$d);

View File

@@ -1,3 +1,5 @@
set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR})
set(LLVM_TARGET_DEFINITIONS TritonGPUOps.td)
mlir_tablegen(Dialect.h.inc -gen-dialect-decls -dialect=triton_gpu)
mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs -dialect=triton_gpu)
@@ -5,6 +7,8 @@ mlir_tablegen(Ops.h.inc -gen-op-decls)
mlir_tablegen(Ops.cpp.inc -gen-op-defs)
mlir_tablegen(Types.h.inc -gen-typedef-decls -typedefs-dialect=triton_gpu)
mlir_tablegen(Types.cpp.inc -gen-typedef-defs -typedefs-dialect=triton_gpu)
add_mlir_doc(TritonGPUDialect TritonGPUDialect dialects/ -gen-dialect-doc)
add_mlir_doc(TritonGPUOps TritonGPUOps dialects/ -gen-op-doc)
add_public_tablegen_target(TritonGPUTableGen)
set(LLVM_TARGET_DEFINITIONS TritonGPUAttrDefs.td)

View File

@@ -7,7 +7,6 @@
#include "mlir/IR/Dialect.h"
// TritonGPU depends on Triton
#include "triton/Dialect/NVGPU/IR/Dialect.h"
#include "triton/Dialect/Triton/IR/Dialect.h"
#include "triton/Dialect/TritonGPU/IR/Attributes.h"
#include "triton/Dialect/TritonGPU/IR/Dialect.h.inc"

View File

@@ -113,6 +113,7 @@ compared to 1*64 when the hasLeadingOffset is false.
"ArrayRef<unsigned>":$order,
"CTALayoutAttr":$CTALayout,
"unsigned":$typeWidthInBit), [{
<<<<<<< HEAD
#ifdef USE_ROCM
// ---- begin GFX908/GFX90A ----
@@ -155,6 +156,18 @@ compared to 1*64 when the hasLeadingOffset is false.
}
}
#endif
=======
bool needTrans = false; // default value
return get(context, dotOpEnc, shape, order, CTALayout, typeWidthInBit, needTrans);
}]>,
AttrBuilder<(ins "DotOperandEncodingAttr":$dotOpEnc,
"ArrayRef<int64_t>":$shape,
"ArrayRef<unsigned>":$order,
"CTALayoutAttr":$CTALayout,
"unsigned":$typeWidthInBit,
"bool":$needTrans), [{
>>>>>>> ac9fa68d18c777e421bd3f6fb1ddcfd60b6fda33
auto mmaEnc = dotOpEnc.getParent().dyn_cast<MmaEncodingAttr>();
if(!mmaEnc)
@@ -194,16 +207,23 @@ compared to 1*64 when the hasLeadingOffset is false.
// --- handle A operand ---
if (opIdx == 0) { // compute swizzling for A operand
int vec = (order[0] == 1) ? matShape[2] : matShape[0]; // k : m
int mmaStride = (order[0] == 1) ? matShape[0] : matShape[2];
int m = (needTrans) ? matShape[2] : matShape[0];
int k = (needTrans) ? matShape[0] : matShape[2];
int vec = (order[0] == 1) ? k : m;
int mmaStride = (order[0] == 1) ? m : k;
int maxPhase = mmaStride / perPhase;
return get(context, vec, perPhase, maxPhase, order, CTALayout);
}
// --- handle B operand ---
if (opIdx == 1) {
int vec = (order[0] == 1) ? matShape[1] : matShape[2]; // n : k
int mmaStride = (order[0] == 1) ? matShape[2] : matShape[1];
// We compute vec and maxPhase from the m, n and k sizes of the mma
// instruction. When the matmul operands are transposed, we must take
// that into account to get the correct m, n and k.
int n = needTrans ? matShape[2] : matShape[1];
int k = needTrans ? matShape[1] : matShape[2];
int vec = (order[0] == 1) ? n : k;
int mmaStride = (order[0] == 1) ? k : n;
int maxPhase = mmaStride / perPhase;
return get(context, vec, perPhase, maxPhase, order, CTALayout);
}
@@ -231,6 +251,16 @@ compared to 1*64 when the hasLeadingOffset is false.
return get(context, dotOpEnc, shape, order, CTALayout, bitwidth);
}]>,
AttrBuilder<(ins "DotOperandEncodingAttr":$dotOpEnc,
"ArrayRef<int64_t>":$shape,
"ArrayRef<unsigned>":$order,
"CTALayoutAttr":$CTALayout,
"Type":$eltTy,
"bool":$needTrans), [{
unsigned bitwidth = eltTy.getIntOrFloatBitWidth();
return get(context, dotOpEnc, shape, order, CTALayout, bitwidth, needTrans);
}]>,
AttrBuilder<(ins "ArrayRef<int64_t>":$shape,
"ArrayRef<unsigned>":$order,
"CTALayoutAttr":$CTALayout,

View File

@@ -16,7 +16,6 @@ def TritonGPU_Dialect : Dialect {
let dependentDialects = [
"triton::TritonDialect",
"mlir::triton::nvgpu::NVGPUDialect",
"mlir::gpu::GPUDialect",
"tensor::TensorDialect",
];

View File

@@ -28,7 +28,7 @@ def TTG_ConvertLayoutOp : TTG_Op<"convert_layout",
let results = (outs TT_Tensor:$result);
let hasCanonicalizeMethod = 1;
let hasCanonicalizer = 1;
let assemblyFormat = "$src attr-dict `:` functional-type(operands, results)";
}

View File

@@ -141,13 +141,6 @@ Value linearize(OpBuilder &b, Location loc, ArrayRef<Value> multiDim,
Value linearize(OpBuilder &b, Location loc, ArrayRef<Value> multiDim,
ArrayRef<unsigned> shape);
// Returns null if the op is not inside an agent region (warp specialization
// mode). Note that there should be at most one agent id attached to the
// operation.
std::optional<int> getWSAgentId(Operation *op);
std::optional<int> getWSRoleId(Operation *op);
void setRoleId(Operation *op, int roleId);
} // namespace mlir
#endif // TRITON_DIALECT_TRITONGPU_TRANSFORMS_UTILITY_H_

View File

@@ -1,3 +1,5 @@
set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR})
set(LLVM_TARGET_DEFINITIONS TritonNvidiaGPUOps.td)
mlir_tablegen(Dialect.h.inc -gen-dialect-decls -dialect=triton_nvidia_gpu)
mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs -dialect=triton_nvidia_gpu)
@@ -5,6 +7,8 @@ mlir_tablegen(Ops.h.inc -gen-op-decls)
mlir_tablegen(Ops.cpp.inc -gen-op-defs)
mlir_tablegen(Types.h.inc -gen-typedef-decls -typedefs-dialect=triton_nvidia_gpu)
mlir_tablegen(Types.cpp.inc -gen-typedef-defs -typedefs-dialect=triton_nvidia_gpu)
add_mlir_doc(TritonNvidiaGPUDialect TritonNvidiaGPUDialect dialects/ -gen-dialect-doc)
add_mlir_doc(TritonNvidiaGPUOps TritonNvidiaGPUOps dialects/ -gen-op-doc)
add_public_tablegen_target(TritonNvidiaGPUTableGen)
set(LLVM_TARGET_DEFINITIONS TritonNvidiaGPUAttrDefs.td)

View File

@@ -30,7 +30,6 @@
#include "mlir/IR/Dialect.h"
// TritonNvidiaGPU depends on Triton
#include "triton/Dialect/NVGPU/IR/Dialect.h"
#include "triton/Dialect/Triton/IR/Dialect.h"
#include "triton/Dialect/TritonGPU/IR/Traits.h"
#include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h.inc"
@@ -43,4 +42,15 @@
#define GET_OP_CLASSES
#include "triton/Dialect/TritonNvidiaGPU/IR/Ops.h.inc"
namespace mlir {
// Returns null if the op is not inside an agent region (warp specialization
// mode). Note that there should be at most one agent id attached to the
// operation.
std::optional<int> getWSAgentId(Operation *op);
std::optional<int> getWSRoleId(Operation *op);
void setRoleId(Operation *op, int roleId);
} // namespace mlir
#endif // TRITON_DIALECT_TRITONNVIDIAGPU_IR_DIALECT_H_

View File

@@ -38,7 +38,6 @@ def TritonNvidiaGPU_Dialect : Dialect {
let dependentDialects = [
"triton::TritonDialect",
"triton::gpu::TritonGPUDialect",
"mlir::triton::nvgpu::NVGPUDialect",
"mlir::gpu::GPUDialect",
"tensor::TensorDialect",
];

View File

@@ -258,7 +258,11 @@ def TTNG_DotAsyncOp : TTNG_Op<"dot_async", [Pure,
$d = matrix_multiply($a, $b) + $c
}];
let arguments = (ins TT_FpIntTensor:$a, TT_FpIntTensor:$b, TT_FpIntTensor:$c, BoolAttr:$allowTF32);
let arguments = (ins TT_FpIntTensor:$a,
TT_FpIntTensor:$b,
TT_FpIntTensor:$c,
BoolAttr:$allowTF32,
I32Attr:$maxNumImpreciseAcc);
let results = (outs TT_FpIntTensor:$d);

View File

@@ -30,7 +30,7 @@
namespace triton {
const std::set<std::string> ENV_VARS = {
"ENABLE_MMA_V3", "TRITON_DISABLE_LINE_INFO", "DISABLE_FAST_REDUCTION",
"DISABLE_MMA_V3", "TRITON_DISABLE_LINE_INFO", "DISABLE_FAST_REDUCTION",
"ENABLE_TMA", "MLIR_ENABLE_DUMP", "LLVM_IR_ENABLE_DUMP",
"AMDGCN_ENABLE_DUMP"};