Merge commit 'cb3d79a185e40c9d8a579bea07747a8a8d157d52' into ifu-231117

Conflicts: lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVM.cpp lib/Dialect/TritonGPU/IR/Dialect.cpp python/setup.py python/test/unit/language/assert_helper.py python/test/unit/operators/test_flash_attention.py python/test/unit/runtime/test_subproc.py python/triton/compiler/compiler.py python/triton/language/semantic.py python/triton/runtime/autotuner.py python/triton/runtime/jit.py python/tutorials/03-matrix-multiplication.py python/tutorials/05-layer-norm.py python/tutorials/06-fused-attention.py python/tutorials/11-grouped-gemm.py test/Conversion/tritongpu_to_llvm.mlir
2026-04-05 03:01:17 -04:00 · 2023-11-17 20:42:12 +00:00
parent e1513b34e1 cb3d79a185
commit 5c87f363e4
179 changed files with 10116 additions and 6835 deletions
--- a/include/triton/Dialect/Triton/IR/TritonAttrDefs.td
+++ b/include/triton/Dialect/Triton/IR/TritonAttrDefs.td
@@ -66,6 +66,16 @@ def TT_AtomicRMWAttr : I32EnumAttr<
    let cppNamespace = "::mlir::triton";
 }

+def TT_MemSyncScopeAttr : I32EnumAttr<
+    "MemSyncScope", "",
+    [
+      I32EnumAttrCase<"GPU", 1, "gpu">,
+      I32EnumAttrCase<"CTA", 2, "cta">,
+      I32EnumAttrCase<"SYSTEM", 3, "sys">,
+    ]> {
+    let cppNamespace = "::mlir::triton";
+}
+
 // Program ID dimensions.
 def TT_ProgramDim : I32EnumAttr<
    "ProgramIDDim", "",
--- a/include/triton/Dialect/Triton/IR/TritonOps.td
+++ b/include/triton/Dialect/Triton/IR/TritonOps.td
@@ -242,7 +242,7 @@ def TT_AtomicRMWOp : TT_Op<"atomic_rmw", [SameOperandsAndResultShape,

    let arguments = (ins TT_AtomicRMWAttr:$atomic_rmw_op, TT_PtrLike:$ptr,
                         TT_Type:$val, Optional<TT_BoolLike>:$mask,
-                         TT_MemSemanticAttr:$sem);
+                         TT_MemSemanticAttr:$sem, TT_MemSyncScopeAttr:$scope);

    let results = (outs TT_Type:$result);
 }
@@ -264,7 +264,7 @@ def TT_AtomicCASOp : TT_Op<"atomic_cas", [MemoryEffects<[MemRead]>,
    }];

    let arguments = (ins TT_PtrLike:$ptr, TT_Type:$cmp, TT_Type:$val,
-                     TT_MemSemanticAttr:$sem);
+                     TT_MemSemanticAttr:$sem, TT_MemSyncScopeAttr:$scope);

    let results = (outs TT_Type:$result);
 }
--- a/include/triton/Dialect/TritonGPU/IR/Dialect.h
+++ b/include/triton/Dialect/TritonGPU/IR/Dialect.h
@@ -109,6 +109,9 @@ bool isSharedEncoding(Value value);

 bool isExpensiveCat(CatOp cat, Attribute targetEncoding);

+// Return true if a view between the two types cannot be implemented as a no-op.
+bool isExpensiveView(Type srcType, Type dstType);
+
 } // namespace gpu
 } // namespace triton
 } // namespace mlir
--- a/include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td
+++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td
@@ -85,54 +85,6 @@ def TTG_AsyncBulkCommitGroupOp : TTG_Op<"async_bulk_commit_group"> {
  }];
 }

-
-// Port Arith_CmpIOp & Arith_CmpFOp & Std_SelectOp to TritonGPU.
-// This is needed because these ops don't
-// handle encodings
-// e.g., https://github.com/llvm/llvm-project/blob/main/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td#L111
-def TTG_CmpIOp : TTG_Op<"cmpi", [Pure, Elementwise,
-                                 SameOperandsAndResultShape,
-                                 SameOperandsAndResultEncoding]> {
-  let summary = "integer comparison operation";
-
-  let description = [{}];
-
-  let arguments = (ins Arith_CmpIPredicateAttr:$predicate,
-                       TT_IntLike:$lhs,
-                       TT_IntLike:$rhs);
-
-  let results = (outs TT_BoolLike:$result);
-}
-
-def TTG_CmpFOp : TTG_Op<"cmpf", [Pure, Elementwise,
-                                 SameOperandsAndResultShape,
-                                 SameOperandsAndResultEncoding]> {
-  let summary = "floating-point comparison operation";
-
-  let description = [{}];
-
-  let arguments = (ins Arith_CmpFPredicateAttr:$predicate,
-                       TT_FloatLike:$lhs,
-                       TT_FloatLike:$rhs);
-
-  let results = (outs TT_BoolLike:$result);
-}
-
-// TODO: migrate to arith::SelectOp on LLVM16
-def TTG_SelectOp : TTG_Op<"select", [Pure, Elementwise,
-                                     SameOperandsAndResultShape,
-                                     SameOperandsAndResultEncoding]> {
-  let summary = "select operation";
-
-  let description = [{}];
-
-  let arguments = (ins TT_BoolLike:$condition,
-                       TT_Tensor:$true_value,
-                       TT_Tensor:$false_value);
-
-  let results = (outs TT_Type:$result);
-}
-
 // TODO[goostavz]: extract a base class for InsertSlice & InsertSliceAsync once the op definition is verified
 def TTG_InsertSliceOp : TTG_Op<"insert_slice",
                               [AttrSizedOperandSegments,
--- a/include/triton/Dialect/TritonGPU/Transforms/Passes.h
+++ b/include/triton/Dialect/TritonGPU/Transforms/Passes.h
@@ -38,6 +38,8 @@ std::unique_ptr<Pass> createTritonGPUOptimizeDotOperandsPass();

 std::unique_ptr<Pass> createTritonGPUOptimizeEpiloguePass();

+std::unique_ptr<Pass> createTritonGPUOptimizeThreadLocalityPass();
+
 /// Generate the code for registering passes.
 #define GEN_PASS_REGISTRATION
 #include "triton/Dialect/TritonGPU/Transforms/Passes.h.inc"
--- a/include/triton/Dialect/TritonGPU/Transforms/Passes.td
+++ b/include/triton/Dialect/TritonGPU/Transforms/Passes.td
@@ -162,6 +162,19 @@ def TritonGPUOptimizeEpilogue : Pass<"tritongpu-optimize-epilogue", "mlir::Modul

 }

+def TritonGPUOptimizeThreadLocality : Pass<"tritongpu-optimize-thread-locality", "mlir::ModuleOp"> {
+  let summary = "Reduce the cost of synchronization between threads in an SM";
+
+  let description = [{
+    Today, this optimizes reduction yielded by loop to be thread-local until after the loop completes.
+  }];
+
+  let constructor = "mlir::createTritonGPUOptimizeThreadLocalityPass()";
+
+  let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect",
+                           "mlir::triton::TritonDialect"];
+}
+
 def TritonGPUReorderInstructions: Pass<"tritongpu-reorder-instructions", "mlir::ModuleOp"> {
  let summary = "Reorder instructions";

--- a/include/triton/Dialect/TritonGPU/Transforms/Utility.h
+++ b/include/triton/Dialect/TritonGPU/Transforms/Utility.h
@@ -111,6 +111,11 @@ bool isExpensiveLoadOrStore(Operation *op);

 bool canFoldIntoConversion(Operation *op, Attribute targetEncoding);

+// Replace ForOp with a new ForOp with extra operands. The YieldOp is not
+// updated and needs to be updated separately for the loop to be correct.
+scf::ForOp replaceForOpWithNewSignature(OpBuilder &rewriter, scf::ForOp loop,
+                                        ValueRange newIterOperands);
+
 Operation *cloneWithInferType(mlir::OpBuilder &rewriter, Operation *op,
                              IRMapping &mapping);

@@ -140,7 +145,6 @@ Value linearize(OpBuilder &b, Location loc, ArrayRef<Value> multiDim,

 Value linearize(OpBuilder &b, Location loc, ArrayRef<Value> multiDim,
                ArrayRef<unsigned> shape);
-
 } // namespace mlir

 #endif // TRITON_DIALECT_TRITONGPU_TRANSFORMS_UTILITY_H_
--- a/include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td
+++ b/include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td
@@ -270,16 +270,16 @@ def TTNG_DotAsyncOp : TTNG_Op<"dot_async", [Pure,
 }

 def TTNG_DotWaitOp : TTNG_Op<"dot_wait", [DeclareOpInterfaceMethods<InferTypeOpInterface>,
-                                          AllTypesMatch<["input", "output"]>]> {
+                                          AllTypesMatch<["inputs", "outputs"]>]> {
  let summary = "dot wait";
-  let arguments = (ins TT_FpIntTensor:$input, I32Attr:$pendings);
-  let results = (outs TT_FpIntTensor:$output);
+  let arguments = (ins Variadic<TT_FpIntTensor>:$inputs, I32Attr:$pendings);
+  let results = (outs Variadic<TT_FpIntTensor>:$outputs);
  let description = [{
    This operation defining the waiting action for a async dot, MMAv3 .e.g.
    The subsequent operations should not execute until this operation completes waiting.
  }];

-  let assemblyFormat = "$input attr-dict `:` type($input)";
+  let assemblyFormat = "$inputs attr-dict `:` type($inputs)";
 }

 def TTNG_StoreAsyncOp : TTNG_Op<"store_async",