mirror of
https://github.com/ROCm/ROCm.git
synced 2026-04-05 03:01:17 -04:00
Merge commit 'cb3d79a185e40c9d8a579bea07747a8a8d157d52' into ifu-231117
Conflicts: lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVM.cpp lib/Dialect/TritonGPU/IR/Dialect.cpp python/setup.py python/test/unit/language/assert_helper.py python/test/unit/operators/test_flash_attention.py python/test/unit/runtime/test_subproc.py python/triton/compiler/compiler.py python/triton/language/semantic.py python/triton/runtime/autotuner.py python/triton/runtime/jit.py python/tutorials/03-matrix-multiplication.py python/tutorials/05-layer-norm.py python/tutorials/06-fused-attention.py python/tutorials/11-grouped-gemm.py test/Conversion/tritongpu_to_llvm.mlir
This commit is contained in:
@@ -66,6 +66,16 @@ def TT_AtomicRMWAttr : I32EnumAttr<
|
||||
let cppNamespace = "::mlir::triton";
|
||||
}
|
||||
|
||||
def TT_MemSyncScopeAttr : I32EnumAttr<
|
||||
"MemSyncScope", "",
|
||||
[
|
||||
I32EnumAttrCase<"GPU", 1, "gpu">,
|
||||
I32EnumAttrCase<"CTA", 2, "cta">,
|
||||
I32EnumAttrCase<"SYSTEM", 3, "sys">,
|
||||
]> {
|
||||
let cppNamespace = "::mlir::triton";
|
||||
}
|
||||
|
||||
// Program ID dimensions.
|
||||
def TT_ProgramDim : I32EnumAttr<
|
||||
"ProgramIDDim", "",
|
||||
|
||||
@@ -242,7 +242,7 @@ def TT_AtomicRMWOp : TT_Op<"atomic_rmw", [SameOperandsAndResultShape,
|
||||
|
||||
let arguments = (ins TT_AtomicRMWAttr:$atomic_rmw_op, TT_PtrLike:$ptr,
|
||||
TT_Type:$val, Optional<TT_BoolLike>:$mask,
|
||||
TT_MemSemanticAttr:$sem);
|
||||
TT_MemSemanticAttr:$sem, TT_MemSyncScopeAttr:$scope);
|
||||
|
||||
let results = (outs TT_Type:$result);
|
||||
}
|
||||
@@ -264,7 +264,7 @@ def TT_AtomicCASOp : TT_Op<"atomic_cas", [MemoryEffects<[MemRead]>,
|
||||
}];
|
||||
|
||||
let arguments = (ins TT_PtrLike:$ptr, TT_Type:$cmp, TT_Type:$val,
|
||||
TT_MemSemanticAttr:$sem);
|
||||
TT_MemSemanticAttr:$sem, TT_MemSyncScopeAttr:$scope);
|
||||
|
||||
let results = (outs TT_Type:$result);
|
||||
}
|
||||
|
||||
@@ -109,6 +109,9 @@ bool isSharedEncoding(Value value);
|
||||
|
||||
bool isExpensiveCat(CatOp cat, Attribute targetEncoding);
|
||||
|
||||
// Return true if a view between the two types cannot be implemented as a no-op.
|
||||
bool isExpensiveView(Type srcType, Type dstType);
|
||||
|
||||
} // namespace gpu
|
||||
} // namespace triton
|
||||
} // namespace mlir
|
||||
|
||||
@@ -85,54 +85,6 @@ def TTG_AsyncBulkCommitGroupOp : TTG_Op<"async_bulk_commit_group"> {
|
||||
}];
|
||||
}
|
||||
|
||||
|
||||
// Port Arith_CmpIOp & Arith_CmpFOp & Std_SelectOp to TritonGPU.
|
||||
// This is needed because these ops don't
|
||||
// handle encodings
|
||||
// e.g., https://github.com/llvm/llvm-project/blob/main/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td#L111
|
||||
def TTG_CmpIOp : TTG_Op<"cmpi", [Pure, Elementwise,
|
||||
SameOperandsAndResultShape,
|
||||
SameOperandsAndResultEncoding]> {
|
||||
let summary = "integer comparison operation";
|
||||
|
||||
let description = [{}];
|
||||
|
||||
let arguments = (ins Arith_CmpIPredicateAttr:$predicate,
|
||||
TT_IntLike:$lhs,
|
||||
TT_IntLike:$rhs);
|
||||
|
||||
let results = (outs TT_BoolLike:$result);
|
||||
}
|
||||
|
||||
def TTG_CmpFOp : TTG_Op<"cmpf", [Pure, Elementwise,
|
||||
SameOperandsAndResultShape,
|
||||
SameOperandsAndResultEncoding]> {
|
||||
let summary = "floating-point comparison operation";
|
||||
|
||||
let description = [{}];
|
||||
|
||||
let arguments = (ins Arith_CmpFPredicateAttr:$predicate,
|
||||
TT_FloatLike:$lhs,
|
||||
TT_FloatLike:$rhs);
|
||||
|
||||
let results = (outs TT_BoolLike:$result);
|
||||
}
|
||||
|
||||
// TODO: migrate to arith::SelectOp on LLVM16
|
||||
def TTG_SelectOp : TTG_Op<"select", [Pure, Elementwise,
|
||||
SameOperandsAndResultShape,
|
||||
SameOperandsAndResultEncoding]> {
|
||||
let summary = "select operation";
|
||||
|
||||
let description = [{}];
|
||||
|
||||
let arguments = (ins TT_BoolLike:$condition,
|
||||
TT_Tensor:$true_value,
|
||||
TT_Tensor:$false_value);
|
||||
|
||||
let results = (outs TT_Type:$result);
|
||||
}
|
||||
|
||||
// TODO[goostavz]: extract a base class for InsertSlice & InsertSliceAsync once the op definition is verified
|
||||
def TTG_InsertSliceOp : TTG_Op<"insert_slice",
|
||||
[AttrSizedOperandSegments,
|
||||
|
||||
@@ -38,6 +38,8 @@ std::unique_ptr<Pass> createTritonGPUOptimizeDotOperandsPass();
|
||||
|
||||
std::unique_ptr<Pass> createTritonGPUOptimizeEpiloguePass();
|
||||
|
||||
std::unique_ptr<Pass> createTritonGPUOptimizeThreadLocalityPass();
|
||||
|
||||
/// Generate the code for registering passes.
|
||||
#define GEN_PASS_REGISTRATION
|
||||
#include "triton/Dialect/TritonGPU/Transforms/Passes.h.inc"
|
||||
|
||||
@@ -162,6 +162,19 @@ def TritonGPUOptimizeEpilogue : Pass<"tritongpu-optimize-epilogue", "mlir::Modul
|
||||
|
||||
}
|
||||
|
||||
def TritonGPUOptimizeThreadLocality : Pass<"tritongpu-optimize-thread-locality", "mlir::ModuleOp"> {
|
||||
let summary = "Reduce the cost of synchronization between threads in an SM";
|
||||
|
||||
let description = [{
|
||||
Today, this optimizes reductions yielded by a loop to be thread-local until after the loop completes.
|
||||
}];
|
||||
|
||||
let constructor = "mlir::createTritonGPUOptimizeThreadLocalityPass()";
|
||||
|
||||
let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect",
|
||||
"mlir::triton::TritonDialect"];
|
||||
}
|
||||
|
||||
def TritonGPUReorderInstructions: Pass<"tritongpu-reorder-instructions", "mlir::ModuleOp"> {
|
||||
let summary = "Reorder instructions";
|
||||
|
||||
|
||||
@@ -111,6 +111,11 @@ bool isExpensiveLoadOrStore(Operation *op);
|
||||
|
||||
bool canFoldIntoConversion(Operation *op, Attribute targetEncoding);
|
||||
|
||||
// Replace ForOp with a new ForOp with extra operands. The YieldOp is not
|
||||
// updated and needs to be updated separately for the loop to be correct.
|
||||
scf::ForOp replaceForOpWithNewSignature(OpBuilder &rewriter, scf::ForOp loop,
|
||||
ValueRange newIterOperands);
|
||||
|
||||
Operation *cloneWithInferType(mlir::OpBuilder &rewriter, Operation *op,
|
||||
IRMapping &mapping);
|
||||
|
||||
@@ -140,7 +145,6 @@ Value linearize(OpBuilder &b, Location loc, ArrayRef<Value> multiDim,
|
||||
|
||||
Value linearize(OpBuilder &b, Location loc, ArrayRef<Value> multiDim,
|
||||
ArrayRef<unsigned> shape);
|
||||
|
||||
} // namespace mlir
|
||||
|
||||
#endif // TRITON_DIALECT_TRITONGPU_TRANSFORMS_UTILITY_H_
|
||||
|
||||
@@ -270,16 +270,16 @@ def TTNG_DotAsyncOp : TTNG_Op<"dot_async", [Pure,
|
||||
}
|
||||
|
||||
def TTNG_DotWaitOp : TTNG_Op<"dot_wait", [DeclareOpInterfaceMethods<InferTypeOpInterface>,
|
||||
AllTypesMatch<["input", "output"]>]> {
|
||||
AllTypesMatch<["inputs", "outputs"]>]> {
|
||||
let summary = "dot wait";
|
||||
let arguments = (ins TT_FpIntTensor:$input, I32Attr:$pendings);
|
||||
let results = (outs TT_FpIntTensor:$output);
|
||||
let arguments = (ins Variadic<TT_FpIntTensor>:$inputs, I32Attr:$pendings);
|
||||
let results = (outs Variadic<TT_FpIntTensor>:$outputs);
|
||||
let description = [{
|
||||
This operation defines the waiting action for an async dot, e.g. MMAv3.
|
||||
The subsequent operations should not execute until this operation completes waiting.
|
||||
}];
|
||||
|
||||
let assemblyFormat = "$input attr-dict `:` type($input)";
|
||||
let assemblyFormat = "$inputs attr-dict `:` type($inputs)";
|
||||
}
|
||||
|
||||
def TTNG_StoreAsyncOp : TTNG_Op<"store_async",
|
||||
|
||||
Reference in New Issue
Block a user