Merge commit 'cb3d79a185e40c9d8a579bea07747a8a8d157d52' into ifu-231117

Conflicts:
	lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp
	lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVM.cpp
	lib/Dialect/TritonGPU/IR/Dialect.cpp
	python/setup.py
	python/test/unit/language/assert_helper.py
	python/test/unit/operators/test_flash_attention.py
	python/test/unit/runtime/test_subproc.py
	python/triton/compiler/compiler.py
	python/triton/language/semantic.py
	python/triton/runtime/autotuner.py
	python/triton/runtime/jit.py
	python/tutorials/03-matrix-multiplication.py
	python/tutorials/05-layer-norm.py
	python/tutorials/06-fused-attention.py
	python/tutorials/11-grouped-gemm.py
	test/Conversion/tritongpu_to_llvm.mlir
This commit is contained in:
Jason Furmanek
2023-11-17 20:42:12 +00:00
179 changed files with 10116 additions and 6835 deletions

View File

@@ -66,6 +66,16 @@ def TT_AtomicRMWAttr : I32EnumAttr<
let cppNamespace = "::mlir::triton";
}
// I32EnumAttr named `MemSyncScope` with three cases:
//   GPU = 1 ("gpu"), CTA = 2 ("cta"), SYSTEM = 3 ("sys").
// NOTE(review): the case names suggest these are memory synchronization
// scopes for atomic operations -- confirm against the ops that take a
// `TT_MemSyncScopeAttr` argument.
def TT_MemSyncScopeAttr : I32EnumAttr<
"MemSyncScope", "",
[
I32EnumAttrCase<"GPU", 1, "gpu">,
I32EnumAttrCase<"CTA", 2, "cta">,
I32EnumAttrCase<"SYSTEM", 3, "sys">,
]> {
// C++ namespace used for the generated enum.
let cppNamespace = "::mlir::triton";
}
// Program ID dimensions.
def TT_ProgramDim : I32EnumAttr<
"ProgramIDDim", "",

View File

@@ -242,7 +242,7 @@ def TT_AtomicRMWOp : TT_Op<"atomic_rmw", [SameOperandsAndResultShape,
let arguments = (ins TT_AtomicRMWAttr:$atomic_rmw_op, TT_PtrLike:$ptr,
TT_Type:$val, Optional<TT_BoolLike>:$mask,
TT_MemSemanticAttr:$sem);
TT_MemSemanticAttr:$sem, TT_MemSyncScopeAttr:$scope);
let results = (outs TT_Type:$result);
}
@@ -264,7 +264,7 @@ def TT_AtomicCASOp : TT_Op<"atomic_cas", [MemoryEffects<[MemRead]>,
}];
let arguments = (ins TT_PtrLike:$ptr, TT_Type:$cmp, TT_Type:$val,
TT_MemSemanticAttr:$sem);
TT_MemSemanticAttr:$sem, TT_MemSyncScopeAttr:$scope);
let results = (outs TT_Type:$result);
}

View File

@@ -109,6 +109,9 @@ bool isSharedEncoding(Value value);
bool isExpensiveCat(CatOp cat, Attribute targetEncoding);
// Return true if a view between the two types cannot be implemented as a no-op.
bool isExpensiveView(Type srcType, Type dstType);
} // namespace gpu
} // namespace triton
} // namespace mlir

View File

@@ -85,54 +85,6 @@ def TTG_AsyncBulkCommitGroupOp : TTG_Op<"async_bulk_commit_group"> {
}];
}
// Port Arith_CmpIOp & Arith_CmpFOp & Std_SelectOp to TritonGPU.
// This is needed because these ops don't
// handle encodings
// e.g., https://github.com/llvm/llvm-project/blob/main/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td#L111
// `cmpi`: integer comparison op declared through TTG_Op.
// Traits: Pure, Elementwise, SameOperandsAndResultShape and
// SameOperandsAndResultEncoding.
def TTG_CmpIOp : TTG_Op<"cmpi", [Pure, Elementwise,
SameOperandsAndResultShape,
SameOperandsAndResultEncoding]> {
let summary = "integer comparison operation";
let description = [{}];
// $predicate: comparison kind (Arith_CmpIPredicateAttr).
// $lhs / $rhs: the two integer-like operands being compared.
let arguments = (ins Arith_CmpIPredicateAttr:$predicate,
TT_IntLike:$lhs,
TT_IntLike:$rhs);
// $result: bool-like value holding the comparison outcome.
let results = (outs TT_BoolLike:$result);
}
// `cmpf`: floating-point comparison op declared through TTG_Op.
// Traits: Pure, Elementwise, SameOperandsAndResultShape and
// SameOperandsAndResultEncoding.
def TTG_CmpFOp : TTG_Op<"cmpf", [Pure, Elementwise,
SameOperandsAndResultShape,
SameOperandsAndResultEncoding]> {
let summary = "floating-point comparison operation";
let description = [{}];
// $predicate: comparison kind (Arith_CmpFPredicateAttr).
// $lhs / $rhs: the two float-like operands being compared.
let arguments = (ins Arith_CmpFPredicateAttr:$predicate,
TT_FloatLike:$lhs,
TT_FloatLike:$rhs);
// $result: bool-like value holding the comparison outcome.
let results = (outs TT_BoolLike:$result);
}
// TODO: migrate to arith::SelectOp on LLVM16
// `select`: select op declared through TTG_Op.
// Traits: Pure, Elementwise, SameOperandsAndResultShape and
// SameOperandsAndResultEncoding.
def TTG_SelectOp : TTG_Op<"select", [Pure, Elementwise,
SameOperandsAndResultShape,
SameOperandsAndResultEncoding]> {
let summary = "select operation";
let description = [{}];
// $condition: bool-like selector.
// $true_value / $false_value: the two tensor operands to choose between.
// NOTE(review): presumably $true_value is produced where $condition is
// true and $false_value otherwise -- confirm against the lowering.
let arguments = (ins TT_BoolLike:$condition,
TT_Tensor:$true_value,
TT_Tensor:$false_value);
// $result: declared as TT_Type, while the value operands are TT_Tensor.
let results = (outs TT_Type:$result);
}
// TODO[goostavz]: extract a base class for InsertSlice & InsertSliceAsync once the op definition is verified
def TTG_InsertSliceOp : TTG_Op<"insert_slice",
[AttrSizedOperandSegments,

View File

@@ -38,6 +38,8 @@ std::unique_ptr<Pass> createTritonGPUOptimizeDotOperandsPass();
std::unique_ptr<Pass> createTritonGPUOptimizeEpiloguePass();
std::unique_ptr<Pass> createTritonGPUOptimizeThreadLocalityPass();
/// Generate the code for registering passes.
#define GEN_PASS_REGISTRATION
#include "triton/Dialect/TritonGPU/Transforms/Passes.h.inc"

View File

@@ -162,6 +162,19 @@ def TritonGPUOptimizeEpilogue : Pass<"tritongpu-optimize-epilogue", "mlir::Modul
}
// Pass definition for `tritongpu-optimize-thread-locality`, anchored on
// `mlir::ModuleOp`. See the summary and description fields for its purpose.
def TritonGPUOptimizeThreadLocality : Pass<"tritongpu-optimize-thread-locality", "mlir::ModuleOp"> {
let summary = "Reduce the cost of synchronization between threads in an SM";
let description = [{
Today, this optimizes reduction yielded by loop to be thread-local until after the loop completes.
}];
// Factory expression used to construct the pass.
let constructor = "mlir::createTritonGPUOptimizeThreadLocalityPass()";
// Dialects declared as dependencies of this pass.
let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect",
"mlir::triton::TritonDialect"];
}
def TritonGPUReorderInstructions: Pass<"tritongpu-reorder-instructions", "mlir::ModuleOp"> {
let summary = "Reorder instructions";

View File

@@ -111,6 +111,11 @@ bool isExpensiveLoadOrStore(Operation *op);
bool canFoldIntoConversion(Operation *op, Attribute targetEncoding);
// Replace ForOp with a new ForOp with extra operands. The YieldOp is not
// updated and needs to be updated separately for the loop to be correct.
// Returns the new scf::ForOp.
// NOTE(review): `newIterOperands` is presumably the list of extra operands
// added to the new loop -- confirm against the definition.
scf::ForOp replaceForOpWithNewSignature(OpBuilder &rewriter, scf::ForOp loop,
ValueRange newIterOperands);
Operation *cloneWithInferType(mlir::OpBuilder &rewriter, Operation *op,
IRMapping &mapping);
@@ -140,7 +145,6 @@ Value linearize(OpBuilder &b, Location loc, ArrayRef<Value> multiDim,
Value linearize(OpBuilder &b, Location loc, ArrayRef<Value> multiDim,
ArrayRef<unsigned> shape);
} // namespace mlir
#endif // TRITON_DIALECT_TRITONGPU_TRANSFORMS_UTILITY_H_

View File

@@ -270,16 +270,16 @@ def TTNG_DotAsyncOp : TTNG_Op<"dot_async", [Pure,
}
def TTNG_DotWaitOp : TTNG_Op<"dot_wait", [DeclareOpInterfaceMethods<InferTypeOpInterface>,
AllTypesMatch<["input", "output"]>]> {
AllTypesMatch<["inputs", "outputs"]>]> {
let summary = "dot wait";
let arguments = (ins TT_FpIntTensor:$input, I32Attr:$pendings);
let results = (outs TT_FpIntTensor:$output);
let arguments = (ins Variadic<TT_FpIntTensor>:$inputs, I32Attr:$pendings);
let results = (outs Variadic<TT_FpIntTensor>:$outputs);
let description = [{
This operation defines the waiting action for an async dot (e.g., MMAv3).
The subsequent operations should not execute until this operation completes waiting.
}];
let assemblyFormat = "$input attr-dict `:` type($input)";
let assemblyFormat = "$inputs attr-dict `:` type($inputs)";
}
def TTNG_StoreAsyncOp : TTNG_Op<"store_async",