Merge commit 'ac9fa68d18c777e421bd3f6fb1ddcfd60b6fda33' into ifu-rebase-again

Conflicts:
	.gitignore
	.gitmodules
	README.md
	bin/triton-translate.cpp
	include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
	include/triton/Target/AMDGCN/AMDGCNTranslation.h
	include/triton/Target/HSACO/HSACOTranslation.h
	lib/Analysis/Allocation.cpp
	lib/Analysis/Utility.cpp
	lib/Conversion/TritonGPUToLLVM/CMakeLists.txt
	lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp
	lib/Conversion/TritonGPUToLLVM/ReduceOpToLLVM.cpp
	lib/Conversion/TritonGPUToLLVM/ScanOpToLLVM.cpp
	lib/Conversion/TritonGPUToLLVM/Utility.cpp
	lib/Conversion/TritonGPUToLLVM/Utility.h
	lib/Dialect/TritonGPU/IR/Dialect.cpp
	lib/Dialect/TritonGPU/Transforms/RemoveLayoutConversions.cpp
	lib/Target/HSACO/CMakeLists.txt
	lib/Target/HSACO/HSACOTranslation.cpp
	lib/Target/LLVMIR/LLVMIRTranslation.cpp
	python/src/triton.cc
	python/test/unit/language/test_core.py
	python/test/unit/operators/test_flash_attention.py
	python/triton/compiler/compiler.py
	python/triton/compiler/make_launcher.py
	python/triton/language/semantic.py
	python/triton/runtime/jit.py
	python/tutorials/06-fused-attention.py
	python/tutorials/11-grouped-gemm.py
	test/Conversion/tritongpu_to_llvm.mlir
This commit is contained in:
Jason Furmanek
2023-11-06 23:10:10 +00:00
161 changed files with 6530 additions and 3905 deletions

View File

@@ -21,6 +21,7 @@ class AllocationAnalysis;
SmallVector<unsigned>
getScratchConfigForCvtLayout(triton::gpu::ConvertLayoutOp op, unsigned &inVec,
unsigned &outVec);
SmallVector<unsigned> getRepShapeForCvtLayout(triton::gpu::ConvertLayoutOp op);
} // namespace triton

View File

@@ -36,7 +36,9 @@ public:
triton::ReduceOp getOperation() { return op; }
bool isFastReduction();
bool isReductionOnLayoutFastAxis();
unsigned getThreadOffsetOnReductionAxis();
bool isWarpSynchronous();
@@ -50,14 +52,16 @@ public:
unsigned getThreadsReductionAxis();
SmallVector<unsigned> getScratchConfigBasic();
SmallVector<SmallVector<unsigned>> getScratchConfigsFast();
SmallVector<unsigned> getScratchConfig();
unsigned getScratchSizeInBytes();
bool isSupportedLayout();
bool isReduceWithinCTA();
unsigned getAxis() { return axis; }
private:
triton::ReduceOp op;
ArrayRef<int64_t> srcShape;
@@ -84,8 +88,12 @@ public:
unsigned getNonAxisNumThreadsPerCTA();
// Return the number of warps per CTA along axis dim.
unsigned getAxisNumWarps();
// Return the number of warps per CTA along axis dim with unique data.
unsigned getAxisNumWarpsWithUniqueData();
// Return the number of threads per warp along axis dim.
unsigned getAxisNumThreadsPerWarp();
// Return the number of threads per warp along axis dim with unique data.
unsigned getAxisNumThreadsPerWarpWithUniqueData();
// Return the number of blocks along axis dim.
unsigned getAxisNumBlocks();
// Return the number of blocks along non axis dim.
@@ -103,6 +111,7 @@ public:
Location getLoc() { return scanOp.getLoc(); }
unsigned getAxis() { return scanOp.getAxis(); }
triton::gpu::BlockedEncodingAttr getEncoding();
llvm::ArrayRef<int64_t> getShape();
Region &getCombineOp();
private:
@@ -128,6 +137,10 @@ bool isMmaToDotShortcut(RankedTensorType &srcTy, RankedTensorType &dstTy);
bool isMmaToMmaShortcut(RankedTensorType &srcTy, RankedTensorType &dstTy);
// Return true if the src and dst layout match.
bool matchMmaV3AndDotOperandLayout(RankedTensorType srcTy,
RankedTensorType dstTy);
// TODO: Move utility functions that belong to ConvertLayoutOp to class
// ConvertLayoutOpHelper in the future
bool shouldUseDistSmem(Attribute srcLayout, Attribute dstLayout);

View File

@@ -27,9 +27,6 @@ def ConvertTritonGPUToLLVM : Pass<"convert-triton-gpu-to-llvm", "mlir::ModuleOp"
Option<"computeCapability", "compute-capability",
"int32_t", /*default*/"80",
"device compute capability">,
Option<"tmaMetadata", "tma-metadata",
"mlir::triton::gpu::TMAMetadataTy*", /*default*/"nullptr",
"tma metadata to the runtime">,
Option<"target", "target", "enum Target", "mlir::triton::Target::Default",
"compile for target compatible LLVM",
"llvm::cl::values("

View File

@@ -21,7 +21,8 @@ enum Target { NVVM, ROCDL, Default = NVVM };
std::unique_ptr<OperationPass<ModuleOp>> createConvertTritonGPUToLLVMPass();
std::unique_ptr<OperationPass<ModuleOp>>
createConvertTritonGPUToLLVMPass(const ConvertTritonGPUToLLVMOptions &options);
createConvertTritonGPUToLLVMPass(int32_t computeCapability, Target target,
mlir::triton::gpu::TMAMetadataTy *tmaMetadata);
} // namespace triton

View File

@@ -1,3 +1,5 @@
set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR})
set(LLVM_TARGET_DEFINITIONS NVGPUOps.td)
mlir_tablegen(Dialect.h.inc -gen-dialect-decls -dialect=nvgpu)
mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs -dialect=nvgpu)
@@ -6,6 +8,8 @@ mlir_tablegen(Ops.h.inc -gen-op-decls)
mlir_tablegen(Ops.cpp.inc -gen-op-defs)
mlir_tablegen(OpsEnums.h.inc -gen-enum-decls)
mlir_tablegen(OpsEnums.cpp.inc -gen-enum-defs)
add_mlir_doc(NVGPUDialect NVGPUDialect dialects/ -gen-dialect-doc)
add_mlir_doc(NVGPUOps NVGPUOps dialects/ -gen-op-doc)
add_public_tablegen_target(NVGPUTableGen)
set(LLVM_TARGET_DEFINITIONS NVGPUAttrDefs.td)

View File

@@ -97,7 +97,15 @@ def WGMMADesc_ModeAttr : I32EnumAttr<"WGMMADescMode",
}
def NVGPU_WGMMADescCreateOp : NVGPU_Op<"wgmma_desc_create", []> {
let arguments = (ins LLVM_AnyPointer:$buffer, I32:$height, WGMMADesc_ModeAttr:$mode);
let arguments = (ins LLVM_AnyPointer:$buffer, I32:$height, WGMMADesc_ModeAttr:$mode, I64Attr:$swizzling);
let builders = [
OpBuilder<(ins "Value":$buffer,
"Value":$height,
"WGMMADescMode":$mode), [{
uint32_t mode_ = static_cast<uint32_t>(mode);
uint64_t swizzling = (mode_ == 1 ? 128 : mode_ == 2 ? 64 : 32);
build($_builder, $_state, $_builder.getIntegerType(64), buffer, height, WGMMADescModeAttr::get($_builder.getContext(), mode), $_builder.getI64IntegerAttr(swizzling));
}]>];
let results = (outs I64:$res);
let assemblyFormat = "$buffer `,` $height attr-dict `:` functional-type(operands, results)";
}
@@ -140,12 +148,12 @@ def WGMMA_EltTypeAttr : I32EnumAttr<"WGMMAEltType",
def WGMMA_OperandType : AnyTypeOf<[LLVM_AnyStruct, I64], "wgmma operand A/B type">;
def NVGPU_WGMMAOp : NVGPU_Op<"wgmma", []> {
let arguments = (ins WGMMA_OperandType:$opA, WGMMA_OperandType:$opB, LLVM_AnyStruct:$opC,
let arguments = (ins WGMMA_OperandType:$opA, WGMMA_OperandType:$opB, Optional<LLVM_AnyStruct>:$opC,
I32Attr:$m, I32Attr:$n, I32Attr:$k,
WGMMA_EltTypeAttr:$eltTypeC, WGMMA_EltTypeAttr:$eltTypeA, WGMMA_EltTypeAttr:$eltTypeB,
WGMMA_LayoutAttr:$layoutA, WGMMA_LayoutAttr:$layoutB);
let results = (outs LLVM_AnyStruct:$res);
let assemblyFormat = "$opA `,` $opB `,` $opC attr-dict `:` functional-type(operands, $res)";
let assemblyFormat = "$opA `,` $opB (`,` $opC^)? attr-dict `:` functional-type(operands, $res)";
}
def NVGPU_CGABarrierSyncOp : NVGPU_Op<"cga_barrier_sync", []> {

View File

@@ -1,12 +1,16 @@
set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR})
set(LLVM_TARGET_DEFINITIONS TritonOps.td)
mlir_tablegen(Ops.h.inc -gen-op-decls)
mlir_tablegen(Ops.cpp.inc -gen-op-defs)
mlir_tablegen(OpsEnums.h.inc -gen-enum-decls)
mlir_tablegen(OpsEnums.cpp.inc -gen-enum-defs)
add_mlir_doc(TritonOps TritonOps dialects/ -gen-op-doc)
set(LLVM_TARGET_DEFINITIONS TritonDialect.td)
mlir_tablegen(Dialect.h.inc -gen-dialect-decls)
mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs)
add_mlir_doc(TritonDialect TritonDialect dialects/ -gen-dialect-doc)
set(LLVM_TARGET_DEFINITIONS TritonTypes.td)
mlir_tablegen(Types.h.inc -gen-typedef-decls)

View File

@@ -394,7 +394,12 @@ def TT_DotOp : TT_Op<"dot", [Pure,
$d = matrix_multiply($a, $b) + $c
}];
let arguments = (ins TT_FpIntTensor:$a, TT_FpIntTensor:$b, TT_FpIntTensor:$c, BoolAttr:$allowTF32);
let arguments = (ins
TT_FpIntTensor:$a,
TT_FpIntTensor:$b,
TT_FpIntTensor:$c,
BoolAttr:$allowTF32,
I32Attr:$maxNumImpreciseAcc);
let results = (outs TT_FpIntTensor:$d);

View File

@@ -1,3 +1,5 @@
set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR})
set(LLVM_TARGET_DEFINITIONS TritonGPUOps.td)
mlir_tablegen(Dialect.h.inc -gen-dialect-decls -dialect=triton_gpu)
mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs -dialect=triton_gpu)
@@ -5,6 +7,8 @@ mlir_tablegen(Ops.h.inc -gen-op-decls)
mlir_tablegen(Ops.cpp.inc -gen-op-defs)
mlir_tablegen(Types.h.inc -gen-typedef-decls -typedefs-dialect=triton_gpu)
mlir_tablegen(Types.cpp.inc -gen-typedef-defs -typedefs-dialect=triton_gpu)
add_mlir_doc(TritonGPUDialect TritonGPUDialect dialects/ -gen-dialect-doc)
add_mlir_doc(TritonGPUOps TritonGPUOps dialects/ -gen-op-doc)
add_public_tablegen_target(TritonGPUTableGen)
set(LLVM_TARGET_DEFINITIONS TritonGPUAttrDefs.td)

View File

@@ -7,7 +7,6 @@
#include "mlir/IR/Dialect.h"
// TritonGPU depends on Triton
#include "triton/Dialect/NVGPU/IR/Dialect.h"
#include "triton/Dialect/Triton/IR/Dialect.h"
#include "triton/Dialect/TritonGPU/IR/Attributes.h"
#include "triton/Dialect/TritonGPU/IR/Dialect.h.inc"

View File

@@ -113,6 +113,7 @@ compared to 1*64 when the hasLeadingOffset is false.
"ArrayRef<unsigned>":$order,
"CTALayoutAttr":$CTALayout,
"unsigned":$typeWidthInBit), [{
<<<<<<< HEAD
#ifdef USE_ROCM
// ---- begin GFX908/GFX90A ----
@@ -155,6 +156,18 @@ compared to 1*64 when the hasLeadingOffset is false.
}
}
#endif
=======
bool needTrans = false; // default value
return get(context, dotOpEnc, shape, order, CTALayout, typeWidthInBit, needTrans);
}]>,
AttrBuilder<(ins "DotOperandEncodingAttr":$dotOpEnc,
"ArrayRef<int64_t>":$shape,
"ArrayRef<unsigned>":$order,
"CTALayoutAttr":$CTALayout,
"unsigned":$typeWidthInBit,
"bool":$needTrans), [{
>>>>>>> ac9fa68d18c777e421bd3f6fb1ddcfd60b6fda33
auto mmaEnc = dotOpEnc.getParent().dyn_cast<MmaEncodingAttr>();
if(!mmaEnc)
@@ -194,16 +207,23 @@ compared to 1*64 when the hasLeadingOffset is false.
// --- handle A operand ---
if (opIdx == 0) { // compute swizzling for A operand
int vec = (order[0] == 1) ? matShape[2] : matShape[0]; // k : m
int mmaStride = (order[0] == 1) ? matShape[0] : matShape[2];
int m = (needTrans) ? matShape[2] : matShape[0];
int k = (needTrans) ? matShape[0] : matShape[2];
int vec = (order[0] == 1) ? k : m;
int mmaStride = (order[0] == 1) ? m : k;
int maxPhase = mmaStride / perPhase;
return get(context, vec, perPhase, maxPhase, order, CTALayout);
}
// --- handle B operand ---
if (opIdx == 1) {
int vec = (order[0] == 1) ? matShape[1] : matShape[2]; // n : k
int mmaStride = (order[0] == 1) ? matShape[2] : matShape[1];
// We compute vec and maxPhase from the m, n and k sizes of the mma
// instruction. When the matmul operands are transposed, we must take
// that into account to get the correct m, n and k.
int n = needTrans ? matShape[2] : matShape[1];
int k = needTrans ? matShape[1] : matShape[2];
int vec = (order[0] == 1) ? n : k;
int mmaStride = (order[0] == 1) ? k : n;
int maxPhase = mmaStride / perPhase;
return get(context, vec, perPhase, maxPhase, order, CTALayout);
}
@@ -231,6 +251,16 @@ compared to 1*64 when the hasLeadingOffset is false.
return get(context, dotOpEnc, shape, order, CTALayout, bitwidth);
}]>,
AttrBuilder<(ins "DotOperandEncodingAttr":$dotOpEnc,
"ArrayRef<int64_t>":$shape,
"ArrayRef<unsigned>":$order,
"CTALayoutAttr":$CTALayout,
"Type":$eltTy,
"bool":$needTrans), [{
unsigned bitwidth = eltTy.getIntOrFloatBitWidth();
return get(context, dotOpEnc, shape, order, CTALayout, bitwidth, needTrans);
}]>,
AttrBuilder<(ins "ArrayRef<int64_t>":$shape,
"ArrayRef<unsigned>":$order,
"CTALayoutAttr":$CTALayout,

View File

@@ -16,7 +16,6 @@ def TritonGPU_Dialect : Dialect {
let dependentDialects = [
"triton::TritonDialect",
"mlir::triton::nvgpu::NVGPUDialect",
"mlir::gpu::GPUDialect",
"tensor::TensorDialect",
];

View File

@@ -28,7 +28,7 @@ def TTG_ConvertLayoutOp : TTG_Op<"convert_layout",
let results = (outs TT_Tensor:$result);
let hasCanonicalizeMethod = 1;
let hasCanonicalizer = 1;
let assemblyFormat = "$src attr-dict `:` functional-type(operands, results)";
}

View File

@@ -141,13 +141,6 @@ Value linearize(OpBuilder &b, Location loc, ArrayRef<Value> multiDim,
Value linearize(OpBuilder &b, Location loc, ArrayRef<Value> multiDim,
ArrayRef<unsigned> shape);
// Returns null if the op is not inside an agent region (warp specialization
// mode). Note that there should be at most one agent id attached to the
// operation.
std::optional<int> getWSAgentId(Operation *op);
std::optional<int> getWSRoleId(Operation *op);
void setRoleId(Operation *op, int roleId);
} // namespace mlir
#endif // TRITON_DIALECT_TRITONGPU_TRANSFORMS_UTILITY_H_

View File

@@ -1,3 +1,5 @@
set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR})
set(LLVM_TARGET_DEFINITIONS TritonNvidiaGPUOps.td)
mlir_tablegen(Dialect.h.inc -gen-dialect-decls -dialect=triton_nvidia_gpu)
mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs -dialect=triton_nvidia_gpu)
@@ -5,6 +7,8 @@ mlir_tablegen(Ops.h.inc -gen-op-decls)
mlir_tablegen(Ops.cpp.inc -gen-op-defs)
mlir_tablegen(Types.h.inc -gen-typedef-decls -typedefs-dialect=triton_nvidia_gpu)
mlir_tablegen(Types.cpp.inc -gen-typedef-defs -typedefs-dialect=triton_nvidia_gpu)
add_mlir_doc(TritonNvidiaGPUDialect TritonNvidiaGPUDialect dialects/ -gen-dialect-doc)
add_mlir_doc(TritonNvidiaGPUOps TritonNvidiaGPUOps dialects/ -gen-op-doc)
add_public_tablegen_target(TritonNvidiaGPUTableGen)
set(LLVM_TARGET_DEFINITIONS TritonNvidiaGPUAttrDefs.td)

View File

@@ -30,7 +30,6 @@
#include "mlir/IR/Dialect.h"
// TritonNvidiaGPU depends on Triton
#include "triton/Dialect/NVGPU/IR/Dialect.h"
#include "triton/Dialect/Triton/IR/Dialect.h"
#include "triton/Dialect/TritonGPU/IR/Traits.h"
#include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h.inc"
@@ -43,4 +42,15 @@
#define GET_OP_CLASSES
#include "triton/Dialect/TritonNvidiaGPU/IR/Ops.h.inc"
namespace mlir {
// Returns null if the op is not inside an agent region (warp specialization
// mode). Note that there should be at most one agent id attached to the
// operation.
std::optional<int> getWSAgentId(Operation *op);
std::optional<int> getWSRoleId(Operation *op);
void setRoleId(Operation *op, int roleId);
} // namespace mlir
#endif // TRITON_DIALECT_TRITONNVIDIAGPU_IR_DIALECT_H_

View File

@@ -38,7 +38,6 @@ def TritonNvidiaGPU_Dialect : Dialect {
let dependentDialects = [
"triton::TritonDialect",
"triton::gpu::TritonGPUDialect",
"mlir::triton::nvgpu::NVGPUDialect",
"mlir::gpu::GPUDialect",
"tensor::TensorDialect",
];

View File

@@ -258,7 +258,11 @@ def TTNG_DotAsyncOp : TTNG_Op<"dot_async", [Pure,
$d = matrix_multiply($a, $b) + $c
}];
let arguments = (ins TT_FpIntTensor:$a, TT_FpIntTensor:$b, TT_FpIntTensor:$c, BoolAttr:$allowTF32);
let arguments = (ins TT_FpIntTensor:$a,
TT_FpIntTensor:$b,
TT_FpIntTensor:$c,
BoolAttr:$allowTF32,
I32Attr:$maxNumImpreciseAcc);
let results = (outs TT_FpIntTensor:$d);

View File

@@ -30,7 +30,7 @@
namespace triton {
const std::set<std::string> ENV_VARS = {
"ENABLE_MMA_V3", "TRITON_DISABLE_LINE_INFO", "DISABLE_FAST_REDUCTION",
"DISABLE_MMA_V3", "TRITON_DISABLE_LINE_INFO", "DISABLE_FAST_REDUCTION",
"ENABLE_TMA", "MLIR_ENABLE_DUMP", "LLVM_IR_ENABLE_DUMP",
"AMDGCN_ENABLE_DUMP"};