mirror of
https://github.com/ROCm/ROCm.git
synced 2026-04-05 03:01:17 -04:00
Merge commit '5df904233c11a65bd131ead7268f84cca7804275' into ifu230810-2
Conflicts: include/triton/Dialect/Triton/Transforms/Passes.h include/triton/Dialect/TritonGPU/IR/Dialect.h include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td lib/Analysis/Allocation.cpp lib/Analysis/Utility.cpp lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp lib/Conversion/TritonGPUToLLVM/ReduceOpToLLVM.cpp lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVM.cpp lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVMPass.cpp lib/Dialect/Triton/Transforms/RewriteTensorPointer.cpp lib/Dialect/TritonGPU/Transforms/RemoveLayoutConversions.cpp lib/Dialect/TritonGPU/Transforms/ReorderInstructions.cpp lib/Target/LLVMIR/LLVMIRTranslation.cpp python/src/triton.cc python/triton/compiler/compiler.py python/triton/ops/flash_attention.py python/triton/runtime/autotuner.py python/triton/runtime/jit.py python/triton/tools/aot.py python/tutorials/06-fused-attention.py test/Conversion/tritongpu_to_llvm.mlir test/Target/tritongpu_to_llvmir.mlir test/Target/tritongpu_to_llvmir_noinline.mlir
This commit is contained in:
@@ -183,6 +183,13 @@ private:
|
||||
ReduceOpHelper helper(reduceOp);
|
||||
unsigned bytes = helper.getScratchSizeInBytes();
|
||||
maybeAddScratchBuffer<BufferT::BufferKind::Scratch>(op, bytes);
|
||||
<<<<<<< HEAD
|
||||
=======
|
||||
} else if (auto scanOp = dyn_cast<triton::ScanOp>(op)) {
|
||||
ScanLoweringHelper helper(scanOp);
|
||||
unsigned bytes = helper.getScratchSizeInBytes();
|
||||
maybeAddScratchBuffer<BufferT::BufferKind::Scratch>(op, bytes);
|
||||
>>>>>>> 5df904233c11a65bd131ead7268f84cca7804275
|
||||
} else if (auto cvtLayout = dyn_cast<triton::gpu::ConvertLayoutOp>(op)) {
|
||||
auto srcTy = cvtLayout.getSrc().getType().cast<RankedTensorType>();
|
||||
auto dstTy = cvtLayout.getResult().getType().cast<RankedTensorType>();
|
||||
@@ -393,10 +400,19 @@ private:
|
||||
DenseMap<BufferT *, size_t> bufferStart;
|
||||
calculateStarts(buffers, bufferStart);
|
||||
|
||||
// NOTE: The original paper doesn't consider interference between
|
||||
// the bumped ranges. Buffers that previously did not interfere
|
||||
// could interfere after offset bumping if their liveness ranges overlap.
|
||||
// Therefore, we rerun the interference graph algorithm after bumping so
|
||||
// that we regroup the buffers and color them again. Since we always
|
||||
// increase the buffer offset and keep reducing conflicts, we will
|
||||
// eventually reach a fixed point.
|
||||
GraphT interference;
|
||||
buildInterferenceGraph(buffers, bufferStart, interference);
|
||||
|
||||
allocate(buffers, bufferStart, interference);
|
||||
do {
|
||||
allocate(buffers, interference, bufferStart);
|
||||
buildInterferenceGraph(buffers, bufferStart, interference);
|
||||
} while (!interference.empty());
|
||||
}
|
||||
|
||||
/// Computes the initial shared memory offsets.
|
||||
@@ -462,6 +478,8 @@ private:
|
||||
void buildInterferenceGraph(const SmallVector<BufferT *> &buffers,
|
||||
const DenseMap<BufferT *, size_t> &bufferStart,
|
||||
GraphT &interference) {
|
||||
// Reset interference graph
|
||||
interference.clear();
|
||||
for (auto x : buffers) {
|
||||
for (auto y : buffers) {
|
||||
if (x == y)
|
||||
@@ -484,8 +502,10 @@ private:
|
||||
|
||||
/// Finalizes shared memory offsets considering interference.
|
||||
void allocate(const SmallVector<BufferT *> &buffers,
|
||||
const DenseMap<BufferT *, size_t> &bufferStart,
|
||||
const GraphT &interference) {
|
||||
const GraphT &interference,
|
||||
DenseMap<BufferT *, size_t> &bufferStart) {
|
||||
// Reset shared memory size
|
||||
allocation->sharedMemorySize = 0;
|
||||
// First-fit graph coloring
|
||||
// Neighbors are nodes that interfere with each other.
|
||||
// We color a node by finding the index of the first available
|
||||
@@ -519,6 +539,7 @@ private:
|
||||
adj = std::max(adj, bufferStart.lookup(y) + y->size);
|
||||
}
|
||||
x->offset = bufferStart.lookup(x) + colors.lookup(x) * adj;
|
||||
bufferStart[x] = x->offset;
|
||||
allocation->sharedMemorySize =
|
||||
std::max(allocation->sharedMemorySize, x->offset + x->size);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user