mirror of
https://github.com/ROCm/ROCm.git
synced 2026-04-05 03:01:17 -04:00
[OPTIMIZER] Thread local reduction optimization (#2542)
Co-authored-by: Phil Tillet <phil@openai.com>
This commit is contained in:
@@ -32,6 +32,8 @@ std::unique_ptr<Pass> createTritonGPUOptimizeDotOperandsPass();
|
||||
|
||||
std::unique_ptr<Pass> createTritonGPUOptimizeEpiloguePass();
|
||||
|
||||
/// Factory for the thread-locality optimization pass; returns an owning
/// pointer to the created Pass. This is the function named as the
/// `constructor` of the `tritongpu-optimize-thread-locality` pass definition.
std::unique_ptr<Pass> createTritonGPUOptimizeThreadLocalityPass();
|
||||
|
||||
/// Generate the code for registering passes.
|
||||
#define GEN_PASS_REGISTRATION
|
||||
#include "triton/Dialect/TritonGPU/Transforms/Passes.h.inc"
|
||||
|
||||
@@ -124,6 +124,19 @@ def TritonGPUOptimizeEpilogue : Pass<"tritongpu-optimize-epilogue", "mlir::Modul
|
||||
|
||||
}
|
||||
|
||||
// Declarative definition of the pass registered under the argument
// `tritongpu-optimize-thread-locality`, declared against "mlir::ModuleOp".
def TritonGPUOptimizeThreadLocality : Pass<"tritongpu-optimize-thread-locality", "mlir::ModuleOp"> {
|
||||
// One-line summary of the pass.
let summary = "Reduce the cost of synchronization between threads in an SM";
|
||||
|
||||
// Longer description: currently the only optimization described is keeping
// a loop-yielded reduction thread-local until the loop has finished.
let description = [{
|
||||
Today, this optimizes reduction yielded by loop to be thread-local until after the loop completes.
|
||||
}];
|
||||
|
||||
// C++ expression used to construct the pass; names the factory function
// createTritonGPUOptimizeThreadLocalityPass() in namespace mlir.
let constructor = "mlir::createTritonGPUOptimizeThreadLocalityPass()";
|
||||
|
||||
// Dialects this pass declares as dependencies: TritonGPU and Triton.
let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect",
|
||||
"mlir::triton::TritonDialect"];
|
||||
}
|
||||
|
||||
def TritonGPUReorderInstructions: Pass<"tritongpu-reorder-instructions", "mlir::ModuleOp"> {
|
||||
let summary = "Reorder instructions";
|
||||
|
||||
|
||||
@@ -111,6 +111,9 @@ bool isExpensiveLoadOrStore(Operation *op);
|
||||
|
||||
bool canFoldIntoConversion(Operation *op, Attribute targetEncoding);
|
||||
|
||||
// Declaration only; the definition is not visible in this view.
// Takes a builder (`rewriter`), an existing scf::ForOp (`loop`) and a
// ValueRange of iter operands (`newIterOperands`), and returns an scf::ForOp.
// NOTE(review): presumably builds a replacement loop whose signature carries
// the new iter operands -- confirm against the definition, including whether
// the original `loop` is erased or left for the caller to clean up.
scf::ForOp replaceForOpWithNewSignature(OpBuilder &rewriter, scf::ForOp loop,
|
||||
ValueRange newIterOperands);
|
||||
|
||||
Operation *cloneWithInferType(mlir::OpBuilder &rewriter, Operation *op,
|
||||
IRMapping &mapping);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user