[OPTIMIZER] Thread local reduction optimization (#2542)

Co-authored-by: Phil Tillet <phil@openai.com>
This commit is contained in:
Zahi Moudallal
2023-10-31 16:13:36 -07:00
committed by GitHub
parent 258399c114
commit 3650213218
12 changed files with 986 additions and 31 deletions

View File

@@ -32,6 +32,8 @@ std::unique_ptr<Pass> createTritonGPUOptimizeDotOperandsPass();
/// Factory returning a pass as a std::unique_ptr<Pass>.
/// NOTE(review): presumably the constructor for the
/// `tritongpu-optimize-epilogue` pass — confirm against its pass definition.
std::unique_ptr<Pass> createTritonGPUOptimizeEpiloguePass();
/// Factory returning a pass as a std::unique_ptr<Pass>. This is the function
/// named as the `constructor` of the `tritongpu-optimize-thread-locality` pass
/// in the TableGen pass definitions.
std::unique_ptr<Pass> createTritonGPUOptimizeThreadLocalityPass();
/// Generate the code for registering passes.
#define GEN_PASS_REGISTRATION
#include "triton/Dialect/TritonGPU/Transforms/Passes.h.inc"

View File

@@ -124,6 +124,19 @@ def TritonGPUOptimizeEpilogue : Pass<"tritongpu-optimize-epilogue", "mlir::Modul
}
// Declarative specification of the `tritongpu-optimize-thread-locality` pass.
// The second template argument names the operation type the pass is declared
// on (`mlir::ModuleOp`).
def TritonGPUOptimizeThreadLocality : Pass<"tritongpu-optimize-thread-locality", "mlir::ModuleOp"> {
let summary = "Reduce the cost of synchronization between threads in an SM";
let description = [{
Today, this optimizes reduction yielded by loop to be thread-local until after the loop completes.
}];
// C++ expression used to create an instance of the pass.
let constructor = "mlir::createTritonGPUOptimizeThreadLocalityPass()";
// Dialects the pass declares a dependency on (per the MLIR pass
// infrastructure: dialects whose entities the pass may introduce).
let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect",
"mlir::triton::TritonDialect"];
}
def TritonGPUReorderInstructions: Pass<"tritongpu-reorder-instructions", "mlir::ModuleOp"> {
let summary = "Reorder instructions";

View File

@@ -111,6 +111,9 @@ bool isExpensiveLoadOrStore(Operation *op);
/// Predicate over an operation and a target encoding attribute.
/// NOTE(review): presumably reports whether `op` can be folded into a layout
/// conversion to `targetEncoding` — confirm against the definition.
bool canFoldIntoConversion(Operation *op, Attribute targetEncoding);
/// Takes a builder, an existing `scf::ForOp` and a range of values, and
/// returns an `scf::ForOp`.
/// NOTE(review): presumably builds a replacement for `loop` whose iteration
/// operands are extended with `newIterOperands`; whether the original loop is
/// erased is not visible here — confirm against the definition.
scf::ForOp replaceForOpWithNewSignature(OpBuilder &rewriter, scf::ForOp loop,
ValueRange newIterOperands);
/// Takes a builder, an operation and an IR mapping, and returns an operation
/// pointer.
/// NOTE(review): presumably clones `op` using `mapping` to remap operands and
/// re-infers the clone's result type(s) — confirm against the definition.
Operation *cloneWithInferType(mlir::OpBuilder &rewriter, Operation *op,
IRMapping &mapping);