From e2e6df322efe9029f5c8b9b34ca680c3b0c6ab11 Mon Sep 17 00:00:00 2001 From: Andi Drebes Date: Wed, 7 Dec 2022 10:26:29 +0100 Subject: [PATCH] feat(compiler): Add support for full unrolling of loops with SDFG-convertible ops This adds a new option `--unroll-loops-with-sdfg-convertible-ops`, which causes loops containing SDFG-convertible operations to be fully unrolled upon the extraction of SDFG-operations using the `--emit-sdfg-ops` switch. This avoids constant roundtrips between an SDFG-capable accelerator and the host during execution of a loop. The option is limited to `scf.for` loops with static bounds and a static step size. Since full unrolling of loops with large bounds results in a large number of operations, the option is disabled by default. --- .../Conversion/ExtractSDFGOps/Pass.h | 3 +- .../concretelang/Support/CompilerEngine.h | 6 +- .../include/concretelang/Support/Pipeline.h | 7 +- .../Conversion/ExtractSDFGOps/CMakeLists.txt | 1 + .../ExtractSDFGOps/ExtractSDFGOps.cpp | 60 +++++++++++++++- compiler/lib/Support/CompilerEngine.cpp | 5 +- compiler/lib/Support/Pipeline.cpp | 15 ++-- compiler/src/main.cpp | 8 +++ .../check_tests/Dialect/SDFG/unrolling.mlir | 72 +++++++++++++++++++ 9 files changed, 160 insertions(+), 17 deletions(-) create mode 100644 compiler/tests/check_tests/Dialect/SDFG/unrolling.mlir diff --git a/compiler/include/concretelang/Conversion/ExtractSDFGOps/Pass.h b/compiler/include/concretelang/Conversion/ExtractSDFGOps/Pass.h index ec61eed94..08fe7e4a7 100644 --- a/compiler/include/concretelang/Conversion/ExtractSDFGOps/Pass.h +++ b/compiler/include/concretelang/Conversion/ExtractSDFGOps/Pass.h @@ -11,7 +11,8 @@ namespace mlir { namespace concretelang { -std::unique_ptr> createExtractSDFGOpsPass(); +std::unique_ptr> +createExtractSDFGOpsPass(bool unroll); } // namespace concretelang } // namespace mlir diff --git a/compiler/include/concretelang/Support/CompilerEngine.h b/compiler/include/concretelang/Support/CompilerEngine.h index 
c8d0eda9a..fe5da4756 100644 --- a/compiler/include/concretelang/Support/CompilerEngine.h +++ b/compiler/include/concretelang/Support/CompilerEngine.h @@ -60,6 +60,7 @@ struct CompilationOptions { bool loopParallelize; bool batchConcreteOps; bool emitSDFGOps; + bool unrollLoopsWithSDFGConvertibleOps; bool dataflowParallelize; bool optimizeConcrete; /// use GPU during execution by generating GPU operations if possible @@ -73,8 +74,9 @@ struct CompilationOptions { CompilationOptions() : v0FHEConstraints(llvm::None), verifyDiagnostics(false), autoParallelize(false), loopParallelize(false), batchConcreteOps(false), - emitSDFGOps(false), dataflowParallelize(false), optimizeConcrete(true), - emitGPUOps(false), clientParametersFuncName(llvm::None), + emitSDFGOps(false), unrollLoopsWithSDFGConvertibleOps(false), + dataflowParallelize(false), optimizeConcrete(true), emitGPUOps(false), + clientParametersFuncName(llvm::None), optimizerConfig(optimizer::DEFAULT_CONFIG){}; CompilationOptions(std::string funcname) : CompilationOptions() { diff --git a/compiler/include/concretelang/Support/Pipeline.h b/compiler/include/concretelang/Support/Pipeline.h index 063fff10e..68aa73e1b 100644 --- a/compiler/include/concretelang/Support/Pipeline.h +++ b/compiler/include/concretelang/Support/Pipeline.h @@ -58,9 +58,10 @@ mlir::LogicalResult optimizeConcrete(mlir::MLIRContext &context, mlir::ModuleOp &module, std::function enablePass); -mlir::LogicalResult -extractSDFGOps(mlir::MLIRContext &context, mlir::ModuleOp &module, - std::function enablePass); +mlir::LogicalResult extractSDFGOps(mlir::MLIRContext &context, + mlir::ModuleOp &module, + std::function enablePass, + bool unrollLoops); mlir::LogicalResult lowerBConcreteToStd(mlir::MLIRContext &context, mlir::ModuleOp &module, diff --git a/compiler/lib/Conversion/ExtractSDFGOps/CMakeLists.txt b/compiler/lib/Conversion/ExtractSDFGOps/CMakeLists.txt index fce834f6a..90d965db1 100644 --- a/compiler/lib/Conversion/ExtractSDFGOps/CMakeLists.txt 
+++ b/compiler/lib/Conversion/ExtractSDFGOps/CMakeLists.txt @@ -12,6 +12,7 @@ add_mlir_dialect_library( ConcretelangSDFGInterfaces PUBLIC MLIRIR + MLIRSCFUtils MLIRTransforms) target_link_libraries(ExtractSDFGOps PUBLIC MLIRIR) diff --git a/compiler/lib/Conversion/ExtractSDFGOps/ExtractSDFGOps.cpp b/compiler/lib/Conversion/ExtractSDFGOps/ExtractSDFGOps.cpp index 817adb622..823a0ae06 100644 --- a/compiler/lib/Conversion/ExtractSDFGOps/ExtractSDFGOps.cpp +++ b/compiler/lib/Conversion/ExtractSDFGOps/ExtractSDFGOps.cpp @@ -9,6 +9,7 @@ #include "concretelang/Dialect/SDFG/IR/SDFGTypes.h" #include "concretelang/Dialect/SDFG/Interfaces/SDFGConvertibleInterface.h" #include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/SCF/Utils/Utils.h" #include "mlir/IR/ImplicitLocOpBuilder.h" #include "mlir/IR/Visitors.h" #include "mlir/Pass/Pass.h" @@ -18,6 +19,8 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/Casting.h" +#include +#include namespace SDFG = mlir::concretelang::SDFG; @@ -34,6 +37,51 @@ SDFG::MakeStream makeStream(mlir::ImplicitLocOpBuilder &builder, return builder.create(streamType, dfg, name, kind); } +/// Fully unrolls every scf.for loop that encloses (at any nesting depth) an +/// SDFG-convertible operation and whose bounds and step are static constants. 
+void unrollLoopsWithSDFGConvertibleOps(mlir::func::FuncOp func) { + mlir::DenseSet unrollCandidates; + + // Collect every scf.for ancestor of each SDFG-convertible op + func.walk([&](SDFG::SDFGConvertibleOpInterface convertible) { + for (mlir::Operation *parent = convertible->getParentOp(); parent; + parent = parent->getParentOp()) { + if (mlir::scf::ForOp forOp = llvm::dyn_cast(parent)) { + unrollCandidates.insert(forOp); + } + } + }); + + // Fully unroll each candidate whose bounds and step are constants + for (mlir::scf::ForOp forOp : unrollCandidates) { + mlir::arith::ConstantIndexOp lb = + forOp.getLowerBound().getDefiningOp(); + mlir::arith::ConstantIndexOp ub = + forOp.getUpperBound().getDefiningOp(); + mlir::arith::ConstantIndexOp step = + forOp.getStep().getDefiningOp(); + + if (!lb || !ub || !step) + continue; + + int64_t ilb = lb.value(); + int64_t iub = ub.value(); + int64_t istep = step.value(); + + // Only handle non-negative bounds and a strictly positive step + if (ilb < 0 || iub < 0 || istep <= 0) + continue; + + int64_t unrollFactor = ((iub - ilb) + (istep - 1)) / istep; + // Skip zero-trip loops; a negative factor must not reach the cast + if (unrollFactor <= 0) + continue; + + if (mlir::loopUnrollByFactor(forOp, (uint64_t)unrollFactor).failed()) + continue; + } +} + StreamMappingKind determineStreamMappingKind(mlir::Value v) { // Determine stream type for operands: // @@ -90,11 +138,16 @@ void setInsertionPointAfterValueOrRestore(mlir::OpBuilder &builder, } struct ExtractSDFGOpsPass : public ExtractSDFGOpsBase { + bool unroll; - ExtractSDFGOpsPass() {} + ExtractSDFGOpsPass(bool unroll) : unroll(unroll) {} void runOnOperation() override { mlir::func::FuncOp func = getOperation(); + + if (unroll) + unrollLoopsWithSDFGConvertibleOps(func); + mlir::IRRewriter rewriter(func.getContext()); mlir::DenseMap processOutMapping; @@ -205,8 +258,9 @@ struct ExtractSDFGOpsPass : public ExtractSDFGOpsBase { namespace mlir { namespace concretelang { -std::unique_ptr> 
createExtractSDFGOpsPass() { - return std::make_unique(); +std::unique_ptr> +createExtractSDFGOpsPass(bool 
unroll) { + return std::make_unique(unroll); } } // namespace concretelang } // namespace mlir diff --git a/compiler/lib/Support/CompilerEngine.cpp b/compiler/lib/Support/CompilerEngine.cpp index 5165c3e34..1ba0ed4fa 100644 --- a/compiler/lib/Support/CompilerEngine.cpp +++ b/compiler/lib/Support/CompilerEngine.cpp @@ -399,8 +399,9 @@ CompilerEngine::compile(llvm::SourceMgr &sm, Target target, OptionalLib lib) { // Extract SDFG data flow graph from BConcrete representation if (options.emitSDFGOps) { - if (mlir::concretelang::pipeline::extractSDFGOps(mlirContext, module, - enablePass) + if (mlir::concretelang::pipeline::extractSDFGOps( + mlirContext, module, enablePass, + options.unrollLoopsWithSDFGConvertibleOps) .failed()) { return errorDiag( "Extraction of SDFG operations from BConcrete representation failed"); diff --git a/compiler/lib/Support/Pipeline.cpp b/compiler/lib/Support/Pipeline.cpp index 67421c482..bf7c13112 100644 --- a/compiler/lib/Support/Pipeline.cpp +++ b/compiler/lib/Support/Pipeline.cpp @@ -274,14 +274,17 @@ lowerConcreteToBConcrete(mlir::MLIRContext &context, mlir::ModuleOp &module, return pm.run(module.getOperation()); } -mlir::LogicalResult -extractSDFGOps(mlir::MLIRContext &context, mlir::ModuleOp &module, - std::function enablePass) { +mlir::LogicalResult extractSDFGOps(mlir::MLIRContext &context, + mlir::ModuleOp &module, + std::function enablePass, + bool unroll) { mlir::PassManager pm(&context); pipelinePrinting("extract SDFG ops from BConcrete", pm, context); - addPotentiallyNestedPass(pm, mlir::concretelang::createExtractSDFGOpsPass(), - enablePass); - return pm.run(module.getOperation()); + addPotentiallyNestedPass( + pm, mlir::concretelang::createExtractSDFGOpsPass(unroll), enablePass); + LogicalResult res = pm.run(module.getOperation()); + + return res; } mlir::LogicalResult diff --git a/compiler/src/main.cpp b/compiler/src/main.cpp index 5b4919db3..11d29b554 100644 --- a/compiler/src/main.cpp +++ b/compiler/src/main.cpp @@ -183,6 
+183,12 @@ llvm::cl::opt emitSDFGOps( " graphs and emit them."), llvm::cl::init(false)); +llvm::cl::opt unrollLoopsWithSDFGConvertibleOps( + "unroll-loops-with-sdfg-convertible-ops", + llvm::cl::desc("Causes loops containing SDFG-convertible operations to be " + "fully unrolled."), + llvm::cl::init(false)); + llvm::cl::opt dataflowParallelize( "parallelize-dataflow", llvm::cl::desc( @@ -316,6 +322,8 @@ cmdlineCompilationOptions() { options.dataflowParallelize = cmdline::dataflowParallelize; options.batchConcreteOps = cmdline::batchConcreteOps; options.emitSDFGOps = cmdline::emitSDFGOps; + options.unrollLoopsWithSDFGConvertibleOps = + cmdline::unrollLoopsWithSDFGConvertibleOps; options.optimizeConcrete = cmdline::optimizeConcrete; options.emitGPUOps = cmdline::emitGPUOps; diff --git a/compiler/tests/check_tests/Dialect/SDFG/unrolling.mlir b/compiler/tests/check_tests/Dialect/SDFG/unrolling.mlir new file mode 100644 index 000000000..e1046a1c8 --- /dev/null +++ b/compiler/tests/check_tests/Dialect/SDFG/unrolling.mlir @@ -0,0 +1,72 @@ +// RUN: concretecompiler --action=dump-sdfg --emit-sdfg-ops --unroll-loops-with-sdfg-convertible-ops --split-input-file %s 2>&1| FileCheck %s + +// CHECK: func.func @main(%[[Varg0:.*]]: tensor<4x513xi64>, %[[Varg1:.*]]: tensor<4x513xi64>) -> tensor<4x513xi64> { +// CHECK-NEXT: %[[V0:.*]] = "SDFG.init"() : () -> !SDFG.dfg +// CHECK-NEXT: %[[V1:.*]] = "SDFG.make_stream"(%[[V0]]) {name = "stream0", type = #SDFG.stream_kind} : (!SDFG.dfg) -> !SDFG.stream> +// CHECK-NEXT: %[[V2:.*]] = "SDFG.make_stream"(%[[V0]]) {name = "stream1", type = #SDFG.stream_kind} : (!SDFG.dfg) -> !SDFG.stream> +// CHECK-NEXT: %[[V3:.*]] = "SDFG.make_stream"(%[[V0]]) {name = "stream2", type = #SDFG.stream_kind} : (!SDFG.dfg) -> !SDFG.stream> +// CHECK-NEXT: "SDFG.make_process"(%[[V0]], %[[V2]], %[[V3]], %[[V1]]) {type = #SDFG.process_kind} : (!SDFG.dfg, !SDFG.stream>, !SDFG.stream>, !SDFG.stream>) -> () +// CHECK-NEXT: %[[V4:.*]] = "SDFG.make_stream"(%[[V0]]) {name = 
"stream3", type = #SDFG.stream_kind} : (!SDFG.dfg) -> !SDFG.stream> +// CHECK-NEXT: %[[V5:.*]] = "SDFG.make_stream"(%[[V0]]) {name = "stream4", type = #SDFG.stream_kind} : (!SDFG.dfg) -> !SDFG.stream> +// CHECK-NEXT: %[[V6:.*]] = "SDFG.make_stream"(%[[V0]]) {name = "stream5", type = #SDFG.stream_kind} : (!SDFG.dfg) -> !SDFG.stream> +// CHECK-NEXT: "SDFG.make_process"(%[[V0]], %[[V5]], %[[V6]], %[[V4]]) {type = #SDFG.process_kind} : (!SDFG.dfg, !SDFG.stream>, !SDFG.stream>, !SDFG.stream>) -> () +// CHECK-NEXT: %[[V7:.*]] = "SDFG.make_stream"(%[[V0]]) {name = "stream6", type = #SDFG.stream_kind} : (!SDFG.dfg) -> !SDFG.stream> +// CHECK-NEXT: %[[V8:.*]] = "SDFG.make_stream"(%[[V0]]) {name = "stream7", type = #SDFG.stream_kind} : (!SDFG.dfg) -> !SDFG.stream> +// CHECK-NEXT: %[[V9:.*]] = "SDFG.make_stream"(%[[V0]]) {name = "stream8", type = #SDFG.stream_kind} : (!SDFG.dfg) -> !SDFG.stream> +// CHECK-NEXT: "SDFG.make_process"(%[[V0]], %[[V8]], %[[V9]], %[[V7]]) {type = #SDFG.process_kind} : (!SDFG.dfg, !SDFG.stream>, !SDFG.stream>, !SDFG.stream>) -> () +// CHECK-NEXT: %[[V10:.*]] = "SDFG.make_stream"(%[[V0]]) {name = "stream9", type = #SDFG.stream_kind} : (!SDFG.dfg) -> !SDFG.stream> +// CHECK-NEXT: %[[V11:.*]] = "SDFG.make_stream"(%[[V0]]) {name = "stream10", type = #SDFG.stream_kind} : (!SDFG.dfg) -> !SDFG.stream> +// CHECK-NEXT: %[[V12:.*]] = "SDFG.make_stream"(%[[V0]]) {name = "stream11", type = #SDFG.stream_kind} : (!SDFG.dfg) -> !SDFG.stream> +// CHECK-NEXT: "SDFG.make_process"(%[[V0]], %[[V11]], %[[V12]], %[[V10]]) {type = #SDFG.process_kind} : (!SDFG.dfg, !SDFG.stream>, !SDFG.stream>, !SDFG.stream>) -> () +// CHECK-NEXT: "SDFG.start"(%[[V0]]) : (!SDFG.dfg) -> () +// CHECK-NEXT: %[[Vc0:.*]] = arith.constant 0 : index +// CHECK-NEXT: %[[Vc1:.*]] = arith.constant 1 : index +// CHECK-NEXT: %[[V13:.*]] = bufferization.alloc_tensor() : tensor<4x513xi64> +// CHECK-NEXT: %[[V14:.*]] = tensor.extract_slice %[[Varg0]]{{\[}}%[[Vc0]], 0{{\] \[1, 513\] \[1, 1\]}} : 
tensor<4x513xi64> to tensor<1x513xi64> +// CHECK-NEXT: %[[V15:.*]] = tensor.collapse_shape %[[V14]] {{\[\[0, 1\]\]}} : tensor<1x513xi64> into tensor<513xi64> +// CHECK-NEXT: "SDFG.put"(%[[V2]], %[[V15]]) : (!SDFG.stream>, tensor<513xi64>) -> () +// CHECK-NEXT: %[[V16:.*]] = tensor.extract_slice %[[Varg1]]{{\[}}%[[Vc0]], 0{{\] \[1, 513\] \[1, 1\]}} : tensor<4x513xi64> to tensor<1x513xi64> +// CHECK-NEXT: %[[V17:.*]] = tensor.collapse_shape %[[V16]] {{\[\[0, 1\]\]}} : tensor<1x513xi64> into tensor<513xi64> +// CHECK-NEXT: "SDFG.put"(%[[V3]], %[[V17]]) : (!SDFG.stream>, tensor<513xi64>) -> () +// CHECK-NEXT: %[[V18:.*]] = "SDFG.get"(%[[V1]]) : (!SDFG.stream>) -> tensor<513xi64> +// CHECK-NEXT: %[[V19:.*]] = tensor.insert_slice %[[V18]] into %[[V13]]{{\[}}%[[Vc0]], 0{{\] \[1, 513\] \[1, 1\]}} : tensor<513xi64> into tensor<4x513xi64> +// CHECK-NEXT: %[[Vc1_0:.*]] = arith.constant 1 : index +// CHECK-NEXT: %[[V20:.*]] = arith.muli %[[Vc1]], %[[Vc1_0]] : index +// CHECK-NEXT: %[[V21:.*]] = arith.addi %[[Vc0]], %[[V20]] : index +// CHECK-NEXT: %[[V22:.*]] = tensor.extract_slice %[[Varg0]]{{\[}}%[[V21]], 0{{\] \[1, 513\] \[1, 1\]}} : tensor<4x513xi64> to tensor<1x513xi64> +// CHECK-NEXT: %[[V23:.*]] = tensor.collapse_shape %[[V22]] {{\[\[0, 1\]\]}} : tensor<1x513xi64> into tensor<513xi64> +// CHECK-NEXT: "SDFG.put"(%[[V5]], %[[V23]]) : (!SDFG.stream>, tensor<513xi64>) -> () +// CHECK-NEXT: %[[V24:.*]] = tensor.extract_slice %[[Varg1]]{{\[}}%[[V21]], 0{{\] \[1, 513\] \[1, 1\]}} : tensor<4x513xi64> to tensor<1x513xi64> +// CHECK-NEXT: %[[V25:.*]] = tensor.collapse_shape %[[V24]] {{\[\[0, 1\]\]}} : tensor<1x513xi64> into tensor<513xi64> +// CHECK-NEXT: "SDFG.put"(%[[V6]], %[[V25]]) : (!SDFG.stream>, tensor<513xi64>) -> () +// CHECK-NEXT: %[[V26:.*]] = "SDFG.get"(%[[V4]]) : (!SDFG.stream>) -> tensor<513xi64> +// CHECK-NEXT: %[[V27:.*]] = tensor.insert_slice %[[V26]] into %[[V19]]{{\[}}%[[V21]], 0{{\] \[1, 513\] \[1, 1\]}} : tensor<513xi64> into tensor<4x513xi64> +// CHECK-NEXT: 
%[[Vc2:.*]] = arith.constant 2 : index +// CHECK-NEXT: %[[V28:.*]] = arith.muli %[[Vc1]], %[[Vc2]] : index +// CHECK-NEXT: %[[V29:.*]] = arith.addi %[[Vc0]], %[[V28]] : index +// CHECK-NEXT: %[[V30:.*]] = tensor.extract_slice %[[Varg0]]{{\[}}%[[V29]], 0{{\] \[1, 513\] \[1, 1\]}} : tensor<4x513xi64> to tensor<1x513xi64> +// CHECK-NEXT: %[[V31:.*]] = tensor.collapse_shape %[[V30]] {{\[\[0, 1\]\]}} : tensor<1x513xi64> into tensor<513xi64> +// CHECK-NEXT: "SDFG.put"(%[[V8]], %[[V31]]) : (!SDFG.stream>, tensor<513xi64>) -> () +// CHECK-NEXT: %[[V32:.*]] = tensor.extract_slice %[[Varg1]]{{\[}}%[[V29]], 0{{\] \[1, 513\] \[1, 1\]}} : tensor<4x513xi64> to tensor<1x513xi64> +// CHECK-NEXT: %[[V33:.*]] = tensor.collapse_shape %[[V32]] {{\[\[0, 1\]\]}} : tensor<1x513xi64> into tensor<513xi64> +// CHECK-NEXT: "SDFG.put"(%[[V9]], %[[V33]]) : (!SDFG.stream>, tensor<513xi64>) -> () +// CHECK-NEXT: %[[V34:.*]] = "SDFG.get"(%[[V7]]) : (!SDFG.stream>) -> tensor<513xi64> +// CHECK-NEXT: %[[V35:.*]] = tensor.insert_slice %[[V34]] into %[[V27]]{{\[}}%[[V29]], 0{{\] \[1, 513\] \[1, 1\]}} : tensor<513xi64> into tensor<4x513xi64> +// CHECK-NEXT: %[[Vc3:.*]] = arith.constant 3 : index +// CHECK-NEXT: %[[V36:.*]] = arith.muli %[[Vc1]], %[[Vc3]] : index +// CHECK-NEXT: %[[V37:.*]] = arith.addi %[[Vc0]], %[[V36]] : index +// CHECK-NEXT: %[[V38:.*]] = tensor.extract_slice %[[Varg0]]{{\[}}%[[V37]], 0{{\] \[1, 513\] \[1, 1\]}} : tensor<4x513xi64> to tensor<1x513xi64> +// CHECK-NEXT: %[[V39:.*]] = tensor.collapse_shape %[[V38]] {{\[\[0, 1\]\]}} : tensor<1x513xi64> into tensor<513xi64> +// CHECK-NEXT: "SDFG.put"(%[[V11]], %[[V39]]) : (!SDFG.stream>, tensor<513xi64>) -> () +// CHECK-NEXT: %[[V40:.*]] = tensor.extract_slice %[[Varg1]]{{\[}}%[[V37]], 0{{\] \[1, 513\] \[1, 1\]}} : tensor<4x513xi64> to tensor<1x513xi64> +// CHECK-NEXT: %[[V41:.*]] = tensor.collapse_shape %[[V40]] {{\[\[0, 1\]\]}} : tensor<1x513xi64> into tensor<513xi64> +// CHECK-NEXT: "SDFG.put"(%[[V12]], %[[V41]]) : (!SDFG.stream>, 
tensor<513xi64>) -> () +// CHECK-NEXT: %[[V42:.*]] = "SDFG.get"(%[[V10]]) : (!SDFG.stream>) -> tensor<513xi64> +// CHECK-NEXT: %[[V43:.*]] = tensor.insert_slice %[[V42]] into %[[V35]]{{\[}}%[[V37]], 0{{\] \[1, 513\] \[1, 1\]}} : tensor<513xi64> into tensor<4x513xi64> +// CHECK-NEXT: "SDFG.shutdown"(%[[V0]]) : (!SDFG.dfg) -> () +// CHECK-NEXT: return %[[V43]] : tensor<4x513xi64> +// CHECK-NEXT: } +func.func @main(%a0: tensor<4x!FHE.eint<6>>, %a1: tensor<4x!FHE.eint<6>>) -> tensor<4x!FHE.eint<6>> { + %res = "FHELinalg.add_eint"(%a0, %a1) : (tensor<4x!FHE.eint<6>>, tensor<4x!FHE.eint<6>>) -> tensor<4x!FHE.eint<6>> + return %res : tensor<4x!FHE.eint<6>> +}