diff --git a/compiler/CMakeLists.txt b/compiler/CMakeLists.txt
index 3d5202090..c7de48b93 100644
--- a/compiler/CMakeLists.txt
+++ b/compiler/CMakeLists.txt
@@ -76,6 +76,7 @@ endif()
 #-------------------------------------------------------------------------------
 
 option(CONCRETELANG_PARALLEL_EXECUTION_ENABLED "Enables parallel execution for ConcreteLang." ON)
+option(CONCRETELANG_TIMING_ENABLED "Enables execution timing." ON)
 
 if(CONCRETELANG_PARALLEL_EXECUTION_ENABLED)
   message(STATUS "ConcreteLang parallel execution enabled.")
@@ -92,6 +93,15 @@ else()
   message(STATUS "ConcreteLang parallel execution disabled.")
 endif()
 
+if(CONCRETELANG_TIMING_ENABLED)
+  message(STATUS "ConcreteLang execution timing enabled.")
+  add_compile_options(
+    -DCONCRETELANG_TIMING_ENABLED
+  )
+else()
+  message(STATUS "ConcreteLang execution timing disabled.")
+endif()
+
 #-------------------------------------------------------------------------------
 # Unit tests
 #-------------------------------------------------------------------------------
diff --git a/compiler/Makefile b/compiler/Makefile
index 537b2a646..6107f727e 100644
--- a/compiler/Makefile
+++ b/compiler/Makefile
@@ -2,6 +2,7 @@ BUILD_DIR=./build
 Python3_EXECUTABLE?=
 BINDINGS_PYTHON_ENABLED=ON
 PARALLEL_EXECUTION_ENABLED=OFF
+TIMING_ENABLED=OFF
 CC_COMPILER=
 CXX_COMPILER=
 
@@ -58,6 +59,7 @@ $(BUILD_DIR)/configured.stamp:
 	-DMLIR_ENABLE_BINDINGS_PYTHON=$(BINDINGS_PYTHON_ENABLED) \
 	-DCONCRETELANG_BINDINGS_PYTHON_ENABLED=$(BINDINGS_PYTHON_ENABLED) \
 	-DCONCRETELANG_PARALLEL_EXECUTION_ENABLED=$(PARALLEL_EXECUTION_ENABLED) \
+	-DCONCRETELANG_TIMING_ENABLED=$(TIMING_ENABLED) \
 	-DCONCRETE_FFI_RELEASE=${CONCRETE_PROJECT}/target/release \
 	-DHPX_DIR=${HPX_INSTALL_DIR}/lib/cmake/HPX \
 	-DLLVM_EXTERNAL_PROJECTS=concretelang \
diff --git a/compiler/include/concretelang/Runtime/runtime_api.h b/compiler/include/concretelang/Runtime/runtime_api.h
index c8ccc2c63..a3ed68a65 100644
--- a/compiler/include/concretelang/Runtime/runtime_api.h
+++ b/compiler/include/concretelang/Runtime/runtime_api.h
@@ -27,8 +27,8 @@ void _dfr_deallocate_future_data(void *);
 
 /* Initialisation & termination. */
 void _dfr_start_c(void *);
-void _dfr_start();
-void _dfr_stop();
+void _dfr_start(int);
+void _dfr_stop(int);
 void _dfr_terminate();
 }
 
diff --git a/compiler/include/concretelang/Runtime/time_util.h b/compiler/include/concretelang/Runtime/time_util.h
new file mode 100644
index 000000000..ddb84172a
--- /dev/null
+++ b/compiler/include/concretelang/Runtime/time_util.h
@@ -0,0 +1,145 @@
+// Part of the Concrete Compiler Project, under the BSD3 License with Zama
+// Exceptions. See
+// https://github.com/zama-ai/concrete-compiler-internal/blob/main/LICENSE.txt
+// for license information.
+
+#ifndef CONCRETELANG_DFR_TIME_UTIL_H
+#define CONCRETELANG_DFR_TIME_UTIL_H
+
+// Wall-clock instrumentation helpers for the runtime.
+//
+//   BEGIN_TIME(p)   stores the current TIME_UTIL_CLOCK reading in *p.
+//   END_TIME(p, m)  replaces *p with the time elapsed since BEGIN_TIME(p)
+//                   and prints it on std::cout under the label m.
+//
+// Both expand to no-ops unless CONCRETELANG_TIMING_ENABLED is set.
+
+// Included unconditionally: callers declare struct timespec timers even
+// when timing is compiled out.
+#include <time.h>
+
+#if CONCRETELANG_TIMING_ENABLED
+
+#include <assert.h>
+#include <stdio.h>
+
+#include <iostream>
+#include <string>
+
+#include "concretelang/Runtime/DFRuntime.hpp"
+
+#define TIME_UTIL_CLOCK CLOCK_MONOTONIC
+
+static inline int timespec_diff(struct timespec *, const struct timespec *,
+                                const struct timespec *);
+
+// Read clock _id into *_tv. The call is kept outside assert() on purpose:
+// under NDEBUG the operand of assert() is never evaluated, which would
+// leave the timestamp unset.
+static inline void time_util_read_clock(clockid_t _id, struct timespec *_tv) {
+  int _rc = clock_gettime(_id, _tv);
+  assert(_rc == 0 && "clock_gettime failed");
+  (void)_rc;
+}
+
+// Replace *_start, a BEGIN_TIME timestamp, with the time elapsed since then.
+static inline void time_util_elapsed(struct timespec *_start) {
+  struct timespec _now;
+  time_util_read_clock(TIME_UTIL_CLOCK, &_now);
+  int _negative = timespec_diff(_start, &_now, _start);
+  assert(_negative == 0 && "elapsed time must not be negative");
+  (void)_negative;
+}
+
+// Render *_tv as "<sec>.<nsec>" with the nanoseconds zero-padded to nine
+// digits: 1s + 5ms must read 1.005000000, not 1.5000000.
+static inline std::string time_util_format(const struct timespec *_tv) {
+  char _buf[48];
+  snprintf(_buf, sizeof(_buf), "%lld.%09ld",
+           static_cast<long long>(_tv->tv_sec),
+           static_cast<long>(_tv->tv_nsec));
+  return std::string(_buf);
+}
+
+#define BEGIN_TIME(p) \
+  do { \
+    time_util_read_clock(TIME_UTIL_CLOCK, (p)); \
+  } while (0)
+
+#if CONCRETELANG_PARALLEL_EXECUTION_ENABLED
+#define END_TIME(p, m) \
+  do { \
+    time_util_elapsed((p)); \
+    std::cout << "[NODE \t" << _dfr_debug_get_node_id() << "] \t" << (m) \
+              << " time : \t" << time_util_format((p)) << " seconds.\n" \
+              << std::flush; \
+  } while (0)
+#else
+#define END_TIME(p, m) \
+  do { \
+    time_util_elapsed((p)); \
+    std::cout << (m) << " time : \t" << time_util_format((p)) \
+              << " seconds.\n" \
+              << std::flush; \
+  } while (0)
+#endif
+
+// CPU time consumed so far by the calling thread, in seconds.
+static inline double get_thread_cpu_time(void) {
+  struct timespec _tv;
+  double _t;
+
+  time_util_read_clock(CLOCK_THREAD_CPUTIME_ID, &_tv);
+  _t = _tv.tv_sec;
+  _t += _tv.tv_nsec * 1e-9;
+  return _t;
+}
+
+// Store *_px - *_py in *_result. Both inputs are copied before _result is
+// written, so _result may alias either of them. Returns 1 when the
+// difference is negative and 0 otherwise.
+static inline int timespec_diff(struct timespec *_result,
+                                const struct timespec *_px,
+                                const struct timespec *_py) {
+  struct timespec _x, _y;
+
+  _x = *_px;
+  _y = *_py;
+
+  /* Perform the carry for the later subtraction by updating y. */
+  if (_x.tv_nsec < _y.tv_nsec) {
+    long _ns = (_y.tv_nsec - _x.tv_nsec) / 1000000000L + 1;
+    _y.tv_nsec -= 1000000000L * _ns;
+    _y.tv_sec += _ns;
+  }
+  if (_x.tv_nsec - _y.tv_nsec > 1000000000L) {
+    long _ns = (_x.tv_nsec - _y.tv_nsec) / 1000000000L;
+    _y.tv_nsec += 1000000000L * _ns;
+    _y.tv_sec -= _ns;
+  }
+
+  /* Compute the time remaining to wait. tv_nsec is certainly
+     positive. */
+  _result->tv_sec = _x.tv_sec - _y.tv_sec;
+  _result->tv_nsec = _x.tv_nsec - _y.tv_nsec;
+
+  /* Return 1 if result is negative. */
+  return _x.tv_sec < _y.tv_sec;
+}
+
+#else // CONCRETELANG_TIMING_ENABLED
+
+// Still reference the arguments so that timer variables do not trigger
+// unused-variable warnings when timing is compiled out.
+#define BEGIN_TIME(p) \
+  do { \
+    (void)(p); \
+  } while (0)
+#define END_TIME(p, m) \
+  do { \
+    (void)(p); \
+    (void)(m); \
+  } while (0)
+
+#endif // CONCRETELANG_TIMING_ENABLED
+#endif
diff --git a/compiler/lib/Dialect/RT/Analysis/LowerDataflowTasksToRT.cpp b/compiler/lib/Dialect/RT/Analysis/LowerDataflowTasksToRT.cpp
index e9071be0f..2b5b393de 100644
--- a/compiler/lib/Dialect/RT/Analysis/LowerDataflowTasksToRT.cpp
+++ b/compiler/lib/Dialect/RT/Analysis/LowerDataflowTasksToRT.cpp
@@ -464,41 +464,40 @@ struct LowerDataflowTasksPass
         registerWorkFunction(entryPoint, wf);
 
       // Issue _dfr_start/stop calls for this function
-      if (!workFunctions.empty()) {
-        OpBuilder builder(entryPoint.getBody());
-        builder.setInsertionPointToStart(&entryPoint.getBody().front());
+      OpBuilder builder(entryPoint.getBody());
+      builder.setInsertionPointToStart(&entryPoint.getBody().front());
+      int useDFR = (workFunctions.empty()) ? 0 : 1;
+      Value useDFRVal = builder.create(
+          entryPoint.getLoc(), builder.getI64IntegerAttr(useDFR));
 
-        if (ctxIndex >= 0) {
-          auto startFunTy =
-              (dfr::_dfr_is_root_node())
-                  ? mlir::FunctionType::get(
-                        entryPoint->getContext(),
-                        {entryPoint.getArgument(ctxIndex).getType()}, {})
-                  : mlir::FunctionType::get(entryPoint->getContext(), {}, {});
-          (void)insertForwardDeclaration(entryPoint, builder, "_dfr_start_c",
-                                         startFunTy);
-          builder.create(
-              entryPoint.getLoc(), "_dfr_start_c", mlir::TypeRange(),
-              (dfr::_dfr_is_root_node()) ? entryPoint.getArgument(ctxIndex)
-                                         : mlir::ValueRange());
-        } else {
-          auto startFunTy =
-              mlir::FunctionType::get(entryPoint->getContext(), {}, {});
-          (void)insertForwardDeclaration(entryPoint, builder, "_dfr_start",
-                                         startFunTy);
-          builder.create(entryPoint.getLoc(), "_dfr_start",
-                         mlir::TypeRange(),
-                         mlir::ValueRange());
-        }
-        builder.setInsertionPoint(entryPoint.getBody().back().getTerminator());
-        auto stopFunTy =
-            mlir::FunctionType::get(entryPoint->getContext(), {}, {});
-        (void)insertForwardDeclaration(entryPoint, builder, "_dfr_stop",
-                                       stopFunTy);
-        builder.create(entryPoint.getLoc(), "_dfr_stop",
-                       mlir::TypeRange(),
-                       mlir::ValueRange());
+      if (ctxIndex >= 0) {
+        auto startFunTy =
+            (dfr::_dfr_is_root_node())
+                ? mlir::FunctionType::get(
+                      entryPoint->getContext(),
+                      {entryPoint.getArgument(ctxIndex).getType()}, {})
+                : mlir::FunctionType::get(entryPoint->getContext(), {}, {});
+        (void)insertForwardDeclaration(entryPoint, builder, "_dfr_start_c",
+                                       startFunTy);
+        builder.create(
+            entryPoint.getLoc(), "_dfr_start_c", mlir::TypeRange(),
+            (dfr::_dfr_is_root_node()) ? entryPoint.getArgument(ctxIndex)
+                                       : mlir::ValueRange());
+      } else {
+        auto startFunTy = mlir::FunctionType::get(entryPoint->getContext(),
+                                                  {useDFRVal.getType()}, {});
+        (void)insertForwardDeclaration(entryPoint, builder, "_dfr_start",
+                                       startFunTy);
+        builder.create(entryPoint.getLoc(), "_dfr_start",
+                       mlir::TypeRange(), useDFRVal);
       }
+      builder.setInsertionPoint(entryPoint.getBody().back().getTerminator());
+      auto stopFunTy = mlir::FunctionType::get(entryPoint->getContext(),
+                                               {useDFRVal.getType()}, {});
+      (void)insertForwardDeclaration(entryPoint, builder, "_dfr_stop",
+                                     stopFunTy);
+      builder.create(entryPoint.getLoc(), "_dfr_stop",
+                     mlir::TypeRange(), useDFRVal);
     }
   }
   LowerDataflowTasksPass(bool debug) : debug(debug){};
diff --git a/compiler/lib/Runtime/DFRuntime.cpp b/compiler/lib/Runtime/DFRuntime.cpp
index 23d123c85..4bec7ea28 100644
--- a/compiler/lib/Runtime/DFRuntime.cpp
+++ b/compiler/lib/Runtime/DFRuntime.cpp
@@ -22,6 +22,7 @@
 #include "concretelang/Runtime/DFRuntime.hpp"
 #include "concretelang/Runtime/distributed_generic_task_server.hpp"
 #include "concretelang/Runtime/runtime_api.h"
+#include "concretelang/Runtime/time_util.h"
 
 namespace mlir {
 namespace concretelang {
@@ -31,6 +32,7 @@ static std::vector gcc;
 static hpx::lcos::barrier *_dfr_jit_phase_barrier;
 static hpx::lcos::barrier *_dfr_startup_barrier;
 static size_t num_nodes = 0;
+static struct timespec init_timer, broadcast_timer, compute_timer, whole_timer;
 } // namespace
 } // namespace dfr
 } // namespace concretelang
@@ -1129,72 +1131,97 @@ static inline void _dfr_start_impl(int argc, char *argv[]) {
 /* Start/stop functions to be called from within user code (or during
    JIT invocation). These serve to pause/resume the runtime
    scheduler and to clean up used resources. */
-void _dfr_start() {
-  // The first invocation will initialise the runtime. As each call to
-  // _dfr_start is matched with _dfr_stop, if this is not hte first,
-  // we need to resume the HPX runtime.
-  assert(
-      mlir::concretelang::dfr::init_guard !=
-          mlir::concretelang::dfr::terminated &&
-      "DFR runtime: attempting to start runtime after it has been terminated");
-  uint64_t expected = mlir::concretelang::dfr::uninitialised;
-  if (mlir::concretelang::dfr::init_guard.compare_exchange_strong(
-          expected, mlir::concretelang::dfr::active))
-    _dfr_start_impl(0, nullptr);
+// use_dfr_p tells the runtime how much to bring up for this invocation:
+//   0 - no dataflow work: only the whole-execution timer is started;
+//   1 - initialise the runtime if needed and start timing the computation;
+//   2 - as 1, except that the compute timer is started by _dfr_start_c
+//       once it has set up the runtime context.
+void _dfr_start(int use_dfr_p) {
+  BEGIN_TIME(&mlir::concretelang::dfr::whole_timer);
+  if (use_dfr_p) {
+    BEGIN_TIME(&mlir::concretelang::dfr::init_timer);
+    // The first invocation will initialise the runtime. As each call to
+    // _dfr_start is matched with _dfr_stop, if this is not the first,
+    // we need to resume the HPX runtime.
+    assert(mlir::concretelang::dfr::init_guard !=
+               mlir::concretelang::dfr::terminated &&
+           "DFR runtime: attempting to start runtime after it has been "
+           "terminated");
+    uint64_t expected = mlir::concretelang::dfr::uninitialised;
+    if (mlir::concretelang::dfr::init_guard.compare_exchange_strong(
+            expected, mlir::concretelang::dfr::active))
+      _dfr_start_impl(0, nullptr);
+    END_TIME(&mlir::concretelang::dfr::init_timer, "Initialization");
 
-  assert(mlir::concretelang::dfr::init_guard ==
-             mlir::concretelang::dfr::active &&
-         "DFR runtime failed to initialise");
+    assert(mlir::concretelang::dfr::init_guard ==
+               mlir::concretelang::dfr::active &&
+           "DFR runtime failed to initialise");
 
-  // If this is not the root node in a non-JIT execution, then this
-  // node should only run the scheduler for any incoming work until
-  // termination is flagged. If this is JIT, we need to run the
-  // cancelled function which registers the work functions.
-  if (!mlir::concretelang::dfr::_dfr_is_root_node() &&
-      !mlir::concretelang::dfr::_dfr_is_jit())
-    _dfr_stop_impl();
+    if (use_dfr_p == 1) {
+      BEGIN_TIME(&mlir::concretelang::dfr::compute_timer);
+    }
+
+    // If this is not the root node in a non-JIT execution, then this
+    // node should only run the scheduler for any incoming work until
+    // termination is flagged. If this is JIT, we need to run the
+    // cancelled function which registers the work functions.
+    if (!mlir::concretelang::dfr::_dfr_is_root_node() &&
+        !mlir::concretelang::dfr::_dfr_is_jit())
+      _dfr_stop_impl();
+  }
 }
 
 // Startup entry point when a RuntimeContext is used
 void _dfr_start_c(void *ctx) {
-  _dfr_start();
+  _dfr_start(2);
 
-  new mlir::concretelang::dfr::RuntimeContextManager();
-  mlir::concretelang::dfr::_dfr_node_level_runtime_context_manager->setContext(
-      ctx);
+  if (mlir::concretelang::dfr::num_nodes > 1) {
+    BEGIN_TIME(&mlir::concretelang::dfr::broadcast_timer);
+    new mlir::concretelang::dfr::RuntimeContextManager();
+    mlir::concretelang::dfr::_dfr_node_level_runtime_context_manager
+        ->setContext(ctx);
 
-  // If this is not JIT, then the remote nodes never reach _dfr_stop,
-  // so root should not instantiate this barrier.
-  if (mlir::concretelang::dfr::_dfr_is_root_node() &&
-      mlir::concretelang::dfr::_dfr_is_jit())
-    mlir::concretelang::dfr::_dfr_startup_barrier->wait();
+    // If this is not JIT, then the remote nodes never reach _dfr_stop,
+    // so root should not instantiate this barrier.
+    if (mlir::concretelang::dfr::_dfr_is_root_node() &&
+        mlir::concretelang::dfr::_dfr_is_jit())
+      mlir::concretelang::dfr::_dfr_startup_barrier->wait();
+    END_TIME(&mlir::concretelang::dfr::broadcast_timer, "Key broadcasting");
+  }
+  BEGIN_TIME(&mlir::concretelang::dfr::compute_timer);
 }
 
 // This function cannot be used to terminate the runtime as it is
 // non-decidable if another computation phase will follow. Instead the
 // _dfr_terminate function provides this facility and is normally
 // called on exit from "main" when not using the main wrapper library.
-void _dfr_stop() {
-  // Non-root nodes synchronize here with the root to mark the point
-  // where the root is free to send work out (only needed in JIT).
-  if (!mlir::concretelang::dfr::_dfr_is_root_node())
-    mlir::concretelang::dfr::_dfr_startup_barrier->wait();
+void _dfr_stop(int use_dfr_p) {
+  if (use_dfr_p) {
+    if (mlir::concretelang::dfr::num_nodes > 1) {
+      // Non-root nodes synchronize here with the root to mark the point
+      // where the root is free to send work out (only needed in JIT).
+      if (!mlir::concretelang::dfr::_dfr_is_root_node())
+        mlir::concretelang::dfr::_dfr_startup_barrier->wait();
 
-  // The barrier is only needed to synchronize the different
-  // computation phases when the compute nodes need to generate and
-  // register new work functions in each phase.
+      // The barrier is only needed to synchronize the different
+      // computation phases when the compute nodes need to generate and
+      // register new work functions in each phase.
 
-  // TODO: this barrier may be removed based on how work function
-  // registration is handled - but it is unlikely to result in much
-  // gain as the root node would be waiting for the end of computation
-  // on all remote nodes before reaching here anyway (dataflow
-  // dependences).
-  if (mlir::concretelang::dfr::_dfr_is_jit()) {
-    mlir::concretelang::dfr::_dfr_jit_phase_barrier->wait();
+      // TODO: this barrier may be removed based on how work function
+      // registration is handled - but it is unlikely to result in much
+      // gain as the root node would be waiting for the end of computation
+      // on all remote nodes before reaching here anyway (dataflow
+      // dependences).
+      if (mlir::concretelang::dfr::_dfr_is_jit()) {
+        mlir::concretelang::dfr::_dfr_jit_phase_barrier->wait();
+      }
+
+      mlir::concretelang::dfr::_dfr_node_level_runtime_context_manager
+          ->clearContext();
+    }
+    END_TIME(&mlir::concretelang::dfr::compute_timer, "Compute");
   }
-
-  mlir::concretelang::dfr::_dfr_node_level_runtime_context_manager
-      ->clearContext();
+  END_TIME(&mlir::concretelang::dfr::whole_timer, "Total execution");
 }
 
 void _dfr_try_initialize() {
@@ -1266,6 +1293,7 @@ void _dfr_print_debug(size_t val) {
 #else // CONCRETELANG_PARALLEL_EXECUTION_ENABLED
 
 #include "concretelang/Runtime/DFRuntime.hpp"
+#include "concretelang/Runtime/time_util.h"
 
 namespace mlir {
 namespace concretelang {
@@ -1273,6 +1301,7 @@ namespace dfr {
 namespace {
 static bool is_jit_p = false;
 static bool use_omp_p = false;
+static struct timespec compute_timer;
 } // namespace
 
 void _dfr_set_required(bool is_required) {}
@@ -1281,9 +1310,18 @@ void _dfr_set_use_omp(bool use_omp) { use_omp_p = use_omp; }
 bool _dfr_is_jit() { return is_jit_p; }
 bool _dfr_is_root_node() { return true; }
 bool _dfr_use_omp() { return use_omp_p; }
+
 } // namespace dfr
 } // namespace concretelang
 } // namespace mlir
 
+void _dfr_start(int use_dfr_p) {
+  BEGIN_TIME(&mlir::concretelang::dfr::compute_timer);
+}
+void _dfr_start_c(void *ctx) { _dfr_start(2); }
+void _dfr_stop(int use_dfr_p) {
+  END_TIME(&mlir::concretelang::dfr::compute_timer, "Compute");
+}
+
 void _dfr_terminate() {}
 #endif