From fbd0b4e4bd37b27fdd03eab4b09885ffa2928973 Mon Sep 17 00:00:00 2001 From: Koren-Brand Date: Mon, 5 Aug 2024 09:32:58 +0300 Subject: [PATCH] added framework for generic small tasks thread pool that has yet to be verified against another model (i.e. gpu) --- icicle_v3/backend/cpu/src/curve/cpu_msm.cpp | 460 +++++++++--------- icicle_v3/backend/cpu/src/curve/cpu_msm.hpp | 219 +-------- .../backend/cpu/src/curve/dummy_classes.cpp | 162 ++++++ .../backend/cpu/src/curve/tasks_manager.cpp | 178 +++++++ .../backend/cpu/src/curve/tasks_manager.hpp | 66 +++ icicle_v3/tests/test_curve_api.cpp | 155 ++---- 6 files changed, 683 insertions(+), 557 deletions(-) create mode 100644 icicle_v3/backend/cpu/src/curve/dummy_classes.cpp create mode 100644 icicle_v3/backend/cpu/src/curve/tasks_manager.cpp create mode 100644 icicle_v3/backend/cpu/src/curve/tasks_manager.hpp diff --git a/icicle_v3/backend/cpu/src/curve/cpu_msm.cpp b/icicle_v3/backend/cpu/src/curve/cpu_msm.cpp index 22c7a96d..19b5b9ff 100644 --- a/icicle_v3/backend/cpu/src/curve/cpu_msm.cpp +++ b/icicle_v3/backend/cpu/src/curve/cpu_msm.cpp @@ -268,86 +268,6 @@ eIcicleError cpu_msm_single_thread( return eIcicleError::SUCCESS; } -template -ThreadTask::ThreadTask() : return_idx(-1), p1(Point::zero()), p2(Point::zero()), result(Point::zero()) -{ -} - -template -ThreadTask::ThreadTask( - const ThreadTask& other) // TODO delete when changing to task array instead of vector - : return_idx(-1), p1(Point::zero()), p2(Point::zero()), result(Point::zero()) -{ -} - -template -void ThreadTask::run(int tid, std::vector& idle_idxs, bool& kill_thread) -{ - bool rdy_status = in_ready.load(std::memory_order_acquire); - if (!rdy_status) idle_idxs.push_back(pidx); // TODO remove when finishing debugging - while (!rdy_status) { - std::this_thread::sleep_for(std::chrono::microseconds(10)); - rdy_status = in_ready.load(std::memory_order_acquire); - if (kill_thread) return; - } - in_ready.store(false, std::memory_order_release); - result = p1 + p2; - out_done.store(true, std::memory_order_release); -} - -template -void ThreadTask::new_task(const int in_idx, const Point& in_p1, const Point& in_p2) -{ - out_done.store(false, std::memory_order_release); - return_idx = in_idx; - p1 = in_p1; // Copied by value to not be linked to the original bucket - p2 = in_p2; - in_ready.store(true, std::memory_order_release); -} - -template -void ThreadTask::chain_task(const Point in_p2) -{ - // std::unique_lock temp_lock(idle_mtx); - out_done.store(false, std::memory_order_release); - p2 = in_p2; - p1 = result; - in_ready.store(true, std::memory_order_release); - // idle.notify_one(); -} - -template -WorkThread::~WorkThread() -{ - kill_thread = true; - thread.join(); -} - -template -void WorkThread::thread_setup(const int tid, const int task_per_thread) -{ - this->tid = tid; - for (int j = 0; j < task_per_thread; j++) - tasks.push_back(ThreadTask()); // TODO change to array - thread = std::thread( - &WorkThread::add_ec_tasks, this, std::ref(kill_thread)); // TODO kill_thread is accessible from this -} - -template -void WorkThread::add_ec_tasks(bool& kill_thread) -{ - while (!kill_thread) { - int i = 0; - for (ThreadTask& task : tasks) { - task.run(tid, idle_idxs, kill_thread); -#ifdef DEBUG_PRINTS - if (tid == 2) std::cout << i << ", bkt_idx=" << tasks[task_round_robin].return_idx << "\tDone\n"; - i++; -#endif - } - } -} - template Point int_mult(Point p, int x) { @@ -370,14 +290,13 @@ Msm::Msm(const MSMConfig& config) c(config.ext->get("c")), // TODO calculate instead of param 
precomp_f(config.precompute_factor), num_bms(((sca_test::NBITS - 1) / (config.precompute_factor * c)) + 1), are_scalars_mont(config.are_scalars_montgomery_form), are_points_mont(config.are_points_montgomery_form), - kill_threads(false), + manager(config.ext->get("n_threads")), #ifdef DEBUG_PRINTS trace_f("trace_bucket_multi.txt"), // TODO delete #endif - thread_round_robin(0), num_bkts(1 << (c - 1)), - log_num_segments(std::max((int)std::ceil(std::log2(n_threads / num_bms)), 0)), - num_bm_segments(1 << log_num_segments), segment_size(num_bkts >> log_num_segments), - tasks_per_thread(std::max(((int)(num_bms / n_threads)) << (log_num_segments + 1), 2)) + num_bkts(1 << (c - 1)), + log_num_segments(std::max((int)std::ceil(std::log2(n_threads * TASKS_PER_THREAD / (2 * num_bms))), 0)), + num_bm_segments(1 << log_num_segments), segment_size(num_bkts >> log_num_segments) { // Phase 1 bkts = new Point[num_bms * num_bkts]; @@ -395,9 +314,9 @@ Msm::Msm(const MSMConfig& config) bm_sums = new Point[num_bms]; - threads = new WorkThread[n_threads]; - for (int i = 0; i < n_threads; i++) - threads[i].thread_setup(i, tasks_per_thread); + // threads = new WorkThread[n_threads]; + // for (int i = 0; i < n_threads; i++) + // threads[i].thread_setup(i, tasks_per_thread); #ifdef DEBUG_PRINTS if (!trace_f.good()) { throw std::invalid_argument("Can't open file"); } // TODO remove log @@ -410,12 +329,11 @@ Msm::~Msm() std::cout << "\n\nDestroying msm object at the end of the run\n\n"; // COMMENT why am I not seeing it one the console? // Isn't the destructor automatically called when // msm goes out of scope? - kill_threads = true; // TODO check if it is used after kill_thread has been added to WorkThread destructor // for (int i = 0; i < n_threads; i++) threads[i].thread.join(); #ifdef DEBUG_PRINTS trace_f.close(); #endif - delete[] threads; + // delete[] threads; delete[] bkts; delete[] bkts_occupancy; @@ -445,82 +363,117 @@ void Msm::wait_for_idle() * @param bkts_occupancy - boolean array holding the bkts_occupancy of each bucket * @return boolean, a flag indicating all threads are idle */ - bool all_threads_idle = false; + ECaddTask* task = nullptr; + manager.get_completed_task(task); int count = 0; - while (!all_threads_idle) { - all_threads_idle = true; - - for (int i = 0; i < n_threads; i++) { - int og_task_round_robin = threads[i].task_round_robin; - + while (task != nullptr) + { + // std::cout << count << ":\tNew completed task.\tstatus=" << task->status << " (Idle = " << task->is_idle() << ")" << "\n"; + count++; + if (bkts_occupancy[task->return_idx]) + { + task->p1 = task->result; + task->p2 = bkts[task->return_idx]; + bkts_occupancy[task->return_idx] = false; #ifdef DEBUG_PRINTS - trace_f << "Finishing thread " << i << ", starting at task: " << og_task_round_robin << '\n'; + trace_f << '#' << task->return_idx << ":\tFCollision addition - bkts' cell:\t" + << Point::to_affine(bkts[task->return_idx]).x << "\t(With add res point:\t" + << Point::to_affine(task->result).x << " = " << Point::to_affine(bkts[task->return_idx] + task->result).x + << ")\t(" << task << ")\n"; #endif - - for (int j = og_task_round_robin; j < og_task_round_robin + tasks_per_thread; j++) { - int task_idx = j; - if (task_idx >= tasks_per_thread) task_idx -= tasks_per_thread; - - // For readability - ThreadTask& task = threads[i].tasks[task_idx]; - if (task.out_done.load(std::memory_order_acquire)) { - if (task.return_idx >= 0) { - if (bkts_occupancy[task.return_idx]) { -#ifdef DEBUG_PRINTS - trace_f << '#' << task.return_idx << 
":\tFCollision addition - bkts' cell:\t" - << Point::to_affine(bkts[task.return_idx]).x << "\t(With add res point:\t" - << Point::to_affine(task.result).x << " = " - << Point::to_affine(bkts[task.return_idx] + task.result).x << ")\t(" << i << ',' << task_idx - << ")\n"; -#endif - // std::cout << "\n" << i << ":\t(" << task_idx << "->" << threads[i].task_round_robin << - // ")\tChaining\n\n"; - bkts_occupancy[task.return_idx] = false; - int bkt_idx = task.return_idx; - task.return_idx = -1; - - threads[i].tasks[threads[i].task_round_robin].new_task(bkt_idx, task.result, bkts[bkt_idx]); - if (threads[i].task_round_robin == tasks_per_thread - 1) - threads[i].task_round_robin = 0; - else - threads[i].task_round_robin++; - - all_threads_idle = false; // This thread isn't idle due to the newly assigned task - } else { - bkts[task.return_idx] = task.result; -#ifdef DEBUG_PRINTS - trace_f << '#' << task.return_idx << ":\tFWrite (res) free cell:\t" - << Point::to_affine(bkts[task.return_idx]).x << "\t(" << i << ',' << task_idx << ")\n"; -#endif - bkts_occupancy[task.return_idx] = true; - task.return_idx = -1; // To ensure no repeated handling of outputs - } - } else { -#ifdef DEBUG_PRINTS - trace_f << "Task " << task_idx << " idle\n"; -#endif - } - } else { - all_threads_idle = false; - break; -#ifdef DEBUG_PRINTS - trace_f << '#' << task.return_idx << ":\t(" << i << ',' << task_idx << ") not done\tres=" << task.result - << "\tstatus:" << task.in_ready << ',' << task.out_done << '\n'; -#endif - } - } + task->dispatch(); } - // std::this_thread::sleep_for(std::chrono::milliseconds(5000)); + else bkts[task->return_idx] = task->result; + manager.get_completed_task(task); } - // trace_f.flush(); } +// template +// void Msm::old_wait_for_idle() +// { +// /** +// * Handle thread outputs and possible collisions between results and occupied bkts +// * @param threads - working threads array +// * @param n_threads - size of the above array +// * @param bkts - bucket array to store results / add stored value with result +// * @param bkts_occupancy - boolean array holding the bkts_occupancy of each bucket +// * @return boolean, a flag indicating all threads are idle +// */ +// bool all_threads_idle = false; +// int count = 0; +// while (!all_threads_idle) { +// all_threads_idle = true; + +// for (int i = 0; i < n_threads; i++) { +// int og_task_round_robin = threads[i].task_round_robin; + +// #ifdef DEBUG_PRINTS +// trace_f << "Finishing thread " << i << ", starting at task: " << og_task_round_robin << '\n'; +// #endif + +// for (int j = og_task_round_robin; j < og_task_round_robin + tasks_per_thread; j++) { +// int task_idx = j; +// if (task_idx >= tasks_per_thread) task_idx -= tasks_per_thread; + +// // For readability +// ThreadTask& task = threads[i].tasks[task_idx]; +// if (task.out_done.load(std::memory_order_acquire)) { +// if (task.return_idx >= 0) { +// if (bkts_occupancy[task.return_idx]) { +// #ifdef DEBUG_PRINTS +// trace_f << '#' << task.return_idx << ":\tFCollision addition - bkts' cell:\t" +// << Point::to_affine(bkts[task.return_idx]).x << "\t(With add res point:\t" +// << Point::to_affine(task.result).x << " = " +// << Point::to_affine(bkts[task.return_idx] + task.result).x << ")\t(" << i << ',' << task_idx +// << ")\n"; +// #endif +// // std::cout << "\n" << i << ":\t(" << task_idx << "->" << threads[i].task_round_robin << +// // ")\tChaining\n\n"; +// bkts_occupancy[task.return_idx] = false; +// int bkt_idx = task.return_idx; +// task.return_idx = -1; + +// 
threads[i].tasks[threads[i].task_round_robin].new_task(bkt_idx, task.result, bkts[bkt_idx]); +// if (threads[i].task_round_robin == tasks_per_thread - 1) +// threads[i].task_round_robin = 0; +// else +// threads[i].task_round_robin++; + +// all_threads_idle = false; // This thread isn't idle due to the newly assigned task +// } else { +// bkts[task.return_idx] = task.result; +// #ifdef DEBUG_PRINTS +// trace_f << '#' << task.return_idx << ":\tFWrite (res) free cell:\t" +// << Point::to_affine(bkts[task.return_idx]).x << "\t(" << i << ',' << task_idx << ")\n"; +// #endif +// bkts_occupancy[task.return_idx] = true; +// task.return_idx = -1; // To ensure no repeated handling of outputs +// } +// } else { +// #ifdef DEBUG_PRINTS +// trace_f << "Task " << task_idx << " idle\n"; +// #endif +// } +// } else { +// all_threads_idle = false; +// break; +// #ifdef DEBUG_PRINTS +// trace_f << '#' << task.return_idx << ":\t(" << i << ',' << task_idx << ") not done\tres=" << task.result +// << "\tstatus:" << task.in_ready << ',' << task.out_done << '\n'; +// #endif +// } +// } +// } +// // std::this_thread::sleep_for(std::chrono::milliseconds(5000)); +// } +// // trace_f.flush(); +// } + template -template void Msm::phase1_push_addition( const unsigned int task_bkt_idx, const Point bkt, - const Base& base, + const Point& base, int pidx) // TODO add option of adding different types { /** @@ -530,72 +483,120 @@ void Msm::phase1_push_addition( * an overwritten without affecting the working thread * @param p2 - point to be added */ - int count_thread_iters = 0; - bool assigned_to_thread = false; - while (!assigned_to_thread) { - // For readability - ThreadTask& task = threads[thread_round_robin].tasks[threads[thread_round_robin].task_round_robin]; - if (task.out_done.load(std::memory_order_acquire)) { - num_additions++; - if (task.return_idx >= 0) { - if (bkts_occupancy[task.return_idx]) { + ECaddTask* task = nullptr; // TODO actually use the 2 types according to the phase and remove templates + while (task == nullptr) + { + bool holds_result = manager.get_free_task(task); + if (holds_result) + { + if (bkts_occupancy[task->return_idx]) + { + task->p1 = task->result; + task->p2 = bkts[task->return_idx]; + bkts_occupancy[task->return_idx] = false; #ifdef DEBUG_PRINTS - trace_f << '#' << task.return_idx << ":\tCollision addition - bkts' cell:\t" - << Point::to_affine(bkts[task.return_idx]).x << "\t(With add res point:\t" - << Point::to_affine(task.result).x << " = " << Point::to_affine(bkts[task.return_idx] + task.result).x - << ")\t(" << thread_round_robin << ',' << threads[thread_round_robin].task_round_robin << ")\n"; + trace_f << '#' << task->return_idx << ":\tCollision addition - bkts' cell:\t" + << Point::to_affine(bkts[task->return_idx]).x << "\t(With add res point:\t" + << Point::to_affine(task->result).x << " = " << Point::to_affine(bkts[task->return_idx] + task->result).x + << ")\t(" << task << ")\n"; #endif - bkts_occupancy[task.return_idx] = false; - ; - task.pidx = pidx; - task.chain_task(bkts[task.return_idx]); - } else { -#ifdef DEBUG_PRINTS - trace_f << '#' << task.return_idx << ":\tWrite (res) free cell:\t" << Point::to_affine(task.result).x << "\t(" - << thread_round_robin << ',' << threads[thread_round_robin].task_round_robin << ")\n"; -#endif - bkts[task.return_idx] = task.result; - bkts_occupancy[task.return_idx] = true; - task.pidx = pidx; - task.new_task(task_bkt_idx, bkt, Point::from_affine(base)); // TODO support multiple types - assigned_to_thread = true; -#ifdef DEBUG_PRINTS - 
trace_f << '#' << task_bkt_idx << ":\tAssigned to:\t(" << thread_round_robin << ',' - << threads[thread_round_robin].task_round_robin << ")\n"; -// trace_f.flush(); -#endif - // break; - } - } else { - task.pidx = pidx; - task.new_task(task_bkt_idx, bkt, Point::from_affine(base)); // TODO support multiple types - assigned_to_thread = true; -#ifdef DEBUG_PRINTS - trace_f << '#' << task_bkt_idx << ":\tAssigned to:\t(" << thread_round_robin << ',' - << threads[thread_round_robin].task_round_robin << ")\n"; -// trace_f.flush(); -#endif - // break; + task->dispatch(); + task = nullptr; + continue; } - if (threads[thread_round_robin].task_round_robin == tasks_per_thread - 1) - threads[thread_round_robin].task_round_robin = 0; - else - threads[thread_round_robin].task_round_robin++; - } - - // Move to next thread after checking all current thread's tasks - if (thread_round_robin == n_threads - 1) - thread_round_robin = 0; - else - thread_round_robin++; - count_thread_iters++; - if (count_thread_iters == n_threads) { - count_thread_iters = 0; - loop_count++; + else bkts[task->return_idx] = task->result; } + task->p1 = bkt; + task->p2 = base; + task->return_idx = task_bkt_idx; +#ifdef DEBUG_PRINTS + trace_f << '#' << task_bkt_idx << ":\tAssigned to:\t(" << task << ")\n"; +#endif + task->dispatch(); } } +// template +// template +// void Msm::old_phase1_push_addition( +// const unsigned int task_bkt_idx, +// const Point bkt, +// const Base& base, +// int pidx) // TODO add option of adding different types +// { +// /** +// * Assign EC addition to a free thread +// * @param task_bkt_idx - results address in the bkts array +// * @param bkt - bkt to be added. it is passed by value to allow the appropriate cell in the bucket array to be "free" +// * an overwritten without affecting the working thread +// * @param p2 - point to be added +// */ +// int count_thread_iters = 0; +// bool assigned_to_thread = false; +// while (!assigned_to_thread) { +// // For readability +// ThreadTask& task = threads[thread_round_robin].tasks[threads[thread_round_robin].task_round_robin]; +// if (task.out_done.load(std::memory_order_acquire)) { +// num_additions++; +// if (task.return_idx >= 0) { +// if (bkts_occupancy[task.return_idx]) { +// #ifdef DEBUG_PRINTS +// trace_f << '#' << task.return_idx << ":\tCollision addition - bkts' cell:\t" +// << Point::to_affine(bkts[task.return_idx]).x << "\t(With add res point:\t" +// << Point::to_affine(task.result).x << " = " << Point::to_affine(bkts[task.return_idx] + task.result).x +// << ")\t(" << thread_round_robin << ',' << threads[thread_round_robin].task_round_robin << ")\n"; +// #endif +// bkts_occupancy[task.return_idx] = false; +// ; +// task.pidx = pidx; +// task.chain_task(bkts[task.return_idx]); +// } else { +// #ifdef DEBUG_PRINTS +// trace_f << '#' << task.return_idx << ":\tWrite (res) free cell:\t" << Point::to_affine(task.result).x << "\t(" +// << thread_round_robin << ',' << threads[thread_round_robin].task_round_robin << ")\n"; +// #endif +// bkts[task.return_idx] = task.result; +// bkts_occupancy[task.return_idx] = true; +// task.pidx = pidx; +// task.new_task(task_bkt_idx, bkt, Point::from_affine(base)); // TODO support multiple types +// assigned_to_thread = true; +// #ifdef DEBUG_PRINTS +// trace_f << '#' << task_bkt_idx << ":\tAssigned to:\t(" << thread_round_robin << ',' +// << threads[thread_round_robin].task_round_robin << ")\n"; +// // trace_f.flush(); +// #endif +// // break; +// } +// } else { +// task.pidx = pidx; +// task.new_task(task_bkt_idx, bkt, 
Point::from_affine(base)); // TODO support multiple types +// assigned_to_thread = true; +// #ifdef DEBUG_PRINTS +// trace_f << '#' << task_bkt_idx << ":\tAssigned to:\t(" << thread_round_robin << ',' +// << threads[thread_round_robin].task_round_robin << ")\n"; +// // trace_f.flush(); +// #endif +// // break; +// } +// if (threads[thread_round_robin].task_round_robin == tasks_per_thread - 1) +// threads[thread_round_robin].task_round_robin = 0; +// else +// threads[thread_round_robin].task_round_robin++; +// } + +// // Move to next thread after checking all current thread's tasks +// if (thread_round_robin == n_threads - 1) +// thread_round_robin = 0; +// else +// thread_round_robin++; +// count_thread_iters++; +// if (count_thread_iters == n_threads) { +// count_thread_iters = 0; +// loop_count++; +// } +// } +// } + template Point* Msm::bucket_accumulator(const sca_test* scalars, const aff_test* bases, const unsigned int msm_size) { @@ -612,8 +613,7 @@ Point* Msm::bucket_accumulator(const sca_test* scalars, const aff_test* b const int num_windows_m1 = (sca_test::NBITS - 1) / c; // +1 for ceiling than -1 for m1 int carry = 0; - std::cout << "\n\nc=" << c << "\tpcf=" << precomp_f << "\tnum bms=" << num_bms << "\tntrds,tasks=" << n_threads << ',' - << tasks_per_thread << "\n\n\n"; + std::cout << "\n\nc=" << c << "\tpcf=" << precomp_f << "\tnum bms=" << num_bms << "\tntrds,tasks=" << n_threads << "\n\n\n"; std::cout << log_num_segments << '\n' << segment_size << "\n\n\n"; for (int i = 0; i < msm_size; i++) { carry = 0; @@ -645,7 +645,7 @@ Point* Msm::bucket_accumulator(const sca_test* scalars, const aff_test* b << Point::to_affine(bkts[bkt_idx] + (carry > 0 ? aff_test::neg(point) : point)).x << ")\n"; // trace_f.flush(); #endif - phase1_push_addition(bkt_idx, bkts[bkt_idx], carry > 0 ? aff_test::neg(point) : point, i); + phase1_push_addition(bkt_idx, bkts[bkt_idx], carry > 0 ? Point::from_affine(aff_test::neg(point)) : Point::from_affine(point), i); } else { bkts_occupancy[bkt_idx] = true; bkts[bkt_idx] = carry > 0 ? Point::neg(Point::from_affine(point)) : Point::from_affine(point); @@ -667,12 +667,12 @@ Point* Msm::bucket_accumulator(const sca_test* scalars, const aff_test* b return bkts; } -template -std::tuple -Msm::phase2_push_addition(const unsigned int task_bkt_idx, const Point& bkt, const Point& base) -{ - return std::make_tuple(-1, -1); -} +// template +// std::tuple +// Msm::phase2_push_addition(const unsigned int task_bkt_idx, const Point& bkt, const Point& base); +// { +// return std::make_tuple(-1, -1); +// } template Point* Msm::bm_sum() @@ -705,8 +705,8 @@ Point* Msm::bm_sum() int assigned_thread = std::get<0>(task_assigned_to_sum[line_sum_idx]); int assigned_task = std::get<1>(task_assigned_to_sum[line_sum_idx]); if (assigned_thread >= 0) { - while (!threads[assigned_thread].task[assigned_task].out_done.load(std::memory_order_acquire)) { - } // TODO add sleep + // while (!threads[assigned_thread].task[assigned_task].out_done.load(std::memory_order_acquire)) { + // } // TODO add sleep // task_assigned_to_sum[] } @@ -766,8 +766,6 @@ eIcicleError cpu_msm_ref( sca_test scalar = config.are_scalars_montgomery_form ? sca_test::from_montgomery(scalars[i]) : scalars[i]; aff_test point = config.are_points_montgomery_form ? aff_test::from_montgomery(bases[precomp_f * i]) : bases[precomp_f * i]; - - // std::cout << scalar << "\t*\t" << point << '\n'; res = res + scalar * Point::from_affine(point); } // results[0] = config.are_points_montgomery_form? 
Point::to_montgomery(res) : res; @@ -789,22 +787,16 @@ eIcicleError cpu_msm_precompute_bases( const unsigned int c = config.ext->get("c"); const unsigned int num_bms_no_precomp = (sca_test::NBITS - 1) / c + 1; const unsigned int shift = c * ((num_bms_no_precomp - 1) / precompute_factor + 1); - // std::cout << "Starting precompute\n"; - // std::cout << c << ',' << shift << '\n'; for (int i = 0; i < nof_bases; i++) { output_bases[precompute_factor * i] = input_bases[i]; // COMMENT Should I copy? (not by reference) proj_test point = proj_test::from_affine(is_mont ? A::from_montgomery(input_bases[i]) : input_bases[i]); - // std::cout << "OG point[" << i << "]:\t" << point << '\n'; for (int j = 1; j < precompute_factor; j++) { for (int k = 0; k < shift; k++) { point = proj_test::dbl(point); - // std::cout << point << '\n'; } - // std::cout << "Shifted point[" << i << "]:\t" << point << "\nStored in index=" << (precompute_factor*i+j) << - // '\n'; output_bases[precompute_factor * i + j] = is_mont ? A::to_montgomery(proj_test::to_affine(point)) - : proj_test::to_affine(point); // TODO change here order of precomp + : proj_test::to_affine(point); } } return eIcicleError::SUCCESS; diff --git a/icicle_v3/backend/cpu/src/curve/cpu_msm.hpp b/icicle_v3/backend/cpu/src/curve/cpu_msm.hpp index ef92a0b0..5165a8b1 100644 --- a/icicle_v3/backend/cpu/src/curve/cpu_msm.hpp +++ b/icicle_v3/backend/cpu/src/curve/cpu_msm.hpp @@ -18,6 +18,8 @@ using namespace icicle; #include "icicle/curves/projective.h" #include "icicle/curves/curve_config.h" +#include "tasks_manager.cpp" + using aff_test = affine_t; using proj_test = projective_t; using sca_test = scalar_t; @@ -25,164 +27,7 @@ using sca_test = scalar_t; #include #include -struct MSMConfig { - int nof_bases; /**< Number of bases in the MSM for batched MSM. Set to 0 if all MSMs use the same bases or set to - * 'batch X #scalars' otherwise. Default value: 0 (that is reuse bases for all batch elements). */ - int precompute_factor; /**< The number of extra points to pre-compute for each point. See the - * [precompute_msm_bases](@ref precompute_msm_bases) function, `precompute_factor` passed - * there needs to be equal to the one used here. Larger values decrease the - * number of computations to make, on-line memory footprint, but increase the static - * memory footprint. Default value: 1 (i.e. don't pre-compute). */ - int batch_size; /**< The number of MSMs to compute. Default value: 1. */ - bool are_scalars_on_device; /**< True if scalars are on device and false if they're on host. Default value: - * false. */ - bool are_scalars_montgomery_form; /**< True if scalars are in Montgomery form and false otherwise. Default value: - * true. */ - bool are_points_on_device; /**< True if points are on device and false if they're on host. Default value: false. */ - bool are_points_montgomery_form; /**< True if coordinates of points are in Montgomery form and false otherwise. - * Default value: true. */ - bool are_results_on_device; /**< True if the results should be on device and false if they should be on host. If set - * to false, `is_async` won't take effect because a synchronization is needed to - * transfer results to the host. Default value: false. */ - bool is_async; /**< Whether to run the MSM asynchronously. If set to true, the MSM function will be - * non-blocking and you'd need to synchronize it explicitly by running - * `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the MSM - * function will block the current CPU thread. 
*/ - - ConfigExtension ext; /** backend specific extensions*/ -}; - -struct MsmPreComputeConfig { - bool is_input_on_device; - bool is_output_on_device; - bool is_async; - - ConfigExtension ext; /** backend specific extensions*/ -}; - -#define P_MACRO 1000 - -struct Device { -}; - -class Dummy_Scalar -{ -public: - static constexpr unsigned NBITS = 32; - - unsigned x; - unsigned p = P_MACRO; - // unsigned p = 1<<30; - - static Dummy_Scalar zero() { return {0}; } - - static Dummy_Scalar one() { return {1}; } - - static Dummy_Scalar to_montgomery(const Dummy_Scalar& s) { return {s.x}; } - - static Dummy_Scalar from_montgomery(const Dummy_Scalar& s) { return {s.x}; } - - friend std::ostream& operator<<(std::ostream& os, const Dummy_Scalar& scalar) - { - os << scalar.x; - return os; - } - - unsigned get_scalar_digit(unsigned digit_num, unsigned digit_width) const - { - return (x >> (digit_num * digit_width)) & ((1 << digit_width) - 1); - } - - friend Dummy_Scalar operator+(Dummy_Scalar p1, const Dummy_Scalar& p2) { return {(p1.x + p2.x) % p1.p}; } - - friend Dummy_Scalar operator-(Dummy_Scalar p1, const Dummy_Scalar& p2) { return p1 + neg(p2); } - - friend bool operator==(const Dummy_Scalar& p1, const Dummy_Scalar& p2) { return (p1.x == p2.x); } - - friend bool operator==(const Dummy_Scalar& p1, const unsigned p2) { return (p1.x == p2); } - - static Dummy_Scalar neg(const Dummy_Scalar& scalar) { return {scalar.p - scalar.x}; } - static Dummy_Scalar rand_host(std::mt19937_64& rand_generator) - { - // return {(unsigned)rand() % P_MACRO}; - std::uniform_int_distribution distribution(0, P_MACRO - 1); - return {distribution(rand_generator)}; - } - - static void rand_host_many(Dummy_Scalar* out, int size, std::mt19937_64& rand_generator) - { - for (int i = 0; i < size; i++) - // out[i] = (i % size < 100) ? 
rand_host(rand_generator) : out[i - 100]; - out[i] = rand_host(rand_generator); - } -}; - -class Dummy_Projective -{ -public: - Dummy_Scalar x; - - static Dummy_Projective zero() { return {0}; } - - static Dummy_Projective one() { return {1}; } - - static Dummy_Projective to_affine(const Dummy_Projective& point) { return {point.x}; } - - static Dummy_Projective from_affine(const Dummy_Projective& point) { return {point.x}; } - - static Dummy_Projective to_montgomery(const Dummy_Projective& point) { return {point.x}; } - - static Dummy_Projective from_montgomery(const Dummy_Projective& point) { return {point.x}; } - - static Dummy_Projective neg(const Dummy_Projective& point) { return {Dummy_Scalar::neg(point.x)}; } - - static Dummy_Projective copy(const Dummy_Projective& point) { return {point.x}; } - - friend Dummy_Projective operator+(Dummy_Projective p1, const Dummy_Projective& p2) { return {p1.x + p2.x}; } - - friend Dummy_Projective operator-(Dummy_Projective p1, const Dummy_Projective& p2) { return {p1.x - p2.x}; } - - static Dummy_Projective dbl(const Dummy_Projective& point) { return {point.x + point.x}; } - - // friend Dummy_Projective operator-(Dummy_Projective p1, const Dummy_Projective& p2) { - // return p1 + neg(p2); - // } - - friend std::ostream& operator<<(std::ostream& os, const Dummy_Projective& point) - { - os << point.x; - return os; - } - - friend Dummy_Projective operator*(Dummy_Scalar scalar, const Dummy_Projective& point) - { - Dummy_Projective res = zero(); -#ifdef CUDA_ARCH - UNROLL -#endif - for (int i = 0; i < Dummy_Scalar::NBITS; i++) { - if (i > 0) { res = res + res; } - if (scalar.get_scalar_digit(Dummy_Scalar::NBITS - i - 1, 1)) { res = res + point; } - } - return res; - } - - friend bool operator==(const Dummy_Projective& p1, const Dummy_Projective& p2) { return (p1.x == p2.x); } - - static bool is_zero(const Dummy_Projective& point) { return point.x == 0; } - - static Dummy_Projective rand_host(std::mt19937_64& rand_generator) - { - return {(unsigned)rand() % P_MACRO}; - // return {(unsigned)rand()}; - } - - static void rand_host_many_affine(Dummy_Projective* out, int size, std::mt19937_64& rand_generator) - { - for (int i = 0; i < size; i++) - out[i] = (i % size < 100) ? 
to_affine(rand_host(rand_generator)) : out[i - 100]; - } -}; +#include "dummy_classes.cpp" using aff_test = Dummy_Projective; using proj_test = Dummy_Projective; @@ -196,47 +41,18 @@ using sca_test = Dummy_Scalar; #ifndef STANDALONE using namespace curve_config; -using namespace icicle; #endif -template // TODO add support for two different point types -class ThreadTask -{ // TODO turn to class and check which members can be private (or at least function by getter/setter) +template +class ECaddTask : public TaskBase +{ public: - std::atomic in_ready{false}; + ECaddTask() : TaskBase(), p1(Point::zero()), p2(AddedPoint::zero()), result(Point::zero()), return_idx(-1) {} + virtual void execute() { result = p1 + p2; } + + Point p1, result; // TODO result will be stored in p1 and support two point types + AddedPoint p2; int return_idx; - int pidx = 0; // TODO remove after debugging done - Point p1; - Point p2; // TODO allow different types of points to be added - Point result; // TODO Remove and result will be stored in p1 - std::atomic out_done{true}; - - ThreadTask(); - ThreadTask(const ThreadTask& other); - void run(int tid, std::vector& idle_idxs, bool& kill_thread); - void new_task(const int in_idx, const Point& in_p1, const Point& in_p2); - void chain_task(const Point in_p2); - // TODO implement below to make class members private - inline bool set_ready(); - inline bool is_done(); - inline Point get_result(); - inline void set_idle(); -}; - -template -struct WorkThread { // TODO turn to class and check which members can be private (or at least function by getter/setter) - int tid; - std::vector idle_idxs; - std::thread thread; - int task_round_robin = 0; - std::vector> tasks; - std::condition_variable idle; - std::mutex idle_mtx; - bool kill_thread = false; // TODO rethink kill thread as phase already allows breaking - - ~WorkThread(); - void thread_setup(const int tid, const int task_per_thread); - void add_ec_tasks(bool& kill_thread); }; template @@ -244,14 +60,8 @@ class Msm { private: // std::vector> threads; - WorkThread* threads; + TasksManager> manager; const unsigned int n_threads; - const unsigned int tasks_per_thread; // TODO check effect of the num of tasks one p1 performance and maybe have - // separate tasks_per_thread values for p1 and p2 - int thread_round_robin; - bool any_thread_free; - - bool kill_threads; const unsigned int c; const unsigned int num_bkts; @@ -282,6 +92,7 @@ private: std::ofstream trace_f; void wait_for_idle(); + // void old_wait_for_idle(); // template // void push_addition( const unsigned int task_bkt_idx, @@ -291,8 +102,10 @@ private: // Point* result_arr, // bool* ); + void phase1_push_addition(const unsigned int task_bkt_idx, const Point bkt, const Point& base, int pidx); + template - void phase1_push_addition(const unsigned int task_bkt_idx, const Point bkt, const Base& base, int pidx); + // void old_phase1_push_addition(const unsigned int task_bkt_idx, const Point bkt, const Base& base, int pidx); std::tuple phase2_push_addition(const unsigned int task_bkt_idx, const Point& bkt, const Point& base); diff --git a/icicle_v3/backend/cpu/src/curve/dummy_classes.cpp b/icicle_v3/backend/cpu/src/curve/dummy_classes.cpp new file mode 100644 index 00000000..c7bea66f --- /dev/null +++ b/icicle_v3/backend/cpu/src/curve/dummy_classes.cpp @@ -0,0 +1,162 @@ +#include "icicle/errors.h" +#include "icicle/config_extension.h" +using namespace icicle; + +struct MSMConfig { + int nof_bases; /**< Number of bases in the MSM for batched MSM. 
Set to 0 if all MSMs use the same bases or set to + * 'batch X #scalars' otherwise. Default value: 0 (that is reuse bases for all batch elements). */ + int precompute_factor; /**< The number of extra points to pre-compute for each point. See the + * [precompute_msm_bases](@ref precompute_msm_bases) function, `precompute_factor` passed + * there needs to be equal to the one used here. Larger values decrease the + * number of computations to make, on-line memory footprint, but increase the static + * memory footprint. Default value: 1 (i.e. don't pre-compute). */ + int batch_size; /**< The number of MSMs to compute. Default value: 1. */ + bool are_scalars_on_device; /**< True if scalars are on device and false if they're on host. Default value: + * false. */ + bool are_scalars_montgomery_form; /**< True if scalars are in Montgomery form and false otherwise. Default value: + * true. */ + bool are_points_on_device; /**< True if points are on device and false if they're on host. Default value: false. */ + bool are_points_montgomery_form; /**< True if coordinates of points are in Montgomery form and false otherwise. + * Default value: true. */ + bool are_results_on_device; /**< True if the results should be on device and false if they should be on host. If set + * to false, `is_async` won't take effect because a synchronization is needed to + * transfer results to the host. Default value: false. */ + bool is_async; /**< Whether to run the MSM asynchronously. If set to true, the MSM function will be + * non-blocking and you'd need to synchronize it explicitly by running + * `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the MSM + * function will block the current CPU thread. */ + + ConfigExtension ext; /** backend specific extensions*/ +}; + +struct MsmPreComputeConfig { + bool is_input_on_device; + bool is_output_on_device; + bool is_async; + + ConfigExtension ext; /** backend specific extensions*/ +}; + +#define P_MACRO 1000 + +struct Device { +}; + +class Dummy_Scalar +{ +public: + static constexpr unsigned NBITS = 32; + + unsigned x; + unsigned p = P_MACRO; + // unsigned p = 1<<30; + + static Dummy_Scalar zero() { return {0}; } + + static Dummy_Scalar one() { return {1}; } + + static Dummy_Scalar to_montgomery(const Dummy_Scalar& s) { return {s.x}; } + + static Dummy_Scalar from_montgomery(const Dummy_Scalar& s) { return {s.x}; } + + friend std::ostream& operator<<(std::ostream& os, const Dummy_Scalar& scalar) + { + os << scalar.x; + return os; + } + + unsigned get_scalar_digit(unsigned digit_num, unsigned digit_width) const + { + return (x >> (digit_num * digit_width)) & ((1 << digit_width) - 1); + } + + friend Dummy_Scalar operator+(Dummy_Scalar p1, const Dummy_Scalar& p2) { return {(p1.x + p2.x) % p1.p}; } + + friend Dummy_Scalar operator-(Dummy_Scalar p1, const Dummy_Scalar& p2) { return p1 + neg(p2); } + + friend bool operator==(const Dummy_Scalar& p1, const Dummy_Scalar& p2) { return (p1.x == p2.x); } + + friend bool operator==(const Dummy_Scalar& p1, const unsigned p2) { return (p1.x == p2); } + + static Dummy_Scalar neg(const Dummy_Scalar& scalar) { return {scalar.p - scalar.x}; } + static Dummy_Scalar rand_host(std::mt19937_64& rand_generator) + { + // return {(unsigned)rand() % P_MACRO}; + std::uniform_int_distribution distribution(0, P_MACRO - 1); + return {distribution(rand_generator)}; + } + + static void rand_host_many(Dummy_Scalar* out, int size, std::mt19937_64& rand_generator) + { + for (int i = 0; i < size; i++) + // out[i] = (i % size < 100) ? 
rand_host(rand_generator) : out[i - 100]; + out[i] = rand_host(rand_generator); + } +}; + +class Dummy_Projective +{ +public: + Dummy_Scalar x; + + static Dummy_Projective zero() { return {0}; } + + static Dummy_Projective one() { return {1}; } + + static Dummy_Projective to_affine(const Dummy_Projective& point) { return {point.x}; } + + static Dummy_Projective from_affine(const Dummy_Projective& point) { return {point.x}; } + + static Dummy_Projective to_montgomery(const Dummy_Projective& point) { return {point.x}; } + + static Dummy_Projective from_montgomery(const Dummy_Projective& point) { return {point.x}; } + + static Dummy_Projective neg(const Dummy_Projective& point) { return {Dummy_Scalar::neg(point.x)}; } + + static Dummy_Projective copy(const Dummy_Projective& point) { return {point.x}; } + + friend Dummy_Projective operator+(Dummy_Projective p1, const Dummy_Projective& p2) { return {p1.x + p2.x}; } + + friend Dummy_Projective operator-(Dummy_Projective p1, const Dummy_Projective& p2) { return {p1.x - p2.x}; } + + static Dummy_Projective dbl(const Dummy_Projective& point) { return {point.x + point.x}; } + + // friend Dummy_Projective operator-(Dummy_Projective p1, const Dummy_Projective& p2) { + // return p1 + neg(p2); + // } + + friend std::ostream& operator<<(std::ostream& os, const Dummy_Projective& point) + { + os << point.x; + return os; + } + + friend Dummy_Projective operator*(Dummy_Scalar scalar, const Dummy_Projective& point) + { + Dummy_Projective res = zero(); +#ifdef CUDA_ARCH + UNROLL +#endif + for (int i = 0; i < Dummy_Scalar::NBITS; i++) { + if (i > 0) { res = res + res; } + if (scalar.get_scalar_digit(Dummy_Scalar::NBITS - i - 1, 1)) { res = res + point; } + } + return res; + } + + friend bool operator==(const Dummy_Projective& p1, const Dummy_Projective& p2) { return (p1.x == p2.x); } + + static bool is_zero(const Dummy_Projective& point) { return point.x == 0; } + + static Dummy_Projective rand_host(std::mt19937_64& rand_generator) + { + return {(unsigned)rand() % P_MACRO}; + // return {(unsigned)rand()}; + } + + static void rand_host_many_affine(Dummy_Projective* out, int size, std::mt19937_64& rand_generator) + { + for (int i = 0; i < size; i++) + out[i] = (i % size < 100) ? 
to_affine(rand_host(rand_generator)) : out[i - 100]; + } +}; \ No newline at end of file diff --git a/icicle_v3/backend/cpu/src/curve/tasks_manager.cpp b/icicle_v3/backend/cpu/src/curve/tasks_manager.cpp new file mode 100644 index 00000000..f991f87f --- /dev/null +++ b/icicle_v3/backend/cpu/src/curve/tasks_manager.cpp @@ -0,0 +1,178 @@ +#include "tasks_manager.hpp" +#include +#include + +#define LOG_TASKS_PER_THREAD 2 +#define TASKS_PER_THREAD (1 << LOG_TASKS_PER_THREAD) +#define TASK_IDX_MASK (TASKS_PER_THREAD - 1) +#define MANAGER_SLEEP_USEC 10 +#define THREAD_SLEEP_USEC 1 + +inline bool TaskBase::is_ready_for_work() { + return status.load(std::memory_order_acquire) == dispatched; +} + +inline bool TaskBase::can_push_task() { + TaskStatus curr_status = status.load(std::memory_order_acquire); + return curr_status == pending_result || curr_status == idle; +} + +inline bool TaskBase::has_result() { + return status.load(std::memory_order_acquire) == pending_result; +} + +inline bool TaskBase::is_idle() { + return status.load(std::memory_order_acquire) == idle; +} + +void TaskBase::dispatch() { + assert(can_push_task()); + (*father_fifo_tail)++; + status.store(dispatched, std::memory_order_release); +} + +inline void TaskBase::set_working() { + assert(is_ready_for_work()); + status.store(working, std::memory_order_release); +} + +inline void TaskBase::set_pending_result() { + assert(status.load(std::memory_order_acquire) == working); + status.store(pending_result, std::memory_order_release); +} + +inline void TaskBase::set_handled_result() { + assert(has_result()); + status.store(idle, std::memory_order_release); +} + +void TaskBase::wait_done() { + while (has_result()) { + std::this_thread::sleep_for(std::chrono::microseconds(MANAGER_SLEEP_USEC)); + } +} + +template +TasksManager::Worker::Worker() +: mTasksFifo(TASKS_PER_THREAD), + tail(0), + head(0), + kill(false) + { + task_executor = std::thread(&TasksManager::Worker::run, this); + } + +template +TasksManager::Worker::~Worker() { + kill = true; + task_executor.join(); +} + +template +void TasksManager::Worker::run() { + while (true) { + for (head = 0; head < mTasksFifo.size(); head++) + { + Task* task = &mTasksFifo[head]; + if (!task->is_ready_for_work()) { + std::this_thread::sleep_for(std::chrono::microseconds(THREAD_SLEEP_USEC)); + continue; + } + task->set_working(); + task->execute(); + task->set_pending_result(); + } + if (kill) return; + } +} + +template +bool TasksManager::Worker::get_free_task(Task*& task_slot) { + for (int i = 0; i < mTasksFifo.size(); i++) + { + int tail_adjusted_idx = (i + tail) & TASK_IDX_MASK; // TODO check % is optimized when base is power of 2 + + if (mTasksFifo[tail_adjusted_idx].can_push_task()) // Either idle or pending_result are valid + { + task_slot = &mTasksFifo[tail_adjusted_idx]; + return task_slot->has_result(); + } + } + task_slot = nullptr; + return false; +} + +template +void TasksManager::Worker::get_completed_task(Task*& task_slot) { + for (int i = 0; i < mTasksFifo.size(); i++) + { + int tail_adjusted_idx = (i + tail) & TASK_IDX_MASK; + + if (mTasksFifo[tail_adjusted_idx].has_result()) + { + task_slot = &mTasksFifo[tail_adjusted_idx]; + return; + } + } + task_slot = nullptr; +} + +template +bool TasksManager::Worker::are_all_idle() +{ + bool all_tasks_idle = true; + for (Task& task : mTasksFifo) all_tasks_idle = all_tasks_idle && task.is_idle(); + return all_tasks_idle; +} + +template +TasksManager::TasksManager(const int nof_workers) +: workers(nof_workers) +{ + // Link tasks with their 
fifo idx + for (Worker& worker : workers) + { + for (Task& task : worker.mTasksFifo) + { + task.father_fifo_tail = &worker.tail; + } + } +} + +template +bool TasksManager::get_free_task(Task*& task_slot) { + bool has_task = false; + do + { + for (Worker& worker : workers) + { + has_task = worker.get_free_task(task_slot); + if (task_slot != nullptr) break; + } + } while (task_slot == nullptr); + if (has_task) task_slot->set_handled_result(); + return has_task; +} + +template +void TasksManager::get_completed_task(Task*& completed_task) { + completed_task = nullptr; + bool all_tasks_idle = true; + do + { + all_tasks_idle = true; + for (Worker& worker : workers) + { + worker.get_completed_task(completed_task); + if (completed_task != nullptr) + { + completed_task->set_handled_result(); + return; + } + + all_tasks_idle = all_tasks_idle && worker.are_all_idle(); + } + } while (!all_tasks_idle); + // No with_taskd tasks were found in the loop - no complete tasks left to be handled + completed_task = nullptr; +} \ No newline at end of file diff --git a/icicle_v3/backend/cpu/src/curve/tasks_manager.hpp b/icicle_v3/backend/cpu/src/curve/tasks_manager.hpp new file mode 100644 index 00000000..56f2e4be --- /dev/null +++ b/icicle_v3/backend/cpu/src/curve/tasks_manager.hpp @@ -0,0 +1,66 @@ +#ifndef TASKS_MANAGER +#define TASKS_MANAGER +#include +#include // TODO check windows support + +template +class TasksManager { +public: + // constructor + TasksManager(const int nof_workers); + + // Get a task that isn't occupied + // return value signals if there is a valid result to be handled before dispatching a new calculation + // Blocking functions + bool get_free_task(Task*& task_slot); + void get_completed_task(Task*& completed_task); // NOTE the user must handle this task's result before asking for new free tasks +private: + class Worker { + public: + Worker(); + ~Worker(); + void run(); + // Get a task that isn't occupied + // return value signals if there is a valid result to be handled before dispatching a new calculation + bool get_free_task(Task*& task_slot); + void get_completed_task(Task*& completed_task); // NOTE the user must handle this task's result before asking for new free tasks + bool are_all_idle(); + private: + std::thread task_executor; + std::vector mTasksFifo; + int tail; + int head; + bool kill; + + friend TasksManager::TasksManager(const int nof_workers); + }; + + std::vector workers; +}; + +class TaskBase { + public: + TaskBase() : status(idle), father_fifo_tail(nullptr) {} + virtual void execute() = 0; // COMMENT should it be private? 
still needs to be friend of worker + void dispatch(); // USER FUNC + + inline bool is_ready_for_work(); // TODO friend functions of worker + inline bool can_push_task(); + inline bool has_result(); + inline bool is_idle(); + inline void set_working(); + inline void set_pending_result(); + inline void set_handled_result(); + // Blocking function + void wait_done(); // NOTE the user must handle this task's result before asking for new free tasks + + protected: + enum TaskStatus {idle, set_task, dispatched, working, pending_result}; // TODO CAPS and eTaskStatus + std::atomic status; + private: + int* father_fifo_tail; + + template + friend TasksManager::TasksManager(const int nof_workers); +}; +#endif \ No newline at end of file diff --git a/icicle_v3/tests/test_curve_api.cpp b/icicle_v3/tests/test_curve_api.cpp index c679e198..59da3866 100644 --- a/icicle_v3/tests/test_curve_api.cpp +++ b/icicle_v3/tests/test_curve_api.cpp @@ -77,131 +77,43 @@ bool read_inputs(T* arr, const int arr_size, const std::string fname) return status; } -template -void store_inputs(T* arr, const int arr_size, const std::string fname) -{ - std::ofstream out_file(fname); - if (!out_file.is_open()) { - std::cerr << "Failed to open " << fname << " for writing.\n"; - return; + template + void store_inputs(T* arr, const int arr_size, const std::string fname) + { + std::ofstream out_file(fname); + if (!out_file.is_open()) { + std::cerr << "Failed to open " << fname << " for writing.\n"; + return; + } + for (int i = 0; i < arr_size; i++) { + out_file.write(reinterpret_cast(&arr[i]), sizeof(T)); + } + out_file.close(); } - for (int i = 0; i < arr_size; i++) { - out_file.write(reinterpret_cast(&arr[i]), sizeof(T)); + + void get_inputs(affine_t* bases, scalar_t* scalars, const int n) // TODO add precompute factor + { + // Scalars + std::string scalar_file = "build/generated_data/scalars_N" + std::to_string(n) + ".dat"; + if (!read_inputs(scalars, n, scalar_file)) { + std::cout << "Generating scalars.\n"; + scalar_t::rand_host_many(scalars, n); + store_inputs(scalars, n, scalar_file); + } + // Bases + std::string base_file = "build/generated_data/bases_N" + std::to_string(n) + ".dat"; + if (!read_inputs(bases, n, base_file)) { + std::cout << "Generating bases.\n"; + projective_t::rand_host_many(bases, n); + store_inputs(bases, n, base_file); + } } - out_file.close(); -} -void get_inputs(affine_t* bases, scalar_t* scalars, const int n) // TODO add precompute factor -{ - // Scalars - std::string scalar_file = "build/generated_data/scalars_N" + std::to_string(n) + ".dat"; - if (!read_inputs(scalars, n, scalar_file)) { - std::cout << "Generating scalars.\n"; - scalar_t::rand_host_many(scalars, n); - store_inputs(scalars, n, scalar_file); - } - // Bases - std::string base_file = "build/generated_data/bases_N" + std::to_string(n) + ".dat"; - if (!read_inputs(bases, n, base_file)) { - std::cout << "Generating bases.\n"; - projective_t::rand_host_many(bases, n); - store_inputs(bases, n, base_file); - } -} - -// TEST_F(CurveApiTest, MSM) -// { -// const int logn = 12; -// const int N = 1 << logn; -// auto scalars = std::make_unique(N); -// auto bases = std::make_unique(N); - -// bool conv_mont = false; -// get_inputs(bases.get(), scalars.get(), N); - -// if (conv_mont) { -// for (int i = 0; i < N; i++) -// bases[i] = affine_t::to_montgomery(bases[i]); -// } -// projective_t result_cpu{}; -// projective_t result_cpu_dbl_n_add{}; -// projective_t result_cpu_ref{}; - -// projective_t result{}; - -// auto run = [&](const char* dev_type, 
projective_t* result, const char* msg, bool measure, int iters) { -// Device dev = {dev_type, 0}; -// icicle_set_device(dev); - -// const int log_p = 2; -// const int c = std::max(logn, 8) - 1; -// const int pcf = 1 << log_p; - -// const int n_threads = 8; -// const int tasks_per_thread = 2; - -// auto config = default_msm_config(); -// config.ext.set("c", c); -// config.ext.set("n_threads", n_threads); -// config.ext.set("tasks_per_thread", tasks_per_thread); -// config.precompute_factor = pcf; -// config.are_scalars_montgomery_form = false; -// config.are_points_montgomery_form = conv_mont; - -// auto pc_config = default_msm_pre_compute_config(); -// pc_config.ext.set("c", c); -// pc_config.ext.set("is_mont", config.are_points_montgomery_form); - -// auto precomp_bases = std::make_unique(N * pcf); -// // TODO update cmake to include directory? -// std::string precomp_fname = -// "build/generated_data/precomp_N" + std::to_string(N) + "_pcf" + std::to_string(pcf) + ".dat"; -// if (!read_inputs(precomp_bases.get(), N * pcf, precomp_fname)) { -// std::cout << "Precomputing bases." << '\n'; -// msm_precompute_bases(bases.get(), N, pcf, pc_config, precomp_bases.get()); -// store_inputs(precomp_bases.get(), N * pcf, precomp_fname); -// } - -// // int test_size = 10000; -// // std::cout << "NUm additions:\t" << test_size << '\n'; -// // scalar_t* a = new scalar_t[test_size]; -// // scalar_t* b = new scalar_t[test_size]; -// // scalar_t* apb = new scalar_t[test_size]; -// // { -// // scalar_t* bases_p = scalars.get(); -// // for (int i = 0; i < test_size; i++) -// // { -// // a[i] = bases_p[i]; -// // b[i] = bases_p[i + test_size]; -// // apb[i] = scalar_t::zero(); -// // } -// // } - -// START_TIMER(MSM_sync) -// for (int i = 0; i < iters; ++i) { -// msm(scalars.get(), precomp_bases.get(), N, config, result); -// } -// // for (int i = 0; i < test_size; i++) -// // { -// // apb[i] = a[i] * b[i]; -// // } -// END_TIMER(MSM_sync, msg, measure); -// }; - -// // run("CPU", &result_cpu_dbl_n_add, "CPU msm", false /*=measure*/, 1 /*=iters*/); // warmup -// int iters = 1; -// run("CPU", &result_cpu, "CPU msm", VERBOSE /*=measure*/, iters /*=iters*/); -// run("CPU_REF", &result_cpu_ref, "CPU_REF msm", VERBOSE /*=measure*/, iters /*=iters*/); - -// std::cout << projective_t::to_affine(result_cpu) << std::endl; -// std::cout << projective_t::to_affine(result_cpu_ref) << std::endl; -// ASSERT_EQ(result_cpu, result_cpu_ref); -// } template void MSM_test() { - const int logn = 8; + const int logn = 17; const int batch = 1; // TODO test batch const int N = 1 << logn; const int precompute_factor = 4; @@ -210,9 +122,11 @@ void get_inputs(affine_t* bases, scalar_t* scalars, const int n) // TODO add pre const int total_nof_elemets = batch * N; auto scalars = std::make_unique(total_nof_elemets); auto bases = std::make_unique(N); + std::cout << "Starting MSM\n"; - scalar_t::rand_host_many(scalars.get(), total_nof_elemets); - P::rand_host_many(bases.get(), N); + // scalar_t::rand_host_many(scalars.get(), total_nof_elemets); + // P::rand_host_many(bases.get(), N); + get_inputs(bases.get(), scalars.get(), N); auto result_main = std::make_unique(batch); auto result_ref = std::make_unique(batch); @@ -361,6 +275,7 @@ TYPED_TEST(CurveSanity, CurveSanityTest) int main(int argc, char** argv) { + std::cout << "Start\n\n"; ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } \ No newline at end of file
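
A minimal usage sketch of the TasksManager/TaskBase thread pool this patch introduces, driven the same way cpu_msm.cpp drives it: get_free_task() to claim a slot (possibly carrying a leftover result that must be consumed first) and get_completed_task() to drain remaining results, as in wait_for_idle(). It is illustrative only, under these assumptions: tasks_manager.cpp from this patch is reachable on the include path (cpu_msm.hpp includes the .cpp directly, and the sketch mirrors that), while AddTask, nof_workers, nof_jobs and the out vector are hypothetical names invented for the example and are not part of the patch.

// usage_sketch.cpp — illustrative driver for the TasksManager/TaskBase pool added in this patch.
#include <iostream>
#include <vector>
#include "tasks_manager.cpp" // mirrors cpu_msm.hpp, which includes the .cpp directly

// A trivial task type, shaped like ECaddTask: two inputs, a result, and a return index
// telling the dispatcher where the result belongs.
class AddTask : public TaskBase
{
public:
  AddTask() : TaskBase(), a(0), b(0), result(0), return_idx(-1) {}
  void execute() override { result = a + b; }

  int a, b, result;
  int return_idx;
};

int main()
{
  const int nof_workers = 4; // hypothetical value; the MSM code reads "n_threads" from config.ext
  const int nof_jobs = 1000;
  std::vector<int> out(nof_jobs, 0);

  TasksManager<AddTask> manager(nof_workers);

  // Dispatch loop. get_free_task() blocks until a slot is available; when it returns true,
  // the slot still carries the result of a previous dispatch, which the caller must consume
  // before reusing the slot — the same contract phase1_push_addition relies on for its
  // bucket-collision handling.
  for (int i = 0; i < nof_jobs; i++) {
    AddTask* task = nullptr;
    bool holds_result = manager.get_free_task(task);
    if (holds_result) out[task->return_idx] = task->result;
    task->a = i;
    task->b = 2 * i;
    task->return_idx = i;
    task->dispatch();
  }

  // Drain loop, mirroring wait_for_idle(): get_completed_task() hands back finished tasks
  // one by one and sets the pointer to nullptr once every slot is idle.
  AddTask* task = nullptr;
  manager.get_completed_task(task);
  while (task != nullptr) {
    out[task->return_idx] = task->result;
    manager.get_completed_task(task);
  }

  // Each job i computed i + 2*i, so every entry should equal 3*i.
  bool ok = true;
  for (int i = 0; i < nof_jobs; i++) ok = ok && (out[i] == 3 * i);
  std::cout << (ok ? "all results correct\n" : "mismatch found\n");
  return 0;
}

The sketch keeps the two-loop shape of the MSM code on purpose: results can surface either while claiming a free slot or during the final drain, so both paths write into out[] through return_idx rather than assuming in-order completion.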