mirror of
https://github.com/pseXperiments/icicle.git
synced 2026-01-09 15:37:58 -05:00
added framework for generic small tasks thread pool that has yet to be verified against another model (i.e. gpu)
This commit is contained in:
@@ -268,86 +268,6 @@ eIcicleError cpu_msm_single_thread(
|
||||
return eIcicleError::SUCCESS;
|
||||
}
|
||||
|
||||
template <typename Point>
|
||||
ThreadTask<Point>::ThreadTask() : return_idx(-1), p1(Point::zero()), p2(Point::zero()), result(Point::zero())
|
||||
{
|
||||
}
|
||||
|
||||
template <typename Point>
|
||||
ThreadTask<Point>::ThreadTask(
|
||||
const ThreadTask<Point>& other) // TODO delete when changing to task array instead of vector
|
||||
: return_idx(-1), p1(Point::zero()), p2(Point::zero()), result(Point::zero())
|
||||
{
|
||||
}
|
||||
|
||||
template <typename Point>
|
||||
void ThreadTask<Point>::run(int tid, std::vector<int>& idle_idxs, bool& kill_thread)
|
||||
{
|
||||
bool rdy_status = in_ready.load(std::memory_order_acquire);
|
||||
if (!rdy_status) idle_idxs.push_back(pidx); // TODO remove when finishing debugging
|
||||
while (!rdy_status) {
|
||||
std::this_thread::sleep_for(std::chrono::microseconds(10));
|
||||
rdy_status = in_ready.load(std::memory_order_acquire);
|
||||
if (kill_thread) return;
|
||||
}
|
||||
in_ready.store(false, std::memory_order_release);
|
||||
result = p1 + p2;
|
||||
out_done.store(true, std::memory_order_release);
|
||||
}
|
||||
|
||||
template <typename Point>
|
||||
void ThreadTask<Point>::new_task(const int in_idx, const Point& in_p1, const Point& in_p2)
|
||||
{
|
||||
out_done.store(false, std::memory_order_release);
|
||||
return_idx = in_idx;
|
||||
p1 = in_p1; // Copied by value to not be linked to the original bucket
|
||||
p2 = in_p2;
|
||||
in_ready.store(true, std::memory_order_release);
|
||||
}
|
||||
|
||||
template <typename Point>
|
||||
void ThreadTask<Point>::chain_task(const Point in_p2)
|
||||
{
|
||||
// std::unique_lock<std::mutex> temp_lock(idle_mtx);
|
||||
out_done.store(false, std::memory_order_release);
|
||||
p2 = in_p2;
|
||||
p1 = result;
|
||||
in_ready.store(true, std::memory_order_release);
|
||||
// idle.notify_one();
|
||||
}
|
||||
|
||||
template <typename Point>
|
||||
WorkThread<Point>::~WorkThread()
|
||||
{
|
||||
kill_thread = true;
|
||||
thread.join();
|
||||
}
|
||||
|
||||
template <typename Point>
|
||||
void WorkThread<Point>::thread_setup(const int tid, const int task_per_thread)
|
||||
{
|
||||
this->tid = tid;
|
||||
for (int j = 0; j < task_per_thread; j++)
|
||||
tasks.push_back(ThreadTask<Point>()); // TODO change to array
|
||||
thread = std::thread(
|
||||
&WorkThread<Point>::add_ec_tasks, this, std::ref(kill_thread)); // TODO kill_thread is accessible from this
|
||||
}
|
||||
|
||||
template <typename Point>
|
||||
void WorkThread<Point>::add_ec_tasks(bool& kill_thread)
|
||||
{
|
||||
while (!kill_thread) {
|
||||
int i = 0;
|
||||
for (ThreadTask<Point>& task : tasks) {
|
||||
task.run(tid, idle_idxs, kill_thread);
|
||||
#ifdef DEBUG_PRINTS
|
||||
if (tid == 2) std::cout << i << ", bkt_idx=" << tasks[task_round_robin].return_idx << "\tDone\n";
|
||||
i++;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Point>
|
||||
Point int_mult(Point p, int x)
|
||||
{
|
||||
@@ -370,14 +290,13 @@ Msm<Point>::Msm(const MSMConfig& config)
|
||||
c(config.ext->get<int>("c")), // TODO calculate instead of param
|
||||
precomp_f(config.precompute_factor), num_bms(((sca_test::NBITS - 1) / (config.precompute_factor * c)) + 1),
|
||||
are_scalars_mont(config.are_scalars_montgomery_form), are_points_mont(config.are_points_montgomery_form),
|
||||
kill_threads(false),
|
||||
manager(config.ext->get<int>("n_threads")),
|
||||
#ifdef DEBUG_PRINTS
|
||||
trace_f("trace_bucket_multi.txt"), // TODO delete
|
||||
#endif
|
||||
thread_round_robin(0), num_bkts(1 << (c - 1)),
|
||||
log_num_segments(std::max((int)std::ceil(std::log2(n_threads / num_bms)), 0)),
|
||||
num_bm_segments(1 << log_num_segments), segment_size(num_bkts >> log_num_segments),
|
||||
tasks_per_thread(std::max(((int)(num_bms / n_threads)) << (log_num_segments + 1), 2))
|
||||
num_bkts(1 << (c - 1)),
|
||||
log_num_segments(std::max((int)std::ceil(std::log2(n_threads * TASKS_PER_THREAD / (2 * num_bms))), 0)),
|
||||
num_bm_segments(1 << log_num_segments), segment_size(num_bkts >> log_num_segments)
|
||||
{
|
||||
// Phase 1
|
||||
bkts = new Point[num_bms * num_bkts];
|
||||
@@ -395,9 +314,9 @@ Msm<Point>::Msm(const MSMConfig& config)
|
||||
|
||||
bm_sums = new Point[num_bms];
|
||||
|
||||
threads = new WorkThread<Point>[n_threads];
|
||||
for (int i = 0; i < n_threads; i++)
|
||||
threads[i].thread_setup(i, tasks_per_thread);
|
||||
// threads = new WorkThread<Point>[n_threads];
|
||||
// for (int i = 0; i < n_threads; i++)
|
||||
// threads[i].thread_setup(i, tasks_per_thread);
|
||||
|
||||
#ifdef DEBUG_PRINTS
|
||||
if (!trace_f.good()) { throw std::invalid_argument("Can't open file"); } // TODO remove log
|
||||
@@ -410,12 +329,11 @@ Msm<Point>::~Msm()
|
||||
std::cout << "\n\nDestroying msm object at the end of the run\n\n"; // COMMENT why am I not seeing it one the console?
|
||||
// Isn't the destructor automatically called when
|
||||
// msm goes out of scope?
|
||||
kill_threads = true; // TODO check if it is used after kill_thread has been added to WorkThread destructor
|
||||
// for (int i = 0; i < n_threads; i++) threads[i].thread.join();
|
||||
#ifdef DEBUG_PRINTS
|
||||
trace_f.close();
|
||||
#endif
|
||||
delete[] threads;
|
||||
// delete[] threads;
|
||||
delete[] bkts;
|
||||
delete[] bkts_occupancy;
|
||||
|
||||
@@ -445,82 +363,117 @@ void Msm<Point>::wait_for_idle()
|
||||
* @param bkts_occupancy - boolean array holding the bkts_occupancy of each bucket
|
||||
* @return boolean, a flag indicating all threads are idle
|
||||
*/
|
||||
bool all_threads_idle = false;
|
||||
ECaddTask<Point, Point>* task = nullptr;
|
||||
manager.get_completed_task(task);
|
||||
int count = 0;
|
||||
while (!all_threads_idle) {
|
||||
all_threads_idle = true;
|
||||
|
||||
for (int i = 0; i < n_threads; i++) {
|
||||
int og_task_round_robin = threads[i].task_round_robin;
|
||||
|
||||
while (task != nullptr)
|
||||
{
|
||||
// std::cout << count << ":\tNew completed task.\tstatus=" << task->status << " (Idle = " << task->is_idle() << ")" << "\n";
|
||||
count++;
|
||||
if (bkts_occupancy[task->return_idx])
|
||||
{
|
||||
task->p1 = task->result;
|
||||
task->p2 = bkts[task->return_idx];
|
||||
bkts_occupancy[task->return_idx] = false;
|
||||
#ifdef DEBUG_PRINTS
|
||||
trace_f << "Finishing thread " << i << ", starting at task: " << og_task_round_robin << '\n';
|
||||
trace_f << '#' << task->return_idx << ":\tFCollision addition - bkts' cell:\t"
|
||||
<< Point::to_affine(bkts[task->return_idx]).x << "\t(With add res point:\t"
|
||||
<< Point::to_affine(task->result).x << " = " << Point::to_affine(bkts[task->return_idx] + task->result).x
|
||||
<< ")\t(" << task << ")\n";
|
||||
#endif
|
||||
|
||||
for (int j = og_task_round_robin; j < og_task_round_robin + tasks_per_thread; j++) {
|
||||
int task_idx = j;
|
||||
if (task_idx >= tasks_per_thread) task_idx -= tasks_per_thread;
|
||||
|
||||
// For readability
|
||||
ThreadTask<Point>& task = threads[i].tasks[task_idx];
|
||||
if (task.out_done.load(std::memory_order_acquire)) {
|
||||
if (task.return_idx >= 0) {
|
||||
if (bkts_occupancy[task.return_idx]) {
|
||||
#ifdef DEBUG_PRINTS
|
||||
trace_f << '#' << task.return_idx << ":\tFCollision addition - bkts' cell:\t"
|
||||
<< Point::to_affine(bkts[task.return_idx]).x << "\t(With add res point:\t"
|
||||
<< Point::to_affine(task.result).x << " = "
|
||||
<< Point::to_affine(bkts[task.return_idx] + task.result).x << ")\t(" << i << ',' << task_idx
|
||||
<< ")\n";
|
||||
#endif
|
||||
// std::cout << "\n" << i << ":\t(" << task_idx << "->" << threads[i].task_round_robin <<
|
||||
// ")\tChaining\n\n";
|
||||
bkts_occupancy[task.return_idx] = false;
|
||||
int bkt_idx = task.return_idx;
|
||||
task.return_idx = -1;
|
||||
|
||||
threads[i].tasks[threads[i].task_round_robin].new_task(bkt_idx, task.result, bkts[bkt_idx]);
|
||||
if (threads[i].task_round_robin == tasks_per_thread - 1)
|
||||
threads[i].task_round_robin = 0;
|
||||
else
|
||||
threads[i].task_round_robin++;
|
||||
|
||||
all_threads_idle = false; // This thread isn't idle due to the newly assigned task
|
||||
} else {
|
||||
bkts[task.return_idx] = task.result;
|
||||
#ifdef DEBUG_PRINTS
|
||||
trace_f << '#' << task.return_idx << ":\tFWrite (res) free cell:\t"
|
||||
<< Point::to_affine(bkts[task.return_idx]).x << "\t(" << i << ',' << task_idx << ")\n";
|
||||
#endif
|
||||
bkts_occupancy[task.return_idx] = true;
|
||||
task.return_idx = -1; // To ensure no repeated handling of outputs
|
||||
}
|
||||
} else {
|
||||
#ifdef DEBUG_PRINTS
|
||||
trace_f << "Task " << task_idx << " idle\n";
|
||||
#endif
|
||||
}
|
||||
} else {
|
||||
all_threads_idle = false;
|
||||
break;
|
||||
#ifdef DEBUG_PRINTS
|
||||
trace_f << '#' << task.return_idx << ":\t(" << i << ',' << task_idx << ") not done\tres=" << task.result
|
||||
<< "\tstatus:" << task.in_ready << ',' << task.out_done << '\n';
|
||||
#endif
|
||||
}
|
||||
}
|
||||
task->dispatch();
|
||||
}
|
||||
// std::this_thread::sleep_for(std::chrono::milliseconds(5000));
|
||||
else bkts[task->return_idx] = task->result;
|
||||
manager.get_completed_task(task);
|
||||
}
|
||||
// trace_f.flush();
|
||||
}
|
||||
|
||||
// template <typename Point>
|
||||
// void Msm<Point>::old_wait_for_idle()
|
||||
// {
|
||||
// /**
|
||||
// * Handle thread outputs and possible collisions between results and occupied bkts
|
||||
// * @param threads - working threads array
|
||||
// * @param n_threads - size of the above array
|
||||
// * @param bkts - bucket array to store results / add stored value with result
|
||||
// * @param bkts_occupancy - boolean array holding the bkts_occupancy of each bucket
|
||||
// * @return boolean, a flag indicating all threads are idle
|
||||
// */
|
||||
// bool all_threads_idle = false;
|
||||
// int count = 0;
|
||||
// while (!all_threads_idle) {
|
||||
// all_threads_idle = true;
|
||||
|
||||
// for (int i = 0; i < n_threads; i++) {
|
||||
// int og_task_round_robin = threads[i].task_round_robin;
|
||||
|
||||
// #ifdef DEBUG_PRINTS
|
||||
// trace_f << "Finishing thread " << i << ", starting at task: " << og_task_round_robin << '\n';
|
||||
// #endif
|
||||
|
||||
// for (int j = og_task_round_robin; j < og_task_round_robin + tasks_per_thread; j++) {
|
||||
// int task_idx = j;
|
||||
// if (task_idx >= tasks_per_thread) task_idx -= tasks_per_thread;
|
||||
|
||||
// // For readability
|
||||
// ThreadTask<Point>& task = threads[i].tasks[task_idx];
|
||||
// if (task.out_done.load(std::memory_order_acquire)) {
|
||||
// if (task.return_idx >= 0) {
|
||||
// if (bkts_occupancy[task.return_idx]) {
|
||||
// #ifdef DEBUG_PRINTS
|
||||
// trace_f << '#' << task.return_idx << ":\tFCollision addition - bkts' cell:\t"
|
||||
// << Point::to_affine(bkts[task.return_idx]).x << "\t(With add res point:\t"
|
||||
// << Point::to_affine(task.result).x << " = "
|
||||
// << Point::to_affine(bkts[task.return_idx] + task.result).x << ")\t(" << i << ',' << task_idx
|
||||
// << ")\n";
|
||||
// #endif
|
||||
// // std::cout << "\n" << i << ":\t(" << task_idx << "->" << threads[i].task_round_robin <<
|
||||
// // ")\tChaining\n\n";
|
||||
// bkts_occupancy[task.return_idx] = false;
|
||||
// int bkt_idx = task.return_idx;
|
||||
// task.return_idx = -1;
|
||||
|
||||
// threads[i].tasks[threads[i].task_round_robin].new_task(bkt_idx, task.result, bkts[bkt_idx]);
|
||||
// if (threads[i].task_round_robin == tasks_per_thread - 1)
|
||||
// threads[i].task_round_robin = 0;
|
||||
// else
|
||||
// threads[i].task_round_robin++;
|
||||
|
||||
// all_threads_idle = false; // This thread isn't idle due to the newly assigned task
|
||||
// } else {
|
||||
// bkts[task.return_idx] = task.result;
|
||||
// #ifdef DEBUG_PRINTS
|
||||
// trace_f << '#' << task.return_idx << ":\tFWrite (res) free cell:\t"
|
||||
// << Point::to_affine(bkts[task.return_idx]).x << "\t(" << i << ',' << task_idx << ")\n";
|
||||
// #endif
|
||||
// bkts_occupancy[task.return_idx] = true;
|
||||
// task.return_idx = -1; // To ensure no repeated handling of outputs
|
||||
// }
|
||||
// } else {
|
||||
// #ifdef DEBUG_PRINTS
|
||||
// trace_f << "Task " << task_idx << " idle\n";
|
||||
// #endif
|
||||
// }
|
||||
// } else {
|
||||
// all_threads_idle = false;
|
||||
// break;
|
||||
// #ifdef DEBUG_PRINTS
|
||||
// trace_f << '#' << task.return_idx << ":\t(" << i << ',' << task_idx << ") not done\tres=" << task.result
|
||||
// << "\tstatus:" << task.in_ready << ',' << task.out_done << '\n';
|
||||
// #endif
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// // std::this_thread::sleep_for(std::chrono::milliseconds(5000));
|
||||
// }
|
||||
// // trace_f.flush();
|
||||
// }
|
||||
|
||||
template <typename Point>
|
||||
template <typename Base>
|
||||
void Msm<Point>::phase1_push_addition(
|
||||
const unsigned int task_bkt_idx,
|
||||
const Point bkt,
|
||||
const Base& base,
|
||||
const Point& base,
|
||||
int pidx) // TODO add option of adding different types
|
||||
{
|
||||
/**
|
||||
@@ -530,72 +483,120 @@ void Msm<Point>::phase1_push_addition(
|
||||
* an overwritten without affecting the working thread
|
||||
* @param p2 - point to be added
|
||||
*/
|
||||
int count_thread_iters = 0;
|
||||
bool assigned_to_thread = false;
|
||||
while (!assigned_to_thread) {
|
||||
// For readability
|
||||
ThreadTask<Point>& task = threads[thread_round_robin].tasks[threads[thread_round_robin].task_round_robin];
|
||||
if (task.out_done.load(std::memory_order_acquire)) {
|
||||
num_additions++;
|
||||
if (task.return_idx >= 0) {
|
||||
if (bkts_occupancy[task.return_idx]) {
|
||||
ECaddTask<Point, Point>* task = nullptr; // TODO actually use the 2 types according to the phase and remove templates
|
||||
while (task == nullptr)
|
||||
{
|
||||
bool holds_result = manager.get_free_task(task);
|
||||
if (holds_result)
|
||||
{
|
||||
if (bkts_occupancy[task->return_idx])
|
||||
{
|
||||
task->p1 = task->result;
|
||||
task->p2 = bkts[task->return_idx];
|
||||
bkts_occupancy[task->return_idx] = false;
|
||||
#ifdef DEBUG_PRINTS
|
||||
trace_f << '#' << task.return_idx << ":\tCollision addition - bkts' cell:\t"
|
||||
<< Point::to_affine(bkts[task.return_idx]).x << "\t(With add res point:\t"
|
||||
<< Point::to_affine(task.result).x << " = " << Point::to_affine(bkts[task.return_idx] + task.result).x
|
||||
<< ")\t(" << thread_round_robin << ',' << threads[thread_round_robin].task_round_robin << ")\n";
|
||||
trace_f << '#' << task->return_idx << ":\tCollision addition - bkts' cell:\t"
|
||||
<< Point::to_affine(bkts[task->return_idx]).x << "\t(With add res point:\t"
|
||||
<< Point::to_affine(task->result).x << " = " << Point::to_affine(bkts[task->return_idx] + task->result).x
|
||||
<< ")\t(" << task << ")\n";
|
||||
#endif
|
||||
bkts_occupancy[task.return_idx] = false;
|
||||
;
|
||||
task.pidx = pidx;
|
||||
task.chain_task(bkts[task.return_idx]);
|
||||
} else {
|
||||
#ifdef DEBUG_PRINTS
|
||||
trace_f << '#' << task.return_idx << ":\tWrite (res) free cell:\t" << Point::to_affine(task.result).x << "\t("
|
||||
<< thread_round_robin << ',' << threads[thread_round_robin].task_round_robin << ")\n";
|
||||
#endif
|
||||
bkts[task.return_idx] = task.result;
|
||||
bkts_occupancy[task.return_idx] = true;
|
||||
task.pidx = pidx;
|
||||
task.new_task(task_bkt_idx, bkt, Point::from_affine(base)); // TODO support multiple types
|
||||
assigned_to_thread = true;
|
||||
#ifdef DEBUG_PRINTS
|
||||
trace_f << '#' << task_bkt_idx << ":\tAssigned to:\t(" << thread_round_robin << ','
|
||||
<< threads[thread_round_robin].task_round_robin << ")\n";
|
||||
// trace_f.flush();
|
||||
#endif
|
||||
// break;
|
||||
}
|
||||
} else {
|
||||
task.pidx = pidx;
|
||||
task.new_task(task_bkt_idx, bkt, Point::from_affine(base)); // TODO support multiple types
|
||||
assigned_to_thread = true;
|
||||
#ifdef DEBUG_PRINTS
|
||||
trace_f << '#' << task_bkt_idx << ":\tAssigned to:\t(" << thread_round_robin << ','
|
||||
<< threads[thread_round_robin].task_round_robin << ")\n";
|
||||
// trace_f.flush();
|
||||
#endif
|
||||
// break;
|
||||
task->dispatch();
|
||||
task = nullptr;
|
||||
continue;
|
||||
}
|
||||
if (threads[thread_round_robin].task_round_robin == tasks_per_thread - 1)
|
||||
threads[thread_round_robin].task_round_robin = 0;
|
||||
else
|
||||
threads[thread_round_robin].task_round_robin++;
|
||||
}
|
||||
|
||||
// Move to next thread after checking all current thread's tasks
|
||||
if (thread_round_robin == n_threads - 1)
|
||||
thread_round_robin = 0;
|
||||
else
|
||||
thread_round_robin++;
|
||||
count_thread_iters++;
|
||||
if (count_thread_iters == n_threads) {
|
||||
count_thread_iters = 0;
|
||||
loop_count++;
|
||||
else bkts[task->return_idx] = task->result;
|
||||
}
|
||||
task->p1 = bkt;
|
||||
task->p2 = base;
|
||||
task->return_idx = task_bkt_idx;
|
||||
#ifdef DEBUG_PRINTS
|
||||
trace_f << '#' << task_bkt_idx << ":\tAssigned to:\t(" << task << ")\n";
|
||||
#endif
|
||||
task->dispatch();
|
||||
}
|
||||
}
|
||||
|
||||
// template <typename Point>
|
||||
// template <typename Base>
|
||||
// void Msm<Point>::old_phase1_push_addition(
|
||||
// const unsigned int task_bkt_idx,
|
||||
// const Point bkt,
|
||||
// const Base& base,
|
||||
// int pidx) // TODO add option of adding different types
|
||||
// {
|
||||
// /**
|
||||
// * Assign EC addition to a free thread
|
||||
// * @param task_bkt_idx - results address in the bkts array
|
||||
// * @param bkt - bkt to be added. it is passed by value to allow the appropriate cell in the bucket array to be "free"
|
||||
// * an overwritten without affecting the working thread
|
||||
// * @param p2 - point to be added
|
||||
// */
|
||||
// int count_thread_iters = 0;
|
||||
// bool assigned_to_thread = false;
|
||||
// while (!assigned_to_thread) {
|
||||
// // For readability
|
||||
// ThreadTask<Point>& task = threads[thread_round_robin].tasks[threads[thread_round_robin].task_round_robin];
|
||||
// if (task.out_done.load(std::memory_order_acquire)) {
|
||||
// num_additions++;
|
||||
// if (task.return_idx >= 0) {
|
||||
// if (bkts_occupancy[task.return_idx]) {
|
||||
// #ifdef DEBUG_PRINTS
|
||||
// trace_f << '#' << task.return_idx << ":\tCollision addition - bkts' cell:\t"
|
||||
// << Point::to_affine(bkts[task.return_idx]).x << "\t(With add res point:\t"
|
||||
// << Point::to_affine(task.result).x << " = " << Point::to_affine(bkts[task.return_idx] + task.result).x
|
||||
// << ")\t(" << thread_round_robin << ',' << threads[thread_round_robin].task_round_robin << ")\n";
|
||||
// #endif
|
||||
// bkts_occupancy[task.return_idx] = false;
|
||||
// ;
|
||||
// task.pidx = pidx;
|
||||
// task.chain_task(bkts[task.return_idx]);
|
||||
// } else {
|
||||
// #ifdef DEBUG_PRINTS
|
||||
// trace_f << '#' << task.return_idx << ":\tWrite (res) free cell:\t" << Point::to_affine(task.result).x << "\t("
|
||||
// << thread_round_robin << ',' << threads[thread_round_robin].task_round_robin << ")\n";
|
||||
// #endif
|
||||
// bkts[task.return_idx] = task.result;
|
||||
// bkts_occupancy[task.return_idx] = true;
|
||||
// task.pidx = pidx;
|
||||
// task.new_task(task_bkt_idx, bkt, Point::from_affine(base)); // TODO support multiple types
|
||||
// assigned_to_thread = true;
|
||||
// #ifdef DEBUG_PRINTS
|
||||
// trace_f << '#' << task_bkt_idx << ":\tAssigned to:\t(" << thread_round_robin << ','
|
||||
// << threads[thread_round_robin].task_round_robin << ")\n";
|
||||
// // trace_f.flush();
|
||||
// #endif
|
||||
// // break;
|
||||
// }
|
||||
// } else {
|
||||
// task.pidx = pidx;
|
||||
// task.new_task(task_bkt_idx, bkt, Point::from_affine(base)); // TODO support multiple types
|
||||
// assigned_to_thread = true;
|
||||
// #ifdef DEBUG_PRINTS
|
||||
// trace_f << '#' << task_bkt_idx << ":\tAssigned to:\t(" << thread_round_robin << ','
|
||||
// << threads[thread_round_robin].task_round_robin << ")\n";
|
||||
// // trace_f.flush();
|
||||
// #endif
|
||||
// // break;
|
||||
// }
|
||||
// if (threads[thread_round_robin].task_round_robin == tasks_per_thread - 1)
|
||||
// threads[thread_round_robin].task_round_robin = 0;
|
||||
// else
|
||||
// threads[thread_round_robin].task_round_robin++;
|
||||
// }
|
||||
|
||||
// // Move to next thread after checking all current thread's tasks
|
||||
// if (thread_round_robin == n_threads - 1)
|
||||
// thread_round_robin = 0;
|
||||
// else
|
||||
// thread_round_robin++;
|
||||
// count_thread_iters++;
|
||||
// if (count_thread_iters == n_threads) {
|
||||
// count_thread_iters = 0;
|
||||
// loop_count++;
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
template <typename Point>
|
||||
Point* Msm<Point>::bucket_accumulator(const sca_test* scalars, const aff_test* bases, const unsigned int msm_size)
|
||||
{
|
||||
@@ -612,8 +613,7 @@ Point* Msm<Point>::bucket_accumulator(const sca_test* scalars, const aff_test* b
|
||||
const int num_windows_m1 = (sca_test::NBITS - 1) / c; // +1 for ceiling than -1 for m1
|
||||
int carry = 0;
|
||||
|
||||
std::cout << "\n\nc=" << c << "\tpcf=" << precomp_f << "\tnum bms=" << num_bms << "\tntrds,tasks=" << n_threads << ','
|
||||
<< tasks_per_thread << "\n\n\n";
|
||||
std::cout << "\n\nc=" << c << "\tpcf=" << precomp_f << "\tnum bms=" << num_bms << "\tntrds,tasks=" << n_threads << "\n\n\n";
|
||||
std::cout << log_num_segments << '\n' << segment_size << "\n\n\n";
|
||||
for (int i = 0; i < msm_size; i++) {
|
||||
carry = 0;
|
||||
@@ -645,7 +645,7 @@ Point* Msm<Point>::bucket_accumulator(const sca_test* scalars, const aff_test* b
|
||||
<< Point::to_affine(bkts[bkt_idx] + (carry > 0 ? aff_test::neg(point) : point)).x << ")\n";
|
||||
// trace_f.flush();
|
||||
#endif
|
||||
phase1_push_addition<aff_test>(bkt_idx, bkts[bkt_idx], carry > 0 ? aff_test::neg(point) : point, i);
|
||||
phase1_push_addition(bkt_idx, bkts[bkt_idx], carry > 0 ? Point::from_affine(aff_test::neg(point)) : Point::from_affine(point), i);
|
||||
} else {
|
||||
bkts_occupancy[bkt_idx] = true;
|
||||
bkts[bkt_idx] = carry > 0 ? Point::neg(Point::from_affine(point)) : Point::from_affine(point);
|
||||
@@ -667,12 +667,12 @@ Point* Msm<Point>::bucket_accumulator(const sca_test* scalars, const aff_test* b
|
||||
return bkts;
|
||||
}
|
||||
|
||||
template <typename Point>
|
||||
std::tuple<int, int>
|
||||
Msm<Point>::phase2_push_addition(const unsigned int task_bkt_idx, const Point& bkt, const Point& base)
|
||||
{
|
||||
return std::make_tuple(-1, -1);
|
||||
}
|
||||
// template <typename Point>
|
||||
// std::tuple<int, int>
|
||||
// Msm<Point>::phase2_push_addition(const unsigned int task_bkt_idx, const Point& bkt, const Point& base);
|
||||
// {
|
||||
// return std::make_tuple(-1, -1);
|
||||
// }
|
||||
|
||||
template <typename Point>
|
||||
Point* Msm<Point>::bm_sum()
|
||||
@@ -705,8 +705,8 @@ Point* Msm<Point>::bm_sum()
|
||||
int assigned_thread = std::get<0>(task_assigned_to_sum[line_sum_idx]);
|
||||
int assigned_task = std::get<1>(task_assigned_to_sum[line_sum_idx]);
|
||||
if (assigned_thread >= 0) {
|
||||
while (!threads[assigned_thread].task[assigned_task].out_done.load(std::memory_order_acquire)) {
|
||||
} // TODO add sleep
|
||||
// while (!threads[assigned_thread].task[assigned_task].out_done.load(std::memory_order_acquire)) {
|
||||
// } // TODO add sleep
|
||||
// task_assigned_to_sum[]
|
||||
}
|
||||
|
||||
@@ -766,8 +766,6 @@ eIcicleError cpu_msm_ref(
|
||||
sca_test scalar = config.are_scalars_montgomery_form ? sca_test::from_montgomery(scalars[i]) : scalars[i];
|
||||
aff_test point =
|
||||
config.are_points_montgomery_form ? aff_test::from_montgomery(bases[precomp_f * i]) : bases[precomp_f * i];
|
||||
|
||||
// std::cout << scalar << "\t*\t" << point << '\n';
|
||||
res = res + scalar * Point::from_affine(point);
|
||||
}
|
||||
// results[0] = config.are_points_montgomery_form? Point::to_montgomery(res) : res;
|
||||
@@ -789,22 +787,16 @@ eIcicleError cpu_msm_precompute_bases(
|
||||
const unsigned int c = config.ext->get<int>("c");
|
||||
const unsigned int num_bms_no_precomp = (sca_test::NBITS - 1) / c + 1;
|
||||
const unsigned int shift = c * ((num_bms_no_precomp - 1) / precompute_factor + 1);
|
||||
// std::cout << "Starting precompute\n";
|
||||
// std::cout << c << ',' << shift << '\n';
|
||||
for (int i = 0; i < nof_bases; i++) {
|
||||
output_bases[precompute_factor * i] = input_bases[i]; // COMMENT Should I copy? (not by reference)
|
||||
proj_test point = proj_test::from_affine(is_mont ? A::from_montgomery(input_bases[i]) : input_bases[i]);
|
||||
// std::cout << "OG point[" << i << "]:\t" << point << '\n';
|
||||
for (int j = 1; j < precompute_factor; j++) {
|
||||
for (int k = 0; k < shift; k++) {
|
||||
point = proj_test::dbl(point);
|
||||
// std::cout << point << '\n';
|
||||
}
|
||||
// std::cout << "Shifted point[" << i << "]:\t" << point << "\nStored in index=" << (precompute_factor*i+j) <<
|
||||
// '\n';
|
||||
output_bases[precompute_factor * i + j] = is_mont
|
||||
? A::to_montgomery(proj_test::to_affine(point))
|
||||
: proj_test::to_affine(point); // TODO change here order of precomp
|
||||
: proj_test::to_affine(point);
|
||||
}
|
||||
}
|
||||
return eIcicleError::SUCCESS;
|
||||
|
||||
@@ -18,6 +18,8 @@ using namespace icicle;
|
||||
#include "icicle/curves/projective.h"
|
||||
#include "icicle/curves/curve_config.h"
|
||||
|
||||
#include "tasks_manager.cpp"
|
||||
|
||||
using aff_test = affine_t;
|
||||
using proj_test = projective_t;
|
||||
using sca_test = scalar_t;
|
||||
@@ -25,164 +27,7 @@ using sca_test = scalar_t;
|
||||
#include <iostream>
|
||||
#include <random>
|
||||
|
||||
struct MSMConfig {
|
||||
int nof_bases; /**< Number of bases in the MSM for batched MSM. Set to 0 if all MSMs use the same bases or set to
|
||||
* 'batch X #scalars' otherwise. Default value: 0 (that is reuse bases for all batch elements). */
|
||||
int precompute_factor; /**< The number of extra points to pre-compute for each point. See the
|
||||
* [precompute_msm_bases](@ref precompute_msm_bases) function, `precompute_factor` passed
|
||||
* there needs to be equal to the one used here. Larger values decrease the
|
||||
* number of computations to make, on-line memory footprint, but increase the static
|
||||
* memory footprint. Default value: 1 (i.e. don't pre-compute). */
|
||||
int batch_size; /**< The number of MSMs to compute. Default value: 1. */
|
||||
bool are_scalars_on_device; /**< True if scalars are on device and false if they're on host. Default value:
|
||||
* false. */
|
||||
bool are_scalars_montgomery_form; /**< True if scalars are in Montgomery form and false otherwise. Default value:
|
||||
* true. */
|
||||
bool are_points_on_device; /**< True if points are on device and false if they're on host. Default value: false. */
|
||||
bool are_points_montgomery_form; /**< True if coordinates of points are in Montgomery form and false otherwise.
|
||||
* Default value: true. */
|
||||
bool are_results_on_device; /**< True if the results should be on device and false if they should be on host. If set
|
||||
* to false, `is_async` won't take effect because a synchronization is needed to
|
||||
* transfer results to the host. Default value: false. */
|
||||
bool is_async; /**< Whether to run the MSM asynchronously. If set to true, the MSM function will be
|
||||
* non-blocking and you'd need to synchronize it explicitly by running
|
||||
* `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the MSM
|
||||
* function will block the current CPU thread. */
|
||||
|
||||
ConfigExtension ext; /** backend specific extensions*/
|
||||
};
|
||||
|
||||
struct MsmPreComputeConfig {
|
||||
bool is_input_on_device;
|
||||
bool is_output_on_device;
|
||||
bool is_async;
|
||||
|
||||
ConfigExtension ext; /** backend specific extensions*/
|
||||
};
|
||||
|
||||
#define P_MACRO 1000
|
||||
|
||||
struct Device {
|
||||
};
|
||||
|
||||
class Dummy_Scalar
|
||||
{
|
||||
public:
|
||||
static constexpr unsigned NBITS = 32;
|
||||
|
||||
unsigned x;
|
||||
unsigned p = P_MACRO;
|
||||
// unsigned p = 1<<30;
|
||||
|
||||
static Dummy_Scalar zero() { return {0}; }
|
||||
|
||||
static Dummy_Scalar one() { return {1}; }
|
||||
|
||||
static Dummy_Scalar to_montgomery(const Dummy_Scalar& s) { return {s.x}; }
|
||||
|
||||
static Dummy_Scalar from_montgomery(const Dummy_Scalar& s) { return {s.x}; }
|
||||
|
||||
friend std::ostream& operator<<(std::ostream& os, const Dummy_Scalar& scalar)
|
||||
{
|
||||
os << scalar.x;
|
||||
return os;
|
||||
}
|
||||
|
||||
unsigned get_scalar_digit(unsigned digit_num, unsigned digit_width) const
|
||||
{
|
||||
return (x >> (digit_num * digit_width)) & ((1 << digit_width) - 1);
|
||||
}
|
||||
|
||||
friend Dummy_Scalar operator+(Dummy_Scalar p1, const Dummy_Scalar& p2) { return {(p1.x + p2.x) % p1.p}; }
|
||||
|
||||
friend Dummy_Scalar operator-(Dummy_Scalar p1, const Dummy_Scalar& p2) { return p1 + neg(p2); }
|
||||
|
||||
friend bool operator==(const Dummy_Scalar& p1, const Dummy_Scalar& p2) { return (p1.x == p2.x); }
|
||||
|
||||
friend bool operator==(const Dummy_Scalar& p1, const unsigned p2) { return (p1.x == p2); }
|
||||
|
||||
static Dummy_Scalar neg(const Dummy_Scalar& scalar) { return {scalar.p - scalar.x}; }
|
||||
static Dummy_Scalar rand_host(std::mt19937_64& rand_generator)
|
||||
{
|
||||
// return {(unsigned)rand() % P_MACRO};
|
||||
std::uniform_int_distribution<unsigned> distribution(0, P_MACRO - 1);
|
||||
return {distribution(rand_generator)};
|
||||
}
|
||||
|
||||
static void rand_host_many(Dummy_Scalar* out, int size, std::mt19937_64& rand_generator)
|
||||
{
|
||||
for (int i = 0; i < size; i++)
|
||||
// out[i] = (i % size < 100) ? rand_host(rand_generator) : out[i - 100];
|
||||
out[i] = rand_host(rand_generator);
|
||||
}
|
||||
};
|
||||
|
||||
class Dummy_Projective
|
||||
{
|
||||
public:
|
||||
Dummy_Scalar x;
|
||||
|
||||
static Dummy_Projective zero() { return {0}; }
|
||||
|
||||
static Dummy_Projective one() { return {1}; }
|
||||
|
||||
static Dummy_Projective to_affine(const Dummy_Projective& point) { return {point.x}; }
|
||||
|
||||
static Dummy_Projective from_affine(const Dummy_Projective& point) { return {point.x}; }
|
||||
|
||||
static Dummy_Projective to_montgomery(const Dummy_Projective& point) { return {point.x}; }
|
||||
|
||||
static Dummy_Projective from_montgomery(const Dummy_Projective& point) { return {point.x}; }
|
||||
|
||||
static Dummy_Projective neg(const Dummy_Projective& point) { return {Dummy_Scalar::neg(point.x)}; }
|
||||
|
||||
static Dummy_Projective copy(const Dummy_Projective& point) { return {point.x}; }
|
||||
|
||||
friend Dummy_Projective operator+(Dummy_Projective p1, const Dummy_Projective& p2) { return {p1.x + p2.x}; }
|
||||
|
||||
friend Dummy_Projective operator-(Dummy_Projective p1, const Dummy_Projective& p2) { return {p1.x - p2.x}; }
|
||||
|
||||
static Dummy_Projective dbl(const Dummy_Projective& point) { return {point.x + point.x}; }
|
||||
|
||||
// friend Dummy_Projective operator-(Dummy_Projective p1, const Dummy_Projective& p2) {
|
||||
// return p1 + neg(p2);
|
||||
// }
|
||||
|
||||
friend std::ostream& operator<<(std::ostream& os, const Dummy_Projective& point)
|
||||
{
|
||||
os << point.x;
|
||||
return os;
|
||||
}
|
||||
|
||||
friend Dummy_Projective operator*(Dummy_Scalar scalar, const Dummy_Projective& point)
|
||||
{
|
||||
Dummy_Projective res = zero();
|
||||
#ifdef CUDA_ARCH
|
||||
UNROLL
|
||||
#endif
|
||||
for (int i = 0; i < Dummy_Scalar::NBITS; i++) {
|
||||
if (i > 0) { res = res + res; }
|
||||
if (scalar.get_scalar_digit(Dummy_Scalar::NBITS - i - 1, 1)) { res = res + point; }
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
friend bool operator==(const Dummy_Projective& p1, const Dummy_Projective& p2) { return (p1.x == p2.x); }
|
||||
|
||||
static bool is_zero(const Dummy_Projective& point) { return point.x == 0; }
|
||||
|
||||
static Dummy_Projective rand_host(std::mt19937_64& rand_generator)
|
||||
{
|
||||
return {(unsigned)rand() % P_MACRO};
|
||||
// return {(unsigned)rand()};
|
||||
}
|
||||
|
||||
static void rand_host_many_affine(Dummy_Projective* out, int size, std::mt19937_64& rand_generator)
|
||||
{
|
||||
for (int i = 0; i < size; i++)
|
||||
out[i] = (i % size < 100) ? to_affine(rand_host(rand_generator)) : out[i - 100];
|
||||
}
|
||||
};
|
||||
#include "dummy_classes.cpp"
|
||||
|
||||
using aff_test = Dummy_Projective;
|
||||
using proj_test = Dummy_Projective;
|
||||
@@ -196,47 +41,18 @@ using sca_test = Dummy_Scalar;
|
||||
|
||||
#ifndef STANDALONE
|
||||
using namespace curve_config;
|
||||
using namespace icicle;
|
||||
#endif
|
||||
|
||||
template <typename Point> // TODO add support for two different point types
|
||||
class ThreadTask
|
||||
{ // TODO turn to class and check which members can be private (or at least function by getter/setter)
|
||||
template<typename Point, typename AddedPoint>
|
||||
class ECaddTask : public TaskBase
|
||||
{
|
||||
public:
|
||||
std::atomic<bool> in_ready{false};
|
||||
ECaddTask() : TaskBase(), p1(Point::zero()), p2(AddedPoint::zero()), result(Point::zero()), return_idx(-1) {}
|
||||
virtual void execute() { result = p1 + p2; }
|
||||
|
||||
Point p1, result; // TODO result will be stored in p1 and support two point types
|
||||
AddedPoint p2;
|
||||
int return_idx;
|
||||
int pidx = 0; // TODO remove after debugging done
|
||||
Point p1;
|
||||
Point p2; // TODO allow different types of points to be added
|
||||
Point result; // TODO Remove and result will be stored in p1
|
||||
std::atomic<bool> out_done{true};
|
||||
|
||||
ThreadTask();
|
||||
ThreadTask(const ThreadTask<Point>& other);
|
||||
void run(int tid, std::vector<int>& idle_idxs, bool& kill_thread);
|
||||
void new_task(const int in_idx, const Point& in_p1, const Point& in_p2);
|
||||
void chain_task(const Point in_p2);
|
||||
// TODO implement below to make class members private
|
||||
inline bool set_ready();
|
||||
inline bool is_done();
|
||||
inline Point get_result();
|
||||
inline void set_idle();
|
||||
};
|
||||
|
||||
template <typename Point>
|
||||
struct WorkThread { // TODO turn to class and check which members can be private (or at least function by getter/setter)
|
||||
int tid;
|
||||
std::vector<int> idle_idxs;
|
||||
std::thread thread;
|
||||
int task_round_robin = 0;
|
||||
std::vector<ThreadTask<Point>> tasks;
|
||||
std::condition_variable idle;
|
||||
std::mutex idle_mtx;
|
||||
bool kill_thread = false; // TODO rethink kill thread as phase already allows breaking
|
||||
|
||||
~WorkThread();
|
||||
void thread_setup(const int tid, const int task_per_thread);
|
||||
void add_ec_tasks(bool& kill_thread);
|
||||
};
|
||||
|
||||
template <typename Point>
|
||||
@@ -244,14 +60,8 @@ class Msm
|
||||
{
|
||||
private:
|
||||
// std::vector<WorkThread<Point>> threads;
|
||||
WorkThread<Point>* threads;
|
||||
TasksManager<ECaddTask<Point, Point>> manager;
|
||||
const unsigned int n_threads;
|
||||
const unsigned int tasks_per_thread; // TODO check effect of the num of tasks one p1 performance and maybe have
|
||||
// separate tasks_per_thread values for p1 and p2
|
||||
int thread_round_robin;
|
||||
bool any_thread_free;
|
||||
|
||||
bool kill_threads;
|
||||
|
||||
const unsigned int c;
|
||||
const unsigned int num_bkts;
|
||||
@@ -282,6 +92,7 @@ private:
|
||||
std::ofstream trace_f;
|
||||
|
||||
void wait_for_idle();
|
||||
// void old_wait_for_idle();
|
||||
|
||||
// template <typename Base>
|
||||
// void push_addition( const unsigned int task_bkt_idx,
|
||||
@@ -291,8 +102,10 @@ private:
|
||||
// Point* result_arr,
|
||||
// bool* );
|
||||
|
||||
void phase1_push_addition(const unsigned int task_bkt_idx, const Point bkt, const Point& base, int pidx);
|
||||
|
||||
template <typename Base>
|
||||
void phase1_push_addition(const unsigned int task_bkt_idx, const Point bkt, const Base& base, int pidx);
|
||||
// void old_phase1_push_addition(const unsigned int task_bkt_idx, const Point bkt, const Base& base, int pidx);
|
||||
|
||||
std::tuple<int, int> phase2_push_addition(const unsigned int task_bkt_idx, const Point& bkt, const Point& base);
|
||||
|
||||
|
||||
162
icicle_v3/backend/cpu/src/curve/dummy_classes.cpp
Normal file
162
icicle_v3/backend/cpu/src/curve/dummy_classes.cpp
Normal file
@@ -0,0 +1,162 @@
|
||||
#include "icicle/errors.h"
|
||||
#include "icicle/config_extension.h"
|
||||
using namespace icicle;
|
||||
|
||||
struct MSMConfig {
|
||||
int nof_bases; /**< Number of bases in the MSM for batched MSM. Set to 0 if all MSMs use the same bases or set to
|
||||
* 'batch X #scalars' otherwise. Default value: 0 (that is reuse bases for all batch elements). */
|
||||
int precompute_factor; /**< The number of extra points to pre-compute for each point. See the
|
||||
* [precompute_msm_bases](@ref precompute_msm_bases) function, `precompute_factor` passed
|
||||
* there needs to be equal to the one used here. Larger values decrease the
|
||||
* number of computations to make, on-line memory footprint, but increase the static
|
||||
* memory footprint. Default value: 1 (i.e. don't pre-compute). */
|
||||
int batch_size; /**< The number of MSMs to compute. Default value: 1. */
|
||||
bool are_scalars_on_device; /**< True if scalars are on device and false if they're on host. Default value:
|
||||
* false. */
|
||||
bool are_scalars_montgomery_form; /**< True if scalars are in Montgomery form and false otherwise. Default value:
|
||||
* true. */
|
||||
bool are_points_on_device; /**< True if points are on device and false if they're on host. Default value: false. */
|
||||
bool are_points_montgomery_form; /**< True if coordinates of points are in Montgomery form and false otherwise.
|
||||
* Default value: true. */
|
||||
bool are_results_on_device; /**< True if the results should be on device and false if they should be on host. If set
|
||||
* to false, `is_async` won't take effect because a synchronization is needed to
|
||||
* transfer results to the host. Default value: false. */
|
||||
bool is_async; /**< Whether to run the MSM asynchronously. If set to true, the MSM function will be
|
||||
* non-blocking and you'd need to synchronize it explicitly by running
|
||||
* `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the MSM
|
||||
* function will block the current CPU thread. */
|
||||
|
||||
ConfigExtension ext; /** backend specific extensions*/
|
||||
};
|
||||
|
||||
struct MsmPreComputeConfig {
|
||||
bool is_input_on_device;
|
||||
bool is_output_on_device;
|
||||
bool is_async;
|
||||
|
||||
ConfigExtension ext; /** backend specific extensions*/
|
||||
};
|
||||
|
||||
#define P_MACRO 1000
|
||||
|
||||
struct Device {
|
||||
};
|
||||
|
||||
class Dummy_Scalar
|
||||
{
|
||||
public:
|
||||
static constexpr unsigned NBITS = 32;
|
||||
|
||||
unsigned x;
|
||||
unsigned p = P_MACRO;
|
||||
// unsigned p = 1<<30;
|
||||
|
||||
static Dummy_Scalar zero() { return {0}; }
|
||||
|
||||
static Dummy_Scalar one() { return {1}; }
|
||||
|
||||
static Dummy_Scalar to_montgomery(const Dummy_Scalar& s) { return {s.x}; }
|
||||
|
||||
static Dummy_Scalar from_montgomery(const Dummy_Scalar& s) { return {s.x}; }
|
||||
|
||||
friend std::ostream& operator<<(std::ostream& os, const Dummy_Scalar& scalar)
|
||||
{
|
||||
os << scalar.x;
|
||||
return os;
|
||||
}
|
||||
|
||||
unsigned get_scalar_digit(unsigned digit_num, unsigned digit_width) const
|
||||
{
|
||||
return (x >> (digit_num * digit_width)) & ((1 << digit_width) - 1);
|
||||
}
|
||||
|
||||
friend Dummy_Scalar operator+(Dummy_Scalar p1, const Dummy_Scalar& p2) { return {(p1.x + p2.x) % p1.p}; }
|
||||
|
||||
friend Dummy_Scalar operator-(Dummy_Scalar p1, const Dummy_Scalar& p2) { return p1 + neg(p2); }
|
||||
|
||||
friend bool operator==(const Dummy_Scalar& p1, const Dummy_Scalar& p2) { return (p1.x == p2.x); }
|
||||
|
||||
friend bool operator==(const Dummy_Scalar& p1, const unsigned p2) { return (p1.x == p2); }
|
||||
|
||||
static Dummy_Scalar neg(const Dummy_Scalar& scalar) { return {scalar.p - scalar.x}; }
|
||||
static Dummy_Scalar rand_host(std::mt19937_64& rand_generator)
|
||||
{
|
||||
// return {(unsigned)rand() % P_MACRO};
|
||||
std::uniform_int_distribution<unsigned> distribution(0, P_MACRO - 1);
|
||||
return {distribution(rand_generator)};
|
||||
}
|
||||
|
||||
static void rand_host_many(Dummy_Scalar* out, int size, std::mt19937_64& rand_generator)
|
||||
{
|
||||
for (int i = 0; i < size; i++)
|
||||
// out[i] = (i % size < 100) ? rand_host(rand_generator) : out[i - 100];
|
||||
out[i] = rand_host(rand_generator);
|
||||
}
|
||||
};
|
||||
|
||||
class Dummy_Projective
|
||||
{
|
||||
public:
|
||||
Dummy_Scalar x;
|
||||
|
||||
static Dummy_Projective zero() { return {0}; }
|
||||
|
||||
static Dummy_Projective one() { return {1}; }
|
||||
|
||||
static Dummy_Projective to_affine(const Dummy_Projective& point) { return {point.x}; }
|
||||
|
||||
static Dummy_Projective from_affine(const Dummy_Projective& point) { return {point.x}; }
|
||||
|
||||
static Dummy_Projective to_montgomery(const Dummy_Projective& point) { return {point.x}; }
|
||||
|
||||
static Dummy_Projective from_montgomery(const Dummy_Projective& point) { return {point.x}; }
|
||||
|
||||
static Dummy_Projective neg(const Dummy_Projective& point) { return {Dummy_Scalar::neg(point.x)}; }
|
||||
|
||||
static Dummy_Projective copy(const Dummy_Projective& point) { return {point.x}; }
|
||||
|
||||
friend Dummy_Projective operator+(Dummy_Projective p1, const Dummy_Projective& p2) { return {p1.x + p2.x}; }
|
||||
|
||||
friend Dummy_Projective operator-(Dummy_Projective p1, const Dummy_Projective& p2) { return {p1.x - p2.x}; }
|
||||
|
||||
static Dummy_Projective dbl(const Dummy_Projective& point) { return {point.x + point.x}; }
|
||||
|
||||
// friend Dummy_Projective operator-(Dummy_Projective p1, const Dummy_Projective& p2) {
|
||||
// return p1 + neg(p2);
|
||||
// }
|
||||
|
||||
friend std::ostream& operator<<(std::ostream& os, const Dummy_Projective& point)
|
||||
{
|
||||
os << point.x;
|
||||
return os;
|
||||
}
|
||||
|
||||
friend Dummy_Projective operator*(Dummy_Scalar scalar, const Dummy_Projective& point)
|
||||
{
|
||||
Dummy_Projective res = zero();
|
||||
#ifdef CUDA_ARCH
|
||||
UNROLL
|
||||
#endif
|
||||
for (int i = 0; i < Dummy_Scalar::NBITS; i++) {
|
||||
if (i > 0) { res = res + res; }
|
||||
if (scalar.get_scalar_digit(Dummy_Scalar::NBITS - i - 1, 1)) { res = res + point; }
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
friend bool operator==(const Dummy_Projective& p1, const Dummy_Projective& p2) { return (p1.x == p2.x); }
|
||||
|
||||
static bool is_zero(const Dummy_Projective& point) { return point.x == 0; }
|
||||
|
||||
static Dummy_Projective rand_host(std::mt19937_64& rand_generator)
|
||||
{
|
||||
return {(unsigned)rand() % P_MACRO};
|
||||
// return {(unsigned)rand()};
|
||||
}
|
||||
|
||||
static void rand_host_many_affine(Dummy_Projective* out, int size, std::mt19937_64& rand_generator)
|
||||
{
|
||||
for (int i = 0; i < size; i++)
|
||||
out[i] = (i % size < 100) ? to_affine(rand_host(rand_generator)) : out[i - 100];
|
||||
}
|
||||
};
|
||||
178
icicle_v3/backend/cpu/src/curve/tasks_manager.cpp
Normal file
178
icicle_v3/backend/cpu/src/curve/tasks_manager.cpp
Normal file
@@ -0,0 +1,178 @@
|
||||
#include "tasks_manager.hpp"
|
||||
#include <cassert>
|
||||
#include <stdexcept>
|
||||
|
||||
#define LOG_TASKS_PER_THREAD 2
|
||||
#define TASKS_PER_THREAD (1 << LOG_TASKS_PER_THREAD)
|
||||
#define TASK_IDX_MASK (TASKS_PER_THREAD - 1)
|
||||
#define MANAGER_SLEEP_USEC 10
|
||||
#define THREAD_SLEEP_USEC 1
|
||||
|
||||
inline bool TaskBase::is_ready_for_work() {
|
||||
return status.load(std::memory_order_acquire) == dispatched;
|
||||
}
|
||||
|
||||
inline bool TaskBase::can_push_task() {
|
||||
TaskStatus curr_status = status.load(std::memory_order_acquire);
|
||||
return curr_status == pending_result || curr_status == idle;
|
||||
}
|
||||
|
||||
inline bool TaskBase::has_result() {
|
||||
return status.load(std::memory_order_acquire) == pending_result;
|
||||
}
|
||||
|
||||
inline bool TaskBase::is_idle() {
|
||||
return status.load(std::memory_order_acquire) == idle;
|
||||
}
|
||||
|
||||
void TaskBase::dispatch() {
|
||||
assert(can_push_task());
|
||||
(*father_fifo_tail)++;
|
||||
status.store(dispatched, std::memory_order_release);
|
||||
}
|
||||
|
||||
inline void TaskBase::set_working() {
|
||||
assert(is_ready_for_work());
|
||||
status.store(working, std::memory_order_release);
|
||||
}
|
||||
|
||||
inline void TaskBase::set_pending_result() {
|
||||
assert(status.load(std::memory_order_acquire) == working);
|
||||
status.store(pending_result, std::memory_order_release);
|
||||
}
|
||||
|
||||
inline void TaskBase::set_handled_result() {
|
||||
assert(has_result());
|
||||
status.store(idle, std::memory_order_release);
|
||||
}
|
||||
|
||||
void TaskBase::wait_done() {
|
||||
while (has_result()) {
|
||||
std::this_thread::sleep_for(std::chrono::microseconds(MANAGER_SLEEP_USEC));
|
||||
}
|
||||
}
|
||||
|
||||
template<class Task>
|
||||
TasksManager<Task>::Worker::Worker()
|
||||
: mTasksFifo(TASKS_PER_THREAD),
|
||||
tail(0),
|
||||
head(0),
|
||||
kill(false)
|
||||
{
|
||||
task_executor = std::thread(&TasksManager<Task>::Worker::run, this);
|
||||
}
|
||||
|
||||
template<class Task>
|
||||
TasksManager<Task>::Worker::~Worker() {
|
||||
kill = true;
|
||||
task_executor.join();
|
||||
}
|
||||
|
||||
template<class Task>
|
||||
void TasksManager<Task>::Worker::run() {
|
||||
while (true) {
|
||||
for (head = 0; head < mTasksFifo.size(); head++)
|
||||
{
|
||||
Task* task = &mTasksFifo[head];
|
||||
if (!task->is_ready_for_work()) {
|
||||
std::this_thread::sleep_for(std::chrono::microseconds(THREAD_SLEEP_USEC));
|
||||
continue;
|
||||
}
|
||||
task->set_working();
|
||||
task->execute();
|
||||
task->set_pending_result();
|
||||
}
|
||||
if (kill) return;
|
||||
}
|
||||
}
|
||||
|
||||
template<class Task>
|
||||
bool TasksManager<Task>::Worker::get_free_task(Task*& task_slot) {
|
||||
for (int i = 0; i < mTasksFifo.size(); i++)
|
||||
{
|
||||
int tail_adjusted_idx = (i + tail) & TASK_IDX_MASK; // TODO check % is optimized when base is power of 2
|
||||
|
||||
if (mTasksFifo[tail_adjusted_idx].can_push_task()) // Either idle or pending_result are valid
|
||||
{
|
||||
task_slot = &mTasksFifo[tail_adjusted_idx];
|
||||
return task_slot->has_result();
|
||||
}
|
||||
}
|
||||
task_slot = nullptr;
|
||||
return false;
|
||||
}
|
||||
|
||||
template<class Task>
|
||||
void TasksManager<Task>::Worker::get_completed_task(Task*& task_slot) {
|
||||
for (int i = 0; i < mTasksFifo.size(); i++)
|
||||
{
|
||||
int tail_adjusted_idx = (i + tail) & TASK_IDX_MASK;
|
||||
|
||||
if (mTasksFifo[tail_adjusted_idx].has_result())
|
||||
{
|
||||
task_slot = &mTasksFifo[tail_adjusted_idx];
|
||||
return;
|
||||
}
|
||||
}
|
||||
task_slot = nullptr;
|
||||
}
|
||||
|
||||
template<class Task>
|
||||
bool TasksManager<Task>::Worker::are_all_idle()
|
||||
{
|
||||
bool all_tasks_idle = true;
|
||||
for (Task& task : mTasksFifo) all_tasks_idle = all_tasks_idle && task.is_idle();
|
||||
return all_tasks_idle;
|
||||
}
|
||||
|
||||
template<class Task>
|
||||
TasksManager<Task>::TasksManager(const int nof_workers)
|
||||
: workers(nof_workers)
|
||||
{
|
||||
// Link tasks with their fifo idx
|
||||
for (Worker& worker : workers)
|
||||
{
|
||||
for (Task& task : worker.mTasksFifo)
|
||||
{
|
||||
task.father_fifo_tail = &worker.tail;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<class Task>
|
||||
bool TasksManager<Task>::get_free_task(Task*& task_slot) {
|
||||
bool has_task = false;
|
||||
do
|
||||
{
|
||||
for (Worker& worker : workers)
|
||||
{
|
||||
has_task = worker.get_free_task(task_slot);
|
||||
if (task_slot != nullptr) break;
|
||||
}
|
||||
} while (task_slot == nullptr);
|
||||
if (has_task) task_slot->set_handled_result();
|
||||
return has_task;
|
||||
}
|
||||
|
||||
template<class Task>
|
||||
void TasksManager<Task>::get_completed_task(Task*& completed_task) {
|
||||
completed_task = nullptr;
|
||||
bool all_tasks_idle = true;
|
||||
do
|
||||
{
|
||||
all_tasks_idle = true;
|
||||
for (Worker& worker : workers)
|
||||
{
|
||||
worker.get_completed_task(completed_task);
|
||||
if (completed_task != nullptr)
|
||||
{
|
||||
completed_task->set_handled_result();
|
||||
return;
|
||||
}
|
||||
|
||||
all_tasks_idle = all_tasks_idle && worker.are_all_idle();
|
||||
}
|
||||
} while (!all_tasks_idle);
|
||||
// No with_taskd tasks were found in the loop - no complete tasks left to be handled
|
||||
completed_task = nullptr;
|
||||
}
|
||||
66
icicle_v3/backend/cpu/src/curve/tasks_manager.hpp
Normal file
66
icicle_v3/backend/cpu/src/curve/tasks_manager.hpp
Normal file
@@ -0,0 +1,66 @@
|
||||
#ifndef TASKS_MANAGER
|
||||
#define TASKS_MANAGER
|
||||
#include <atomic>
|
||||
#include <thread> // TODO check windows support
|
||||
|
||||
template<class Task>
|
||||
class TasksManager {
|
||||
public:
|
||||
// constructor
|
||||
TasksManager(const int nof_workers);
|
||||
|
||||
// Get a task that isn't occupied
|
||||
// return value signals if there is a valid result to be handled before dispatching a new calculation
|
||||
// Blocking functions
|
||||
bool get_free_task(Task*& task_slot);
|
||||
void get_completed_task(Task*& completed_task); // NOTE the user must handle this task's result before asking for new free tasks
|
||||
private:
|
||||
class Worker {
|
||||
public:
|
||||
Worker();
|
||||
~Worker();
|
||||
void run();
|
||||
// Get a task that isn't occupied
|
||||
// return value signals if there is a valid result to be handled before dispatching a new calculation
|
||||
bool get_free_task(Task*& task_slot);
|
||||
void get_completed_task(Task*& completed_task); // NOTE the user must handle this task's result before asking for new free tasks
|
||||
bool are_all_idle();
|
||||
private:
|
||||
std::thread task_executor;
|
||||
std::vector<Task> mTasksFifo;
|
||||
int tail;
|
||||
int head;
|
||||
bool kill;
|
||||
|
||||
friend TasksManager<Task>::TasksManager(const int nof_workers);
|
||||
};
|
||||
|
||||
std::vector<Worker> workers;
|
||||
};
|
||||
|
||||
class TaskBase {
|
||||
public:
|
||||
TaskBase() : status(idle), father_fifo_tail(nullptr) {}
|
||||
virtual void execute() = 0; // COMMENT should it be private? still needs to be friend of worker
|
||||
void dispatch(); // USER FUNC
|
||||
|
||||
inline bool is_ready_for_work(); // TODO friend functions of worker
|
||||
inline bool can_push_task();
|
||||
inline bool has_result();
|
||||
inline bool is_idle();
|
||||
inline void set_working();
|
||||
inline void set_pending_result();
|
||||
inline void set_handled_result();
|
||||
// Blocking function
|
||||
void wait_done(); // NOTE the user must handle this task's result before asking for new free tasks
|
||||
|
||||
protected:
|
||||
enum TaskStatus {idle, set_task, dispatched, working, pending_result}; // TODO CAPS and eTaskStatus
|
||||
std::atomic<TaskStatus> status;
|
||||
private:
|
||||
int* father_fifo_tail;
|
||||
|
||||
template <class Task>
|
||||
friend TasksManager<Task>::TasksManager(const int nof_workers);
|
||||
};
|
||||
#endif
|
||||
@@ -77,131 +77,43 @@ bool read_inputs(T* arr, const int arr_size, const std::string fname)
|
||||
return status;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void store_inputs(T* arr, const int arr_size, const std::string fname)
|
||||
{
|
||||
std::ofstream out_file(fname);
|
||||
if (!out_file.is_open()) {
|
||||
std::cerr << "Failed to open " << fname << " for writing.\n";
|
||||
return;
|
||||
template <typename T>
|
||||
void store_inputs(T* arr, const int arr_size, const std::string fname)
|
||||
{
|
||||
std::ofstream out_file(fname);
|
||||
if (!out_file.is_open()) {
|
||||
std::cerr << "Failed to open " << fname << " for writing.\n";
|
||||
return;
|
||||
}
|
||||
for (int i = 0; i < arr_size; i++) {
|
||||
out_file.write(reinterpret_cast<char*>(&arr[i]), sizeof(T));
|
||||
}
|
||||
out_file.close();
|
||||
}
|
||||
for (int i = 0; i < arr_size; i++) {
|
||||
out_file.write(reinterpret_cast<char*>(&arr[i]), sizeof(T));
|
||||
|
||||
void get_inputs(affine_t* bases, scalar_t* scalars, const int n) // TODO add precompute factor
|
||||
{
|
||||
// Scalars
|
||||
std::string scalar_file = "build/generated_data/scalars_N" + std::to_string(n) + ".dat";
|
||||
if (!read_inputs<scalar_t>(scalars, n, scalar_file)) {
|
||||
std::cout << "Generating scalars.\n";
|
||||
scalar_t::rand_host_many(scalars, n);
|
||||
store_inputs<scalar_t>(scalars, n, scalar_file);
|
||||
}
|
||||
// Bases
|
||||
std::string base_file = "build/generated_data/bases_N" + std::to_string(n) + ".dat";
|
||||
if (!read_inputs<affine_t>(bases, n, base_file)) {
|
||||
std::cout << "Generating bases.\n";
|
||||
projective_t::rand_host_many(bases, n);
|
||||
store_inputs<affine_t>(bases, n, base_file);
|
||||
}
|
||||
}
|
||||
out_file.close();
|
||||
}
|
||||
|
||||
void get_inputs(affine_t* bases, scalar_t* scalars, const int n) // TODO add precompute factor
|
||||
{
|
||||
// Scalars
|
||||
std::string scalar_file = "build/generated_data/scalars_N" + std::to_string(n) + ".dat";
|
||||
if (!read_inputs<scalar_t>(scalars, n, scalar_file)) {
|
||||
std::cout << "Generating scalars.\n";
|
||||
scalar_t::rand_host_many(scalars, n);
|
||||
store_inputs<scalar_t>(scalars, n, scalar_file);
|
||||
}
|
||||
// Bases
|
||||
std::string base_file = "build/generated_data/bases_N" + std::to_string(n) + ".dat";
|
||||
if (!read_inputs<affine_t>(bases, n, base_file)) {
|
||||
std::cout << "Generating bases.\n";
|
||||
projective_t::rand_host_many(bases, n);
|
||||
store_inputs<affine_t>(bases, n, base_file);
|
||||
}
|
||||
}
|
||||
|
||||
// TEST_F(CurveApiTest, MSM)
|
||||
// {
|
||||
// const int logn = 12;
|
||||
// const int N = 1 << logn;
|
||||
// auto scalars = std::make_unique<scalar_t[]>(N);
|
||||
// auto bases = std::make_unique<affine_t[]>(N);
|
||||
|
||||
// bool conv_mont = false;
|
||||
// get_inputs(bases.get(), scalars.get(), N);
|
||||
|
||||
// if (conv_mont) {
|
||||
// for (int i = 0; i < N; i++)
|
||||
// bases[i] = affine_t::to_montgomery(bases[i]);
|
||||
// }
|
||||
// projective_t result_cpu{};
|
||||
// projective_t result_cpu_dbl_n_add{};
|
||||
// projective_t result_cpu_ref{};
|
||||
|
||||
// projective_t result{};
|
||||
|
||||
// auto run = [&](const char* dev_type, projective_t* result, const char* msg, bool measure, int iters) {
|
||||
// Device dev = {dev_type, 0};
|
||||
// icicle_set_device(dev);
|
||||
|
||||
// const int log_p = 2;
|
||||
// const int c = std::max(logn, 8) - 1;
|
||||
// const int pcf = 1 << log_p;
|
||||
|
||||
// const int n_threads = 8;
|
||||
// const int tasks_per_thread = 2;
|
||||
|
||||
// auto config = default_msm_config();
|
||||
// config.ext.set("c", c);
|
||||
// config.ext.set("n_threads", n_threads);
|
||||
// config.ext.set("tasks_per_thread", tasks_per_thread);
|
||||
// config.precompute_factor = pcf;
|
||||
// config.are_scalars_montgomery_form = false;
|
||||
// config.are_points_montgomery_form = conv_mont;
|
||||
|
||||
// auto pc_config = default_msm_pre_compute_config();
|
||||
// pc_config.ext.set("c", c);
|
||||
// pc_config.ext.set("is_mont", config.are_points_montgomery_form);
|
||||
|
||||
// auto precomp_bases = std::make_unique<affine_t[]>(N * pcf);
|
||||
// // TODO update cmake to include directory?
|
||||
// std::string precomp_fname =
|
||||
// "build/generated_data/precomp_N" + std::to_string(N) + "_pcf" + std::to_string(pcf) + ".dat";
|
||||
// if (!read_inputs<affine_t>(precomp_bases.get(), N * pcf, precomp_fname)) {
|
||||
// std::cout << "Precomputing bases." << '\n';
|
||||
// msm_precompute_bases(bases.get(), N, pcf, pc_config, precomp_bases.get());
|
||||
// store_inputs<affine_t>(precomp_bases.get(), N * pcf, precomp_fname);
|
||||
// }
|
||||
|
||||
// // int test_size = 10000;
|
||||
// // std::cout << "NUm additions:\t" << test_size << '\n';
|
||||
// // scalar_t* a = new scalar_t[test_size];
|
||||
// // scalar_t* b = new scalar_t[test_size];
|
||||
// // scalar_t* apb = new scalar_t[test_size];
|
||||
// // {
|
||||
// // scalar_t* bases_p = scalars.get();
|
||||
// // for (int i = 0; i < test_size; i++)
|
||||
// // {
|
||||
// // a[i] = bases_p[i];
|
||||
// // b[i] = bases_p[i + test_size];
|
||||
// // apb[i] = scalar_t::zero();
|
||||
// // }
|
||||
// // }
|
||||
|
||||
// START_TIMER(MSM_sync)
|
||||
// for (int i = 0; i < iters; ++i) {
|
||||
// msm(scalars.get(), precomp_bases.get(), N, config, result);
|
||||
// }
|
||||
// // for (int i = 0; i < test_size; i++)
|
||||
// // {
|
||||
// // apb[i] = a[i] * b[i];
|
||||
// // }
|
||||
// END_TIMER(MSM_sync, msg, measure);
|
||||
// };
|
||||
|
||||
// // run("CPU", &result_cpu_dbl_n_add, "CPU msm", false /*=measure*/, 1 /*=iters*/); // warmup
|
||||
// int iters = 1;
|
||||
// run("CPU", &result_cpu, "CPU msm", VERBOSE /*=measure*/, iters /*=iters*/);
|
||||
// run("CPU_REF", &result_cpu_ref, "CPU_REF msm", VERBOSE /*=measure*/, iters /*=iters*/);
|
||||
|
||||
// std::cout << projective_t::to_affine(result_cpu) << std::endl;
|
||||
// std::cout << projective_t::to_affine(result_cpu_ref) << std::endl;
|
||||
// ASSERT_EQ(result_cpu, result_cpu_ref);
|
||||
// }
|
||||
|
||||
template <typename A, typename P>
|
||||
void MSM_test()
|
||||
{
|
||||
const int logn = 8;
|
||||
const int logn = 17;
|
||||
const int batch = 1; // TODO test batch
|
||||
const int N = 1 << logn;
|
||||
const int precompute_factor = 4;
|
||||
@@ -210,9 +122,11 @@ void get_inputs(affine_t* bases, scalar_t* scalars, const int n) // TODO add pre
|
||||
const int total_nof_elemets = batch * N;
|
||||
auto scalars = std::make_unique<scalar_t[]>(total_nof_elemets);
|
||||
auto bases = std::make_unique<A[]>(N);
|
||||
std::cout << "Starting MSM\n";
|
||||
|
||||
scalar_t::rand_host_many(scalars.get(), total_nof_elemets);
|
||||
P::rand_host_many(bases.get(), N);
|
||||
// scalar_t::rand_host_many(scalars.get(), total_nof_elemets);
|
||||
// P::rand_host_many(bases.get(), N);
|
||||
get_inputs(bases.get(), scalars.get(), N);
|
||||
|
||||
auto result_main = std::make_unique<P[]>(batch);
|
||||
auto result_ref = std::make_unique<P[]>(batch);
|
||||
@@ -361,6 +275,7 @@ TYPED_TEST(CurveSanity, CurveSanityTest)
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
std::cout << "Start\n\n";
|
||||
::testing::InitGoogleTest(&argc, argv);
|
||||
return RUN_ALL_TESTS();
|
||||
}
|
||||
Reference in New Issue
Block a user