From e2d5c1388eba605ef935ea3155f1b393ee9bc6f0 Mon Sep 17 00:00:00 2001 From: hadaringonyama Date: Tue, 23 Jul 2024 16:54:07 +0300 Subject: [PATCH] automatic nof chunks --- icicle/src/msm/msm.cu | 141 ++++++++++++++++++++++---- icicle/src/msm/tests/huge_msm_test.cu | 137 ++++++++++++------------- icicle/src/msm/tests/msm_test.cu | 32 +++--- 3 files changed, 205 insertions(+), 105 deletions(-) diff --git a/icicle/src/msm/msm.cu b/icicle/src/msm/msm.cu index 68d4bb69..f536b5aa 100644 --- a/icicle/src/msm/msm.cu +++ b/icicle/src/msm/msm.cu @@ -36,10 +36,7 @@ namespace msm { points_out[tid] = P::to_affine(point); } - unsigned get_optimal_c(int bitsize) { return (unsigned)max(ceil(std::log2(bitsize)) - 4.0, 1.0); } - - - unsigned get_optimal_nof_chunks() { return 0; } + unsigned get_optimal_c(int msm_size) { return (unsigned)min(max(ceil(std::log2(msm_size)) - 4.0, 1.0), 22.0); } template @@ -927,6 +924,88 @@ namespace msm { } // namespace + template + unsigned long estimate_msm_memory(const MSMConfig& config, int msm_size, unsigned c, unsigned bitsize, int nof_chunks){ + unsigned nof_bms = (bitsize + c - 1) / c; + unsigned nof_bms_after_precomputation = (nof_bms + config.precompute_factor - 1) / config.precompute_factor; + unsigned long scalars_mem = sizeof(S) * msm_size * config.batch_size; + unsigned long indices_mem = 7 * sizeof(unsigned) * msm_size * config.batch_size * nof_bms; // factor 7 is an estimate of the extra memory needed by sorting; it can be reduced by sorting separately or by changing the sort algorithm + unsigned long points_mem = sizeof(A) * msm_size * config.precompute_factor * (config.are_points_shared_in_batch? 1 : config.batch_size); + unsigned long buckets_mem = 3 * sizeof(P) * (1< 1){ + scalars_mem = scalars_mem * 2 / nof_chunks; + indices_mem = indices_mem * 2 / nof_chunks; + points_mem = config.batch_size > 1 && config.are_points_shared_in_batch? points_mem * 2 : points_mem * 2 / nof_chunks; + buckets_mem = config.batch_size > 1 ? buckets_mem / nof_chunks : buckets_mem; + } //FIXME: fix divisions + return scalars_mem + indices_mem + points_mem + buckets_mem; + } + + + unsigned long get_device_global_memory(int device) { + cudaDeviceProp deviceProp; + CHK_IF_RETURN(cudaGetDeviceProperties(&deviceProp, device)); + return deviceProp.totalGlobalMem; + } + + template + int get_min_nof_chunks(const MSMConfig& config, int msm_size, unsigned c, unsigned bitsize){ + printf("get min\n"); + unsigned nof_bms = (bitsize + c - 1) / c; + unsigned nof_bms_after_precomputation = (nof_bms + config.precompute_factor - 1) / config.precompute_factor; + unsigned long scalars_mem = sizeof(S) * msm_size * config.batch_size; + unsigned long indices_mem = 7 * sizeof(unsigned) * msm_size * config.batch_size * nof_bms; // factor 7 is an estimate of the extra memory needed by sorting; it can be reduced by sorting separately or by changing the sort algorithm + unsigned long points_mem = sizeof(A) * msm_size * config.precompute_factor * (config.are_points_shared_in_batch? 
1 : config.batch_size); unsigned long buckets_mem = 3 * sizeof(P) * (1<(gpu_memory); double lower_bound; unsigned min_nof_chunks; // printf("pre cond\n"); if (config.batch_size < 2){ //single msm if (buckets_mem > reduced_gpu_memory){ THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "msm: c is too large"); } lower_bound = static_cast(2*(scalars_mem+points_mem+indices_mem)) / static_cast(reduced_gpu_memory - static_cast(buckets_mem)); min_nof_chunks = static_cast(lower_bound) + 1; } else{ if (config.are_points_shared_in_batch){ //shared points batch if (2*points_mem > reduced_gpu_memory){ THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "msm: msm size (including precomputation) is too large for batching"); } lower_bound = static_cast(2*(scalars_mem+indices_mem)+buckets_mem) / static_cast(reduced_gpu_memory - static_cast(2*points_mem)); min_nof_chunks = static_cast(lower_bound) + 1; } else{ //different points batch lower_bound = static_cast(2*(scalars_mem+points_mem+indices_mem)+buckets_mem) / reduced_gpu_memory; min_nof_chunks = static_cast(lower_bound) + 1; } if (min_nof_chunks > config.batch_size){ THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "msm: either msm size is too large for batching or c is too large"); } } return min_nof_chunks; } + + template + int get_optimal_nof_chunks(const MSMConfig& config, int msm_size, unsigned c, unsigned bitsize){ + // printf("get opt\n"); + /* if it doesn't fit in memory: nof = such that it fits; else: if points/scalars are on host: nof = size-dependent; else: nof = 1 */ + int min_nof_chunks = get_min_nof_chunks(config, msm_size, c, bitsize); + if (min_nof_chunks > 1) return min_nof_chunks; + if (!config.are_points_on_device || !config.are_scalars_on_device) return 4; + return 1; + } + + template cudaError_t msm(const S* scalars, const A* points, int msm_size, const MSMConfig& config, P* results) { @@ -934,13 +1013,31 @@ cudaStream_t& stream = config.ctx.stream; unsigned c = (config.c == 0) ? get_optimal_c(msm_size) : config.c; - int nof_chunks = config.nof_chunks == 0 ? get_optimal_nof_chunks() : config.nof_chunks; + int nof_chunks, min_nof_chunks; + // printf("msm\n"); + if (config.nof_chunks){ + nof_chunks = config.nof_chunks; + // unsigned long memory_estimation = estimate_msm_memory(config, msm_size, c, bitsize, nof_chunks); + // unsigned long gpu_memory = get_device_global_memory(config.ctx.device_id); + // if (memory_estimation > static_cast(gpu_memory)*0.7) { + min_nof_chunks = get_min_nof_chunks(config, msm_size, c, bitsize); + if (min_nof_chunks>nof_chunks){ + // printf("msm memory estimation: %lu, device global memory: %lu\n",memory_estimation, gpu_memory); + THROW_ICICLE_ERR( + IcicleError_t::InvalidArgument, "msm: with the given parameters, msm will require too much memory; try using the default nof_chunks"); + } + } + else{ + nof_chunks = get_optimal_nof_chunks(config, msm_size, c, bitsize); + } + printf("nof chunks %d\n", nof_chunks); + // int nof_chunks = config.nof_chunks == 0 ? 
get_optimal_nof_chunks() : config.nof_chunks; return CHK_STICKY(multi_chunked_msm(scalars, points, msm_size, nof_chunks, config, results, bitsize, c, stream)); } template - cudaError_t multi_chunked_msm(const S* scalars, const A* points, int msm_size, int nof_chunks, const MSMConfig& config, P* results, int bitsize, int c, cudaStream_t stream){ + cudaError_t multi_chunked_msm(const S* scalars, const A* points, int msm_size, int nof_chunks, const MSMConfig& config, P* results, unsigned bitsize, unsigned c, cudaStream_t stream){ // printf("multi chunked\n"); bool internal_are_scalars_on_device = config.are_scalars_on_device; bool internal_are_points_on_device = config.are_points_on_device; @@ -959,10 +1056,10 @@ namespace msm { A* points_d; S* scalars_d; P* buckets_d; - if (!config.are_scalars_on_device){ + if ((!config.are_scalars_on_device) && nof_chunks > 1){ CHK_IF_RETURN(cudaMallocAsync(&scalars_d, sizeof(S) * scalars_chunk_size*2, stream)); } - if ((!config.are_points_on_device) || process_batch_same_points){ + if (((!config.are_points_on_device) || process_batch_same_points) && nof_chunks > 1){ CHK_IF_RETURN(cudaMallocAsync(&points_d, sizeof(A) * points_chunk_size*2, stream)); } int sub_scalars_size = scalars_chunk_size; @@ -978,7 +1075,7 @@ namespace msm { internal_are_scalars_on_device = true; internal_are_points_on_device = true; } - printf("sub_batch_size %d sub_scalars_size %d\n", sub_batch_size, sub_scalars_size); + // printf("sub_batch_size %d sub_scalars_size %d\n", sub_batch_size, sub_scalars_size); bool internal_are_points_montgomery_form = (batch_same_points && i)? false : config.are_points_montgomery_form; bool return_points_poiter = (i == 0) && batch_same_points && (config.are_points_montgomery_form || !config.are_points_on_device); bool init_points = (i == 0) || !process_batch_same_points; @@ -1004,8 +1101,8 @@ namespace msm { CHK_IF_RETURN(cudaEventDestroy(finish_transfer)); } CHK_IF_RETURN(cudaStreamDestroy(transfer_stream)); - if (!config.are_scalars_on_device) CHK_IF_RETURN(cudaFreeAsync(scalars_d, stream)); - if ((!config.are_points_on_device) || process_batch_same_points) CHK_IF_RETURN(cudaFreeAsync(points_d, stream)); + if ((!config.are_scalars_on_device) && nof_chunks > 1) CHK_IF_RETURN(cudaFreeAsync(scalars_d, stream)); + if (((!config.are_points_on_device) || process_batch_same_points) && nof_chunks > 1) CHK_IF_RETURN(cudaFreeAsync(points_d, stream)); return CHK_LAST(); } @@ -1071,35 +1168,35 @@ namespace msm { } template - cudaError_t chunked_precompute(const A* points, int msm_size, const MSMConfig& config, A* points_precomputed, int nof_chunks){ + cudaError_t chunked_precompute(const A* points, int msm_size, unsigned c, const MSMConfig& config, A* points_precomputed, int nof_chunks){ cudaStream_t stream = config.ctx.stream; A *points_d; A *points_precomputed_d, *points_precomputed_h; int points_size = config.are_points_shared_in_batch? 
msm_size : config.batch_size * msm_size; bool multi_batch_mode = config.batch_size > 1; bool multi_points = !config.are_points_shared_in_batch; + int chunk_size = (points_size + nof_chunks - 1) / nof_chunks; + if (multi_batch_mode) chunk_size = ((config.batch_size + nof_chunks - 1) / nof_chunks) * msm_size; if (multi_batch_mode && !multi_points) nof_chunks = 1; if (config.are_points_on_device){ points_precomputed_d = points_precomputed; } else{ - CHK_IF_RETURN(cudaMallocAsync(&points_d, sizeof(A) * points_size, stream)); - CHK_IF_RETURN(cudaMallocAsync(&points_precomputed_d, sizeof(A) * points_size*config.precompute_factor, stream)); + CHK_IF_RETURN(cudaMallocAsync(&points_d, sizeof(A) * chunk_size*2, stream)); + CHK_IF_RETURN(cudaMallocAsync(&points_precomputed_d, sizeof(A) * chunk_size*2*config.precompute_factor, stream)); points_precomputed_h = points_precomputed; } - int chunk_size = (points_size + nof_chunks - 1) / nof_chunks; - if (multi_batch_mode) chunk_size = ((config.batch_size + nof_chunks - 1) / nof_chunks) * msm_size; for (int i = 0; i < nof_chunks; i++){ int sub_msm_size = min(points_size - chunk_size * i, chunk_size); if (sub_msm_size <= 0) break; // printf("prec sub_msm_size %d\n", sub_msm_size); if (!config.are_points_on_device) { CHK_IF_RETURN(cudaMemcpyAsync(points_d + (i%2)*chunk_size, points + i*chunk_size, sizeof(A) * sub_msm_size, cudaMemcpyHostToDevice, stream)); //points are on host - CHK_IF_RETURN((precompute_msm_points_chunk(points_d + (i%2)*chunk_size, sub_msm_size, config.precompute_factor, config.c, true, stream, points_precomputed_d + (i%2)*chunk_size*config.precompute_factor))); + CHK_IF_RETURN((precompute_msm_points_chunk(points_d + (i%2)*chunk_size, sub_msm_size, config.precompute_factor, c, true, stream, points_precomputed_d + (i%2)*chunk_size*config.precompute_factor))); CHK_IF_RETURN(cudaMemcpyAsync(points_precomputed_h + i*chunk_size*config.precompute_factor, points_precomputed_d + (i%2)*chunk_size*config.precompute_factor, sizeof(A) * sub_msm_size*config.precompute_factor, cudaMemcpyDeviceToHost, stream)); } else{ - CHK_IF_RETURN((precompute_msm_points_chunk(points + i*chunk_size, sub_msm_size, config.precompute_factor, config.c, true, stream, points_precomputed_d + i*chunk_size*config.precompute_factor))); + CHK_IF_RETURN((precompute_msm_points_chunk(points + i*chunk_size, sub_msm_size, config.precompute_factor, c, true, stream, points_precomputed_d + i*chunk_size*config.precompute_factor))); //points are on device } } if (!config.are_points_on_device){ @@ -1109,11 +1206,13 @@ return CHK_LAST(); } - template + template cudaError_t precompute_msm_points(const A* points, int msm_size, const MSMConfig& config, A* output_points) { - int nof_chunks = config.nof_chunks == 0 ? get_optimal_nof_chunks() : config.nof_chunks; - return CHK_STICKY((chunked_precompute(points, msm_size, config, output_points, nof_chunks))); + unsigned c = (config.c == 0) ? get_optimal_c(msm_size) : config.c; + const int bitsize = (config.bitsize == 0) ? S::NBITS : config.bitsize; + int nof_chunks = config.nof_chunks == 0 ? 
get_optimal_nof_chunks(config, msm_size, c, bitsize) : config.nof_chunks; + return CHK_STICKY((chunked_precompute(points, msm_size, c, config, output_points, nof_chunks))); } diff --git a/icicle/src/msm/tests/huge_msm_test.cu b/icicle/src/msm/tests/huge_msm_test.cu index 19056377..be6a5273 100644 --- a/icicle/src/msm/tests/huge_msm_test.cu +++ b/icicle/src/msm/tests/huge_msm_test.cu @@ -17,6 +17,8 @@ // using namespace bn254; +// #define PERFORMANCE_ONLY + class Dummy_Scalar { public: @@ -140,7 +142,7 @@ int main(int argc, char** argv) // unsigned msm_size = 1<<21; int precomp_factor = (argc > 3) ? atoi(argv[3]) : 1; int user_c = (argc > 4) ? atoi(argv[4]) : 15; - int nof_chunks = (argc > 5) ? atoi(argv[5]) : 3; + int nof_chunks = (argc > 5) ? atoi(argv[5]) : 0; bool scalars_on_device = (argc > 6) ? atoi(argv[6]) : 0; bool points_on_device = (argc > 7) ? atoi(argv[7]) : 0; bool same_points = (argc > 8) ? atoi(argv[8]) : 0; @@ -154,63 +156,8 @@ int main(int argc, char** argv) test_scalar* scalars_h = new test_scalar[scalars_size]; test_affine* points_h = new test_affine[points_size]; test_affine* points_precomputed_h = new test_affine[points_size*precomp_factor]; - int chunk_size = batch_size > 1? scalars_size : (msm_size + nof_chunks - 1) / nof_chunks; - // int chunk_size = N; - // test_scalar::rand_host_many(scalars, N); - // test_projective::rand_host_many_affine(points, N); - for (int i = 0; i < scalars_size; i++) - { - // scalars[i] = i? scalars[i-1] + test_scalar::one() : test_scalar::zero(); - scalars_h[i] = i>chunk_size-1? scalars_h[i-chunk_size+1] : test_scalar::rand_host(); - if (i100? points_h[i-100] : test_projective::to_affine(test_projective::rand_host()); - // points[i] = test_projective::to_affine(test_projective::generator()); - // std::cout << i << ": "<< points[i] << "\n"; - } - // for (int i = 0; i < N*precomp_factor; i++) - // { - // points_precomputed[i] = test_affine::zero(); - // } - - - - std::cout << "finished generating" << std::endl; - - // projective_t *short_res = (projective_t*)malloc(sizeof(projective_t)); - // test_projective *large_res = (test_projective*)malloc(sizeof(test_projective)); - test_projective res[1]; - test_projective ref[nof_chunks]; - // test_projective batched_large_res[batch_size]; - // fake_point *large_res = (fake_point*)malloc(sizeof(fake_point)); - // fake_point batched_large_res[256]; - - // short_msm(scalars, points, N, short_res); - // for (unsigned i=0;i(scalars+msm_size*i, points+msm_size*i, msm_size, large_res+i, - // false); std::cout<<"final result large"<(scalars, points, batch_size, msm_size, - // batched_large_res, false); cudaStream_t stream; cudaStreamCreate(&stream); @@ -237,27 +184,78 @@ int main(int argc, char** argv) same_points, // are_points_shared_in_batch }; + + nof_chunks = nof_chunks? nof_chunks : msm::get_optimal_nof_chunks(config,msm_size, user_c? user_c : msm::get_optimal_c(msm_size), test_scalar::NBITS); + int chunk_size = (msm_size + nof_chunks - 1) / nof_chunks; + // int chunk_size = N; + + #ifndef PERFORMANCE_ONLY + + // test_scalar::rand_host_many(scalars, N); + // test_projective::rand_host_many_affine(points, N); + for (int i = 0; i < scalars_size; i++) + { + // scalars[i] = i? scalars[i-1] + test_scalar::one() : test_scalar::zero(); + scalars_h[i] = i>chunk_size-1? scalars_h[i-chunk_size+1] : test_scalar::rand_host(); + if (i100? 
points_h[i-100] : test_projective::to_affine(test_projective::rand_host()); + // points[i] = test_projective::to_affine(test_projective::generator()); + // std::cout << i << ": "<< points[i] << "\n"; + } + + // for (int i = 0; i < N*precomp_factor; i++) + // { + // points_precomputed[i] = test_affine::zero(); + // } + + std::cout << "finished generating" << std::endl; + + #endif + + // projective_t *short_res = (projective_t*)malloc(sizeof(projective_t)); + // test_projective *large_res = (test_projective*)malloc(sizeof(test_projective)); + test_projective res[1]; + test_projective ref[nof_chunks]; + // test_projective batched_large_res[batch_size]; + // fake_point *large_res = (fake_point*)malloc(sizeof(fake_point)); + // fake_point batched_large_res[256]; + + // short_msm(scalars, points, N, short_res); + // for (unsigned i=0;i(scalars+msm_size*i, points+msm_size*i, msm_size, large_res+i, + // false); std::cout<<"final result large"<(scalars, points, batch_size, msm_size, + // batched_large_res, false); + cudaEventCreate(&start); cudaEventCreate(&stop); if (precomp_factor > 1){ - // for (int i = 0; i < nof_chunks; i++) - // { - // bool is_last_iter = i == nof_chunks - 1; - // int sub_msm_size = is_last_iter? N % chunk_size : chunk_size; - // if (sub_msm_size == 0) sub_msm_size = chunk_size; - // // config.points_size = sub_msm_size; - // cudaMemcpyAsync(points_d + (i%2)*chunk_size, points_h + i*chunk_size, sizeof(test_affine) * sub_msm_size, cudaMemcpyHostToDevice); - // msm::precompute_msm_points(points_d + (i%2)*chunk_size, sub_msm_size, config, precomp_points_d + (i%2)*chunk_size*precomp_factor); - // cudaMemcpyAsync(points_precomputed_h + i*chunk_size*precomp_factor, precomp_points_d + (i%2)*chunk_size*precomp_factor, sizeof(test_affine) * sub_msm_size*precomp_factor, cudaMemcpyDeviceToHost); - // } - // msm::chunked_precompute(points_on_device? points_d : points_h, msm_size, config, points_on_device? precomp_points_d : points_precomputed_h, nof_chunks); - msm::precompute_msm_points(points_on_device? points_d : points_h, msm_size, config, points_on_device? precomp_points_d : points_precomputed_h); + msm::precompute_msm_points(points_on_device? points_d : points_h, msm_size, config, points_on_device? precomp_points_d : points_precomputed_h); } + // warm up msm::msm( scalars_on_device? scalars_d : scalars_h, precomp_factor > 1 ? (points_on_device? precomp_points_d : points_precomputed_h) : (points_on_device? points_d : points_h), msm_size, config, res_d); cudaDeviceSynchronize(); + // return 0; // cudaStream_t transfer_stream; // cudaStreamCreate(&transfer_stream); @@ -309,6 +307,9 @@ int main(int argc, char** argv) // auto elapsed1 = std::chrono::duration_cast(end1 - begin1); printf("msm time : %.3f ms.\n", msm_time); + #ifdef PERFORMANCE_ONLY + return 0; + #endif // reference // config.c = 16; // config.precompute_factor = 1; diff --git a/icicle/src/msm/tests/msm_test.cu b/icicle/src/msm/tests/msm_test.cu index baa8dbaf..1abfe891 100644 --- a/icicle/src/msm/tests/msm_test.cu +++ b/icicle/src/msm/tests/msm_test.cu @@ -1,6 +1,6 @@ #include "fields/id.h" // #define FIELD_ID 2 -#define CURVE_ID 1 +#define CURVE_ID 3 #include "curves/curve_config.cuh" // #include "fields/field_config.cuh" @@ -15,6 +15,8 @@ #include "curves/projective.cuh" #include "gpu-utils/device_context.cuh" +// #define PERFORMANCE_ONLY + // using namespace bn254; class Dummy_Scalar @@ -140,7 +142,7 @@ int main(int argc, char** argv) // unsigned msm_size = 1<<21; int precomp_factor = (argc > 3) ? 
atoi(argv[3]) : 1; int user_c = (argc > 4) ? atoi(argv[4]) : 15; - int nof_chunks = (argc > 5) ? atoi(argv[5]) : 3; + int nof_chunks = (argc > 5) ? atoi(argv[5]) : 0; bool scalars_on_device = (argc > 6) ? atoi(argv[6]) : 0; bool points_on_device = (argc > 7) ? atoi(argv[7]) : 0; bool same_points = (argc > 8) ? atoi(argv[8]) : 0; @@ -154,15 +156,17 @@ int main(int argc, char** argv) test_scalar* scalars_h = new test_scalar[scalars_size]; test_affine* points_h = new test_affine[points_size]; test_affine* points_precomputed_h = new test_affine[points_size*precomp_factor]; - int chunk_size = batch_size > 1? scalars_size : (msm_size + nof_chunks - 1) / nof_chunks; + // int chunk_size = batch_size > 1? scalars_size : (msm_size + nof_chunks - 1) / nof_chunks; // int chunk_size = N; + #ifndef PERFORMANCE_ONLY // test_scalar::rand_host_many(scalars, N); // test_projective::rand_host_many_affine(points, N); for (int i = 0; i < scalars_size; i++) { // scalars[i] = i? scalars[i-1] + test_scalar::one() : test_scalar::zero(); - scalars_h[i] = i>chunk_size-1? scalars_h[i-chunk_size+1] : test_scalar::rand_host(); + // scalars_h[i] = i>chunk_size-1? scalars_h[i-chunk_size+1] : test_scalar::rand_host(); + scalars_h[i] = test_scalar::rand_host(); if (i100? points_h[i-100] : test_projective::to_affine(test_projective::rand_host()); // points[i] = test_projective::to_affine(test_projective::generator()); // std::cout << i << ": "<< points[i] << "\n"; @@ -171,6 +175,7 @@ int main(int argc, char** argv) std::cout << "finished generating" << std::endl; + #endif // projective_t *short_res = (projective_t*)malloc(sizeof(projective_t)); // test_projective *large_res = (test_projective*)malloc(sizeof(test_projective)); @@ -234,24 +239,15 @@ int main(int argc, char** argv) cudaEventCreate(&stop); if (precomp_factor > 1){ - // for (int i = 0; i < nof_chunks; i++) - // { - // bool is_last_iter = i == nof_chunks - 1; - // int sub_msm_size = is_last_iter? N % chunk_size : chunk_size; - // if (sub_msm_size == 0) sub_msm_size = chunk_size; - // // config.points_size = sub_msm_size; - // cudaMemcpyAsync(points_d + (i%2)*chunk_size, points_h + i*chunk_size, sizeof(test_affine) * sub_msm_size, cudaMemcpyHostToDevice); - // msm::precompute_msm_points(points_d + (i%2)*chunk_size, sub_msm_size, config, precomp_points_d + (i%2)*chunk_size*precomp_factor); - // cudaMemcpyAsync(points_precomputed_h + i*chunk_size*precomp_factor, precomp_points_d + (i%2)*chunk_size*precomp_factor, sizeof(test_affine) * sub_msm_size*precomp_factor, cudaMemcpyDeviceToHost); - // } - // msm::chunked_precompute(points_on_device? points_d : points_h, msm_size, config, points_on_device? precomp_points_d : points_precomputed_h, nof_chunks); - msm::precompute_msm_points(points_on_device? points_d : points_h, msm_size, config, points_on_device? precomp_points_d : points_precomputed_h); + msm::precompute_msm_points(points_on_device? points_d : points_h, msm_size, config, points_on_device? precomp_points_d : points_precomputed_h); } // warm up msm::msm( scalars_on_device? scalars_d : scalars_h, precomp_factor > 1 ? (points_on_device? precomp_points_d : points_precomputed_h) : (points_on_device? 
points_d : points_h), msm_size, config, res_d); cudaDeviceSynchronize(); + // return 0; + // cudaStream_t transfer_stream; // cudaStreamCreate(&transfer_stream); @@ -307,6 +303,10 @@ int main(int argc, char** argv) // auto elapsed1 = std::chrono::duration_cast(end1 - begin1); printf("msm time : %.3f ms.\n", msm_time); + // #ifdef PERFORMANCE_ONLY + // return 0; + // #endif + // reference config.c = 16; config.precompute_factor = 1;
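
A minimal usage sketch of the new behaviour (illustrative only; it assumes the test_scalar/test_affine/test_projective aliases and the config, scalars_h, points_h and res_d buffers set up as in the test files above). Leaving config.nof_chunks at 0 makes msm() choose the chunk count via get_optimal_nof_chunks(), which uses get_min_nof_chunks() to make sure each chunk fits in the device's global memory; a nonzero value is still accepted but is now validated against that minimum.

    // automatic chunking: 0 = choose the number of chunks automatically;
    // c can also be left at 0 so get_optimal_c(msm_size) picks the window size
    config.c = 0;
    config.nof_chunks = 0;
    msm::msm(scalars_h, points_h, msm_size, config, res_d);

    // explicit chunking: if the requested value is smaller than
    // get_min_nof_chunks(), msm() throws an InvalidArgument error
    config.nof_chunks = 4;
    msm::msm(scalars_h, points_h, msm_size, config, res_d);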