From 3f9f228a231d696bb77361f5d4f0bdf384dfdc45 Mon Sep 17 00:00:00 2001 From: Antoniu Pop Date: Fri, 24 Mar 2023 14:06:13 +0000 Subject: [PATCH] feat(compiler): add runtime support for batched operations in SDFG/GPU. --- .../Runtime/stream_emulator_api.h | 3 +- .../compiler/lib/Runtime/GPUDFG.cpp | 351 ++++++++++++------ .../compiler/lib/Runtime/StreamEmulator.cpp | 18 + 3 files changed, 251 insertions(+), 121 deletions(-) diff --git a/compilers/concrete-compiler/compiler/include/concretelang/Runtime/stream_emulator_api.h b/compilers/concrete-compiler/compiler/include/concretelang/Runtime/stream_emulator_api.h index 367141bc8..ce9693e85 100644 --- a/compilers/concrete-compiler/compiler/include/concretelang/Runtime/stream_emulator_api.h +++ b/compilers/concrete-compiler/compiler/include/concretelang/Runtime/stream_emulator_api.h @@ -14,7 +14,8 @@ typedef enum stream_type { TS_STREAM_TYPE_X86_TO_TOPO_LSAP, TS_STREAM_TYPE_TOPO_TO_TOPO_LSAP, - TS_STREAM_TYPE_TOPO_TO_X86_LSAP + TS_STREAM_TYPE_TOPO_TO_X86_LSAP, + TS_STREAM_TYPE_X86_TO_X86_LSAP } stream_type; extern "C" { diff --git a/compilers/concrete-compiler/compiler/lib/Runtime/GPUDFG.cpp b/compilers/concrete-compiler/compiler/lib/Runtime/GPUDFG.cpp index 7ec58a46f..ee65a3fcc 100644 --- a/compilers/concrete-compiler/compiler/lib/Runtime/GPUDFG.cpp +++ b/compilers/concrete-compiler/compiler/lib/Runtime/GPUDFG.cpp @@ -35,6 +35,30 @@ namespace { static std::atomic next_device = {0}; static size_t num_devices = 0; +static inline size_t memref_get_data_size(MemRef2 &m) { + return m.sizes[0] * m.sizes[1] * sizeof(uint64_t); +} + +static inline void memref_copy_contiguous(MemRef2 &out, MemRef2 &in) { + assert(in.sizes[0] == out.sizes[0] && in.sizes[1] == out.sizes[1] && + "memref_copy_contiguous sizes differ"); + assert(in.strides[0] == out.strides[0] && in.strides[1] == out.strides[1] && + "memref_copy_contiguous strides differ"); + assert(in.strides[0] == in.sizes[1] && in.strides[1] == 1 && + "memref_copy_contiguous strides not compatible with contiguous " + "storage."); + memcpy(out.aligned + out.offset, in.aligned + in.offset, + memref_get_data_size(in)); +} + +static inline MemRef2 memref_copy_alloc(MemRef2 &m) { + uint64_t *data = (uint64_t *)malloc(memref_get_data_size(m)); + MemRef2 ret = { + data, data, 0, {m.sizes[0], m.sizes[1]}, {m.strides[0], m.strides[1]}}; + memref_copy_contiguous(ret, m); + return ret; +} + struct Void {}; union Param { Void _; @@ -58,7 +82,6 @@ struct PBS_buffer { max_pbs_buffer_samples, cuda_get_max_shared_memory(gpu_index), true); } ~PBS_buffer() { - assert(pbs_buffer != nullptr); cleanup_cuda_bootstrap_amortized(gpu_stream, gpu_index, &pbs_buffer); } int8_t *get_pbs_buffer(void *stream, uint32_t gpu_idx, @@ -69,7 +92,6 @@ struct PBS_buffer { assert(input_lwe_ciphertext_count <= max_pbs_buffer_samples); assert(stream == gpu_stream); assert(gpu_idx == gpu_index); - assert(pbs_buffer != nullptr); return pbs_buffer; } @@ -126,21 +148,17 @@ private: }; struct Dependence { int32_t location; - int32_t rank; MemRef2 host_data; void *device_data; bool onHostReady; bool hostAllocated; bool used; - bool read; - Dependence(int32_t l, int32_t r, MemRef2 &hd, void *dd, bool ohr, - bool alloc = false) - : location(l), rank(r), host_data(hd), device_data(dd), onHostReady(ohr), - hostAllocated(alloc), used(false), read(false) {} - Dependence(int32_t l, int32_t r, uint64_t val, void *dd, bool ohr, - bool alloc = false) - : location(l), rank(r), device_data(dd), onHostReady(ohr), - hostAllocated(alloc), used(false), read(false) { 
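+  // With batching, host_data is a 2D memref: sizes[0] ciphertexts of
+  // sizes[1] words each, so the explicit rank field is no longer needed.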
+ Dependence(int32_t l, MemRef2 hd, void *dd, bool ohr, bool alloc = false) + : location(l), host_data(hd), device_data(dd), onHostReady(ohr), + hostAllocated(alloc), used(false) {} + Dependence(int32_t l, uint64_t val, void *dd, bool ohr, bool alloc = false) + : location(l), device_data(dd), onHostReady(ohr), hostAllocated(alloc), + used(false) { *host_data.aligned = val; } inline void free_data(GPU_DFG *dfg) { @@ -166,7 +184,6 @@ struct Process { Param output_lwe_dim; Param poly_size; Param glwe_dim; - Param precision; Param output_size; Context ctx; void (*fun)(Process *); @@ -174,6 +191,7 @@ struct Process { }; static inline void schedule_kernel(Process *p) { p->fun(p); } + struct Stream { stream_type type; Dependence *dep; @@ -185,15 +203,14 @@ struct Stream { ~Stream() { if (dep != nullptr) dep->free_data(dfg); - delete producer; + if (producer != nullptr) + delete producer; } void put(Dependence *d) { if (type == TS_STREAM_TYPE_X86_TO_TOPO_LSAP) { assert(d->onHostReady && "Host-to-device stream should have data initially on host."); - size_t data_size = sizeof(uint64_t); - for (int i = 0; i < d->rank; ++i) - data_size *= d->host_data.sizes[i]; + size_t data_size = memref_get_data_size(d->host_data); d->device_data = cuda_malloc_async( data_size, (cudaStream_t *)dfg->gpu_stream, dfg->gpu_idx); cuda_memcpy_async_to_gpu( @@ -239,21 +256,20 @@ struct Stream { dep->used = true; // If this was already copied to host, copy out if (dep->onHostReady) { - memref_copy_one_rank(dep->host_data.allocated, dep->host_data.aligned, - dep->host_data.offset, dep->host_data.sizes[0], - dep->host_data.strides[0], out.allocated, - out.aligned, out.offset, out.sizes[0], - out.strides[0]); + memref_copy_contiguous(out, dep->host_data); } else { + size_t data_size = memref_get_data_size(dep->host_data); cuda_memcpy_async_to_cpu(out.aligned + out.offset, dep->device_data, - out.sizes[0] * sizeof(uint64_t), - (cudaStream_t *)dfg->gpu_stream, dep->location); + data_size, (cudaStream_t *)dfg->gpu_stream, + dep->location); cudaStreamSynchronize(*(cudaStream_t *)dfg->gpu_stream); // After this synchronization point, all of the host-side // allocated memory can be freed as we know all asynchronous // operations have finished. 
dfg->free_stream_order_dependent_data(); + dep->host_data = memref_copy_alloc(out); dep->onHostReady = true; + dep->hostAllocated = true; } return dep; } @@ -264,9 +280,7 @@ struct Stream { if (location == host_location) { if (dep->onHostReady) return dep; - size_t data_size = sizeof(uint64_t); - for (int i = 0; i < dep->rank; ++i) - data_size *= dep->host_data.sizes[i]; + size_t data_size = memref_get_data_size(dep->host_data); dep->host_data.allocated = dep->host_data.aligned = (uint64_t *)malloc(data_size); dep->hostAllocated = true; @@ -332,37 +346,75 @@ make_process_2_1(void *dfg, void *sin1, void *sin2, void *sout, return p; } +[[maybe_unused]] static MemRef2 sdfg_gpu_debug_dependence(Dependence *d, + cudaStream_t *s) { + if (d->onHostReady) + return d->host_data; + size_t data_size = memref_get_data_size(d->host_data); + uint64_t *data = (uint64_t *)malloc(data_size); + MemRef2 ret = {data, + data, + 0, + {d->host_data.sizes[0], d->host_data.sizes[1]}, + {d->host_data.strides[0], d->host_data.strides[1]}}; + cuda_memcpy_async_to_cpu(data, d->device_data, data_size, s, d->location); + cudaStreamSynchronize(*s); + return ret; +} + +[[maybe_unused]] static bool +sdfg_gpu_debug_compare_memref(MemRef2 &a, MemRef2 &b, char const *msg) { + if (a.sizes[0] != b.sizes[0] || a.sizes[1] != b.sizes[1] || + a.strides[0] != b.strides[0] || a.strides[1] != b.strides[1]) + return false; + size_t data_size = memref_get_data_size(a); + for (int i = 0; i < data_size / sizeof(uint64_t); ++i) + if ((a.aligned + a.offset)[i] != (b.aligned + b.offset)[i]) { + std::cout << msg << " - memrefs differ at position " << i << " " + << (a.aligned + a.offset)[i] << " " << (b.aligned + b.offset)[i] + << "\n"; + return false; + } + return true; +} + // Stream emulator processes void memref_keyswitch_lwe_u64_process(Process *p) { Dependence *idep = p->input_streams[0]->get(p->dfg->gpu_idx); + uint64_t num_samples = idep->host_data.sizes[0]; + MemRef2 out = { + 0, 0, 0, {num_samples, p->output_size.val}, {p->output_size.val, 1}}; void *ct0_gpu = idep->device_data; void *ksk_gpu = p->ctx.val->get_ksk_gpu( p->level.val, p->input_lwe_dim.val, p->output_lwe_dim.val, p->dfg->gpu_idx, (cudaStream_t *)p->dfg->gpu_stream); - MemRef2 out = {0, 0, 0, {p->output_size.val}, {1}}; - void *out_gpu = - cuda_malloc_async(out.sizes[0] * sizeof(uint64_t), - (cudaStream_t *)p->dfg->gpu_stream, p->dfg->gpu_idx); + size_t data_size = memref_get_data_size(out); + void *out_gpu = cuda_malloc_async( + data_size, (cudaStream_t *)p->dfg->gpu_stream, p->dfg->gpu_idx); // Schedule the keyswitch kernel on the GPU cuda_keyswitch_lwe_ciphertext_vector_64( (cudaStream_t *)p->dfg->gpu_stream, p->dfg->gpu_idx, out_gpu, ct0_gpu, ksk_gpu, p->input_lwe_dim.val, p->output_lwe_dim.val, p->base_log.val, - p->level.val, 1 /*batch size*/); + p->level.val, num_samples); Dependence *dep = - new Dependence((int32_t)p->dfg->gpu_idx, 1, out, out_gpu, false); + new Dependence((int32_t)p->dfg->gpu_idx, out, out_gpu, false); p->output_streams[0]->put(dep); } -// Construct the glwe accumulator (on CPU) then put it on a stream as -// input to the bootstrap -void memref_bootstrap_glwe_accumulator_process(Process *p) { +void memref_bootstrap_lwe_u64_process(Process *p) { + assert(p->output_size.val == p->glwe_dim.val * p->poly_size.val + 1); + void *fbsk_gpu = p->ctx.val->get_bsk_gpu( + p->input_lwe_dim.val, p->poly_size.val, p->level.val, p->glwe_dim.val, + p->dfg->gpu_idx, (cudaStream_t *)p->dfg->gpu_stream); + Dependence *idep0 = 
p->input_streams[0]->get(p->dfg->gpu_idx); + void *ct0_gpu = idep0->device_data; + uint64_t glwe_ct_len = p->poly_size.val * (p->glwe_dim.val + 1); uint64_t glwe_ct_size = glwe_ct_len * sizeof(uint64_t); uint64_t *glwe_ct = (uint64_t *)malloc(glwe_ct_size); - Dependence *idep = p->input_streams[0]->get(host_location); - MemRef2 &mtlu = idep->host_data; + Dependence *idep1 = p->input_streams[1]->get(host_location); + MemRef2 &mtlu = idep1->host_data; auto tlu = mtlu.aligned + mtlu.offset; - // Glwe trivial encryption for (size_t i = 0; i < p->poly_size.val * p->glwe_dim.val; i++) { glwe_ct[i] = 0; @@ -370,27 +422,18 @@ void memref_bootstrap_glwe_accumulator_process(Process *p) { for (size_t i = 0; i < p->poly_size.val; i++) { glwe_ct[p->poly_size.val * p->glwe_dim.val + i] = tlu[i]; } - MemRef2 m = {glwe_ct, glwe_ct, 0, {glwe_ct_len}, {1}}; - Dependence *dep = new Dependence(host_location, 1, m, nullptr, true, true); - p->output_streams[0]->put(dep); -} + void *glwe_ct_gpu = cuda_malloc_async( + glwe_ct_size, (cudaStream_t *)p->dfg->gpu_stream, p->dfg->gpu_idx); + cuda_memcpy_async_to_gpu(glwe_ct_gpu, glwe_ct, glwe_ct_size, + (cudaStream_t *)p->dfg->gpu_stream, p->dfg->gpu_idx); -void memref_bootstrap_lwe_u64_process(Process *p) { - assert(p->output_size.val == p->glwe_dim.val * p->poly_size.val + 1); - uint32_t num_samples = 1; // TODO batching - void *fbsk_gpu = p->ctx.val->get_bsk_gpu( - p->input_lwe_dim.val, p->poly_size.val, p->level.val, p->glwe_dim.val, - p->dfg->gpu_idx, (cudaStream_t *)p->dfg->gpu_stream); - Dependence *idep0 = p->input_streams[0]->get(p->dfg->gpu_idx); - Dependence *idep1 = p->input_streams[1]->get(p->dfg->gpu_idx); - void *ct0_gpu = idep0->device_data; - void *glwe_ct_gpu = idep1->device_data; - - MemRef2 out = {0, 0, 0, {p->output_size.val}, {1}}; - uint64_t out_batch_size = out.sizes[0]; - void *out_gpu = - cuda_malloc_async(out_batch_size * sizeof(uint64_t), - (cudaStream_t *)p->dfg->gpu_stream, p->dfg->gpu_idx); + uint64_t num_samples = idep0->host_data.sizes[0]; + MemRef2 out = { + 0, 0, 0, {num_samples, p->output_size.val}, {p->output_size.val, 1}}; + size_t data_size = memref_get_data_size(out); + void *out_gpu = cuda_malloc_async( + data_size, (cudaStream_t *)p->dfg->gpu_stream, p->dfg->gpu_idx); + cudaMemsetAsync(out_gpu, 0, data_size, *(cudaStream_t *)p->dfg->gpu_stream); // Move test vector indexes to the GPU, the test vector indexes is set of 0 uint32_t num_test_vectors = 1, lwe_idx = 0, test_vector_idxes_size = num_samples * sizeof(uint64_t); @@ -411,11 +454,10 @@ void memref_bootstrap_lwe_u64_process(Process *p) { p->input_lwe_dim.val, p->glwe_dim.val, p->poly_size.val, p->base_log.val, p->level.val, num_samples, num_test_vectors, lwe_idx, cuda_get_max_shared_memory(p->dfg->gpu_idx)); - p->dfg->drop_pbs_buffer(); cuda_drop_async(test_vector_idxes_gpu, (cudaStream_t *)p->dfg->gpu_stream, p->dfg->gpu_idx); Dependence *dep = - new Dependence((int32_t)p->dfg->gpu_idx, 1, out, out_gpu, false); + new Dependence((int32_t)p->dfg->gpu_idx, out, out_gpu, false); // As streams are not synchronized, we can only free this vector // after a later synchronization point where we are guaranteed that // this vector is no longer needed. 
@@ -427,15 +469,15 @@ void memref_add_lwe_ciphertexts_u64_process(Process *p) { Dependence *idep0 = p->input_streams[0]->get(p->dfg->gpu_idx); Dependence *idep1 = p->input_streams[1]->get(p->dfg->gpu_idx); MemRef2 ct0 = idep0->host_data; - void *out_gpu = - cuda_malloc_async(ct0.sizes[0] * sizeof(uint64_t), - (cudaStream_t *)p->dfg->gpu_stream, p->dfg->gpu_idx); - MemRef2 out = {0, 0, 0, {ct0.sizes[0]}, {1}}; + uint64_t num_samples = ct0.sizes[0]; + MemRef2 out = {0, 0, 0, {num_samples, ct0.sizes[1]}, {ct0.sizes[1], 1}}; + size_t data_size = memref_get_data_size(out); + void *out_gpu = cuda_malloc_async( + data_size, (cudaStream_t *)p->dfg->gpu_stream, p->dfg->gpu_idx); cuda_add_lwe_ciphertext_vector_64( (cudaStream_t *)p->dfg->gpu_stream, p->dfg->gpu_idx, out_gpu, - idep0->device_data, idep1->device_data, - /*p->input_lwe_dim.val*/ ct0.sizes[0] - 1, 1 /* num_samples */); - Dependence *dep = new Dependence(p->dfg->gpu_idx, 1, out, out_gpu, false); + idep0->device_data, idep1->device_data, ct0.sizes[1] - 1, num_samples); + Dependence *dep = new Dependence(p->dfg->gpu_idx, out, out_gpu, false); p->output_streams[0]->put(dep); } @@ -443,16 +485,15 @@ void memref_add_plaintext_lwe_ciphertext_u64_process(Process *p) { Dependence *idep0 = p->input_streams[0]->get(p->dfg->gpu_idx); Dependence *idep1 = p->input_streams[1]->get(p->dfg->gpu_idx); MemRef2 ct0 = idep0->host_data; - void *out_gpu = - cuda_malloc_async(ct0.sizes[0] * sizeof(uint64_t), - (cudaStream_t *)p->dfg->gpu_stream, p->dfg->gpu_idx); - MemRef2 out = {0, 0, 0, {ct0.sizes[0]}, {1}}; + uint64_t num_samples = ct0.sizes[0]; + MemRef2 out = {0, 0, 0, {num_samples, ct0.sizes[1]}, {ct0.sizes[1], 1}}; + size_t data_size = memref_get_data_size(out); + void *out_gpu = cuda_malloc_async( + data_size, (cudaStream_t *)p->dfg->gpu_stream, p->dfg->gpu_idx); cuda_add_lwe_ciphertext_vector_plaintext_vector_64( (cudaStream_t *)p->dfg->gpu_stream, p->dfg->gpu_idx, out_gpu, - idep0->device_data, idep1->device_data, - /*p->input_lwe_dim.val*/ ct0.sizes[0] - 1, 1 /* num_samples */); - - Dependence *dep = new Dependence(p->dfg->gpu_idx, 1, out, out_gpu, false); + idep0->device_data, idep1->device_data, ct0.sizes[1] - 1, num_samples); + Dependence *dep = new Dependence(p->dfg->gpu_idx, out, out_gpu, false); p->output_streams[0]->put(dep); } @@ -460,31 +501,30 @@ void memref_mul_cleartext_lwe_ciphertext_u64_process(Process *p) { Dependence *idep0 = p->input_streams[0]->get(p->dfg->gpu_idx); Dependence *idep1 = p->input_streams[1]->get(p->dfg->gpu_idx); MemRef2 ct0 = idep0->host_data; - void *out_gpu = - cuda_malloc_async(ct0.sizes[0] * sizeof(uint64_t), - (cudaStream_t *)p->dfg->gpu_stream, p->dfg->gpu_idx); - MemRef2 out = {0, 0, 0, {ct0.sizes[0]}, {1}}; + uint64_t num_samples = ct0.sizes[0]; + MemRef2 out = {0, 0, 0, {num_samples, ct0.sizes[1]}, {ct0.sizes[1], 1}}; + size_t data_size = memref_get_data_size(out); + void *out_gpu = cuda_malloc_async( + data_size, (cudaStream_t *)p->dfg->gpu_stream, p->dfg->gpu_idx); cuda_mult_lwe_ciphertext_vector_cleartext_vector_64( (cudaStream_t *)p->dfg->gpu_stream, p->dfg->gpu_idx, out_gpu, - idep0->device_data, idep1->device_data, - /*p->input_lwe_dim.val*/ ct0.sizes[0] - 1, 1 /* num_samples */); - - Dependence *dep = new Dependence(p->dfg->gpu_idx, 1, out, out_gpu, false); + idep0->device_data, idep1->device_data, ct0.sizes[1] - 1, num_samples); + Dependence *dep = new Dependence(p->dfg->gpu_idx, out, out_gpu, false); p->output_streams[0]->put(dep); } void memref_negate_lwe_ciphertext_u64_process(Process *p) { 
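+  // Negation now runs over a whole batch: the input memref holds sizes[0]
+  // ciphertexts of sizes[1] words each, mirrored by the device-side buffer.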
Dependence *idep = p->input_streams[0]->get(p->dfg->gpu_idx); MemRef2 ct0 = idep->host_data; - void *out_gpu = - cuda_malloc_async(ct0.sizes[0] * sizeof(uint64_t), - (cudaStream_t *)p->dfg->gpu_stream, p->dfg->gpu_idx); - MemRef2 out = {0, 0, 0, {ct0.sizes[0]}, {1}}; + uint64_t num_samples = ct0.sizes[0]; + MemRef2 out = {0, 0, 0, {num_samples, ct0.sizes[1]}, {ct0.sizes[1], 1}}; + size_t data_size = memref_get_data_size(out); + void *out_gpu = cuda_malloc_async( + data_size, (cudaStream_t *)p->dfg->gpu_stream, p->dfg->gpu_idx); cuda_negate_lwe_ciphertext_vector_64( (cudaStream_t *)p->dfg->gpu_stream, p->dfg->gpu_idx, out_gpu, - idep->device_data, - /*p->input_lwe_dim.val*/ ct0.sizes[0] - 1, 1 /* num_samples */); - Dependence *dep = new Dependence(p->dfg->gpu_idx, 1, out, out_gpu, false); + idep->device_data, ct0.sizes[1] - 1, num_samples); + Dependence *dep = new Dependence(p->dfg->gpu_idx, out, out_gpu, false); p->output_streams[0]->put(dep); } @@ -550,39 +590,74 @@ void stream_emulator_make_memref_keyswitch_lwe_u64_process( void stream_emulator_make_memref_bootstrap_lwe_u64_process( void *dfg, void *sin1, void *sin2, void *sout, uint32_t input_lwe_dim, uint32_t poly_size, uint32_t level, uint32_t base_log, uint32_t glwe_dim, - uint32_t precision, uint32_t output_size, void *context) { - - // We need to generate two processes: one for building the glwe - // accumulator and one for the bootstrap (plus a stream to connect). - void *accu_s = - stream_emulator_make_memref_stream("", TS_STREAM_TYPE_X86_TO_TOPO_LSAP); - Process *accu_p = make_process_1_1(dfg, sin2, accu_s, - memref_bootstrap_glwe_accumulator_process); - accu_p->input_lwe_dim.val = input_lwe_dim; - accu_p->poly_size.val = poly_size; - accu_p->level.val = level; - accu_p->base_log.val = base_log; - accu_p->glwe_dim.val = glwe_dim; - accu_p->precision.val = precision; - accu_p->output_size.val = output_size; - accu_p->ctx.val = (RuntimeContext *)context; - static int count_ = 0; - sprintf(accu_p->name, "glwe_accumulator_%d", count_++); - - Process *p = make_process_2_1(dfg, sin1, accu_s, sout, - memref_bootstrap_lwe_u64_process); + uint32_t output_size, void *context) { + // The TLU does not need to be sent to GPU + ((Stream *)sin2)->type = TS_STREAM_TYPE_X86_TO_X86_LSAP; + Process *p = + make_process_2_1(dfg, sin1, sin2, sout, memref_bootstrap_lwe_u64_process); p->input_lwe_dim.val = input_lwe_dim; p->poly_size.val = poly_size; p->level.val = level; p->base_log.val = base_log; p->glwe_dim.val = glwe_dim; - p->precision.val = precision; p->output_size.val = output_size; p->ctx.val = (RuntimeContext *)context; static int count = 0; sprintf(p->name, "bootstrap_%d", count++); } +void stream_emulator_make_memref_batched_add_lwe_ciphertexts_u64_process( + void *dfg, void *sin1, void *sin2, void *sout) { + stream_emulator_make_memref_add_lwe_ciphertexts_u64_process(dfg, sin1, sin2, + sout); +} + +void stream_emulator_make_memref_batched_add_plaintext_lwe_ciphertext_u64_process( + void *dfg, void *sin1, void *sin2, void *sout) { + stream_emulator_make_memref_add_plaintext_lwe_ciphertext_u64_process( + dfg, sin1, sin2, sout); +} +void stream_emulator_make_memref_batched_add_plaintext_cst_lwe_ciphertext_u64_process( + void *dfg, void *sin1, void *sin2, void *sout) { + stream_emulator_make_memref_add_plaintext_lwe_ciphertext_u64_process( + dfg, sin1, sin2, sout); +} + +void stream_emulator_make_memref_batched_mul_cleartext_lwe_ciphertext_u64_process( + void *dfg, void *sin1, void *sin2, void *sout) { + 
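+  // As with the other batched wrappers, forward to the unbatched process:
+  // the GPU kernels already take the sample count from the leading dimension.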
stream_emulator_make_memref_mul_cleartext_lwe_ciphertext_u64_process( + dfg, sin1, sin2, sout); +} +void stream_emulator_make_memref_batched_mul_cleartext_cst_lwe_ciphertext_u64_process( + void *dfg, void *sin1, void *sin2, void *sout) { + stream_emulator_make_memref_mul_cleartext_lwe_ciphertext_u64_process( + dfg, sin1, sin2, sout); +} + +void stream_emulator_make_memref_batched_negate_lwe_ciphertext_u64_process( + void *dfg, void *sin1, void *sout) { + stream_emulator_make_memref_negate_lwe_ciphertext_u64_process(dfg, sin1, + sout); +} + +void stream_emulator_make_memref_batched_keyswitch_lwe_u64_process( + void *dfg, void *sin1, void *sout, uint32_t level, uint32_t base_log, + uint32_t input_lwe_dim, uint32_t output_lwe_dim, uint32_t output_size, + void *context) { + stream_emulator_make_memref_keyswitch_lwe_u64_process( + dfg, sin1, sout, level, base_log, input_lwe_dim, output_lwe_dim, + output_size, context); +} + +void stream_emulator_make_memref_batched_bootstrap_lwe_u64_process( + void *dfg, void *sin1, void *sin2, void *sout, uint32_t input_lwe_dim, + uint32_t poly_size, uint32_t level, uint32_t base_log, uint32_t glwe_dim, + uint32_t output_size, void *context) { + stream_emulator_make_memref_bootstrap_lwe_u64_process( + dfg, sin1, sin2, sout, input_lwe_dim, poly_size, level, base_log, + glwe_dim, output_size, context); +} + void *stream_emulator_make_uint64_stream(const char *name, stream_type stype) { return (void *)new Stream(stype); } @@ -590,14 +665,14 @@ void stream_emulator_put_uint64(void *stream, uint64_t e) { Stream *s = (Stream *)stream; uint64_t *data = (uint64_t *)malloc(sizeof(uint64_t)); *data = e; - MemRef2 m = {data, data, 0, {1}, {1}}; - Dependence *dep = new Dependence(host_location, 1, m, nullptr, true, true); + MemRef2 m = {data, data, 0, {1, 1}, {1, 1}}; + Dependence *dep = new Dependence(host_location, m, nullptr, true, true); s->put(dep); } uint64_t stream_emulator_get_uint64(void *stream) { uint64_t res; auto s = (Stream *)stream; - MemRef2 m = {&res, &res, 0, {1}, {1}}; + MemRef2 m = {&res, &res, 0, {1, 1}, {1, 1}}; s->get_on_host(m); return res; } @@ -608,16 +683,52 @@ void *stream_emulator_make_memref_stream(const char *name, stream_type stype) { void stream_emulator_put_memref(void *stream, uint64_t *allocated, uint64_t *aligned, uint64_t offset, uint64_t size, uint64_t stride) { + assert(stride == 1 && "Strided memrefs not supported"); Stream *s = (Stream *)stream; - MemRef2 m = {allocated, aligned, offset, {size}, {stride}}; - Dependence *dep = new Dependence(host_location, 1, m, nullptr, true); + MemRef2 m = {allocated, aligned, offset, {1, size}, {size, stride}}; + Dependence *dep = + new Dependence(host_location, memref_copy_alloc(m), nullptr, true, true); s->put(dep); } void stream_emulator_get_memref(void *stream, uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset, uint64_t out_size, uint64_t out_stride) { - MemRef2 mref = { - out_allocated, out_aligned, out_offset, {out_size}, {out_stride}}; + assert(out_stride == 1 && "Strided memrefs not supported"); + MemRef2 mref = {out_allocated, + out_aligned, + out_offset, + {1, out_size}, + {out_size, out_stride}}; + auto s = (Stream *)stream; + s->get_on_host(mref); +} + +void *stream_emulator_make_memref_batch_stream(const char *name, + stream_type stype) { + return (void *)new Stream(stype); +} +void stream_emulator_put_memref_batch(void *stream, uint64_t *allocated, + uint64_t *aligned, uint64_t offset, + uint64_t size0, uint64_t size1, + uint64_t stride0, uint64_t stride1) { + 
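+  // A batch is a size0 x size1 memref (one row per ciphertext); only
+  // contiguous storage is accepted, as memref_copy_alloc below also asserts.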
assert(stride1 == 1 && "Strided memrefs not supported"); + Stream *s = (Stream *)stream; + MemRef2 m = {allocated, aligned, offset, {size0, size1}, {stride0, stride1}}; + Dependence *dep = + new Dependence(host_location, memref_copy_alloc(m), nullptr, true, true); + s->put(dep); +} +void stream_emulator_get_memref_batch(void *stream, uint64_t *out_allocated, + uint64_t *out_aligned, + uint64_t out_offset, uint64_t out_size0, + uint64_t out_size1, uint64_t out_stride0, + uint64_t out_stride1) { + assert(out_stride1 == 1 && "Strided memrefs not supported"); + MemRef2 mref = {out_allocated, + out_aligned, + out_offset, + {out_size0, out_size1}, + {out_stride0, out_stride1}}; auto s = (Stream *)stream; s->get_on_host(mref); } diff --git a/compilers/concrete-compiler/compiler/lib/Runtime/StreamEmulator.cpp b/compilers/concrete-compiler/compiler/lib/Runtime/StreamEmulator.cpp index 992e1c23f..283d8ff2b 100644 --- a/compilers/concrete-compiler/compiler/lib/Runtime/StreamEmulator.cpp +++ b/compilers/concrete-compiler/compiler/lib/Runtime/StreamEmulator.cpp @@ -366,6 +366,24 @@ void stream_emulator_get_memref(void *stream, uint64_t *out_allocated, free(mref.allocated); } +void *stream_emulator_make_memref_batch_stream(const char *name, + stream_type stype) { + assert(0 && "Batched operations not implemented in the StreamEmulator."); +} +void stream_emulator_put_memref_batch(void *stream, uint64_t *allocated, + uint64_t *aligned, uint64_t offset, + uint64_t size0, uint64_t size1, + uint64_t stride0, uint64_t stride1) { + assert(0 && "Batched operations not implemented in the StreamEmulator."); +} +void stream_emulator_get_memref_batch(void *stream, uint64_t *out_allocated, + uint64_t *out_aligned, + uint64_t out_offset, uint64_t out_size0, + uint64_t out_size1, uint64_t out_stride0, + uint64_t out_stride1) { + assert(0 && "Batched operations not implemented in the StreamEmulator."); +} + void *stream_emulator_init() { #ifdef CORNAMI_AVAILABLE // TODO: check/update against new info on Cornami API