mirror of https://github.com/zama-ai/tfhe-rs.git

feat(gpu): add memory management

committed by Andrei Stoian
parent 70fa68bf52
commit 846eed184e

Makefile (13)
@@ -994,6 +994,19 @@ test_high_level_api: install_rs_build_toolchain
		--features=boolean,shortint,integer,internal-keycache,zk-pok,strings -p tfhe \
		-- high_level_api::

test_high_level_api_gpu_one: install_rs_build_toolchain install_cargo_nextest
	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test \
		--features=integer,internal-keycache,gpu,zk-pok -p tfhe \
		-- --nocapture high_level_api::array::tests::booleans::test_gpu_only_bitand

#
test_high_level_api_gpu_mul: install_rs_build_toolchain install_cargo_nextest
	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test \
		--features=integer,internal-keycache,gpu-debug --profile release \
		-p tfhe \
		-- --nocapture integer::gpu::server_key::radix::tests_unsigned::test_mul:: \
		--test-threads=6

test_high_level_api_gpu: install_rs_build_toolchain install_cargo_nextest
	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \
		--test-threads=4 --features=integer,internal-keycache,gpu,zk-pok -p tfhe \
@@ -5,6 +5,8 @@
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <fstream>
#include <vector>

#define CUDA_STREAM_POOL

@@ -83,13 +85,27 @@ void cuda_synchronize_stream(cudaStream_t stream, uint32_t gpu_index);
uint32_t cuda_is_available();

void *cuda_malloc(uint64_t size, uint32_t gpu_index);
void *cuda_ext_malloc(uint64_t size, uint32_t gpu_index);

void *cuda_malloc_with_size_tracking_async(uint64_t size, cudaStream_t stream,
                                           uint32_t gpu_index,
                                           uint64_t &size_tracker,
                                           bool allocate_gpu_memory);
void *cuda_intern_malloc_with_size_tracking_async(uint64_t size,
                                                  cudaStream_t stream,
                                                  uint32_t gpu_index,
                                                  uint64_t &size_tracker,
                                                  bool allocate_gpu_memory,
                                                  const char *file, int line);

void *cuda_malloc_async(uint64_t size, cudaStream_t stream, uint32_t gpu_index);
#define cuda_malloc_with_size_tracking_async(                                 \
    size, stream, gpu_index, size_tracker, allocate_gpu_memory)               \
  cuda_intern_malloc_with_size_tracking_async(                                \
      size, stream, gpu_index, size_tracker, allocate_gpu_memory, __FILE__,   \
      __LINE__)

void *cuda_int_malloc_async(uint64_t size, cudaStream_t stream,
                            uint32_t gpu_index, const char *file, int line);
#define cuda_malloc_async(size, stream, gpu_index)                            \
  cuda_int_malloc_async(size, stream, gpu_index, __FILE__, __LINE__)
void *cuda_ext_malloc_async(uint64_t size, cudaStream_t stream,
                            uint32_t gpu_index);

bool cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
uint64_t cuda_device_total_memory(uint32_t gpu_index);
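The macros above shadow the old function names so that every existing call site transparently forwards its own __FILE__ and __LINE__ to the new *_int_* / *_intern_* variants. A minimal, self-contained sketch of that call-site-tracking pattern, with illustrative names that are not part of the diff:

#include <cstddef>
#include <cstdio>
#include <new>

// Real implementation: receives the caller's location as extra arguments.
void *tracked_alloc_impl(std::size_t size, const char *file, int line) {
  std::printf("alloc of %zu bytes requested at %s:%d\n", size, file, line);
  return ::operator new(size);
}

// The macro keeps the original call syntax but injects the call site, which is
// the same trick the cuda_malloc_async / cuda_drop macros in this header use.
#define tracked_alloc(size) tracked_alloc_impl((size), __FILE__, __LINE__)

int main() {
  void *p = tracked_alloc(128); // logged with this file and this line number
  ::operator delete(p);
  return 0;
}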
@@ -103,18 +119,28 @@ void cuda_memcpy_with_size_tracking_async_to_gpu(void *dest, const void *src,
void cuda_memcpy_async_to_gpu(void *dest, const void *src, uint64_t size,
                              cudaStream_t stream, uint32_t gpu_index);

void cuda_ext_memcpy_async_to_gpu(void *dest, const void *src, uint64_t size,
                                  cudaStream_t stream, uint32_t gpu_index);

void cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
    void *dest, void const *src, uint64_t size, cudaStream_t stream,
    uint32_t gpu_index, bool gpu_memory_allocated);

void cuda_memcpy_async_gpu_to_gpu(void *dest, void const *src, uint64_t size,
                                  cudaStream_t stream, uint32_t gpu_index);
void cuda_ext_memcpy_async_gpu_to_gpu(void *dest, void const *src,
                                      uint64_t size, cudaStream_t stream,
                                      uint32_t gpu_index);

void cuda_memcpy_gpu_to_gpu(void *dest, void const *src, uint64_t size,
                            uint32_t gpu_index);
void cuda_ext_memcpy_gpu_to_gpu(void *dest, void const *src, uint64_t size,
                                uint32_t gpu_index);

void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
                              cudaStream_t stream, uint32_t gpu_index);
void cuda_ext_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
                                  cudaStream_t stream, uint32_t gpu_index);

void cuda_memset_with_size_tracking_async(void *dest, uint64_t val,
                                          uint64_t size, cudaStream_t stream,

@@ -123,6 +149,8 @@ void cuda_memset_with_size_tracking_async(void *dest, uint64_t val,

void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
                       cudaStream_t stream, uint32_t gpu_index);
void cuda_ext_memset_async(void *dest, uint64_t val, uint64_t size,
                           cudaStream_t stream, uint32_t gpu_index);

int cuda_get_number_of_gpus();

@@ -130,13 +158,26 @@ int cuda_get_number_of_sms();

void cuda_synchronize_device(uint32_t gpu_index);

void cuda_drop(void *ptr, uint32_t gpu_index);
void cuda_int_drop(void *ptr, uint32_t gpu_index, const char *file, int line);
#define cuda_drop(ptr, gpu_index)                                             \
  cuda_int_drop(ptr, gpu_index, __FILE__, __LINE__)

void cuda_drop_with_size_tracking_async(void *ptr, cudaStream_t stream,
                                        uint32_t gpu_index,
                                        bool gpu_memory_allocated);
void cuda_ext_drop(void *ptr, uint32_t gpu_index);

void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index);
void cuda_int_drop_with_size_tracking_async(void *ptr, cudaStream_t stream,
                                            uint32_t gpu_index,
                                            bool gpu_memory_allocated,
                                            const char *file, int line);

#define cuda_drop_with_size_tracking_async(ptr, stream, gpu_index,            \
                                           gpu_memory_allocated)              \
  cuda_int_drop_with_size_tracking_async(                                     \
      ptr, stream, gpu_index, gpu_memory_allocated, __FILE__, __LINE__)

void cuda_int_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index,
                         const char *file, int line);
#define cuda_drop_async(ptr, stream, gpu_index)                               \
  cuda_int_drop_async(ptr, stream, gpu_index, __FILE__, __LINE__)
}

uint32_t cuda_get_max_shared_memory(uint32_t gpu_index);
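Alongside each macro the header also declares a plain cuda_ext_* function. A likely reason, consistent with the Rust binding changes later in this diff, is that a preprocessor macro has no linkable symbol, so foreign callers need an ordinary exported function. A self-contained sketch of that arrangement, with illustrative names rather than the diff's own:

#include <cstdio>
#include <new>

// Implementation that records the caller's location.
extern "C" void tracked_free_impl(void *p, const char *file, int line) {
  std::printf("free(%p) requested at %s:%d\n", p, file, line);
  ::operator delete(p);
}

// Macro form used by call sites inside the C++ code base.
#define tracked_free(p) tracked_free_impl((p), __FILE__, __LINE__)

// A macro cannot be bound over FFI, so a plain exported wrapper gives foreign
// callers (e.g. Rust through `extern "C"`) a real symbol; their calls are all
// attributed to this wrapper's file and line, which is the accepted trade-off.
extern "C" void ext_tracked_free(void *p) { tracked_free(p); }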
@@ -3,18 +3,33 @@
#include <atomic>
#include <cstdint>
#include <cuda_runtime.h>
#include <deque>
#include <unordered_map>
#include <mutex>

#ifdef USE_NVTOOLS
#include <cuda_profiler_api.h>
#endif

#ifdef CUDA_STREAM_POOL
#include <deque>
#include <mutex>
#include <vector>
#include <unordered_map>
#endif

#include <bits/this_thread_sleep.h>

#define USE_MEMORY_MANAGER
// #define DEBUG_MEMORY_MANAGER
#define MAX_CACHE_SIZE (1 << 30)

#ifdef USE_MEMORY_MANAGER
#include <list>
#include <sstream>
#include <string>
#include <thread>
#endif

uint32_t cuda_get_device() {
  int device;
  check_cuda_error(cudaGetDevice(&device));
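For reference, the cache cap defined above works out to one gibibyte of freed blocks retained per memory manager; a trivial compile-time check of that arithmetic (plain C++, nothing assumed beyond the macro's value):

// 1 << 30 bytes is 1024 * 1024 * 1024 bytes, i.e. 1 GiB of freed device memory
// kept around for reuse before blocks start being returned to the driver.
static_assert((1 << 30) == 1024 * 1024 * 1024, "MAX_CACHE_SIZE is 1 GiB");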
@@ -99,6 +114,330 @@ void cuda_set_device(uint32_t gpu_index) {
#endif
}

#ifdef USE_MEMORY_MANAGER

enum CudaMemBlockUsageType { CUDA_ALLOC = 0, MEMSET, MEMCPY_SRC, MEMCPY_DEST, FREE };

enum CudaAllocType { SYNC = 0, ASYNC };

#ifdef DEBUG_MEMORY_MANAGER
struct CudaMemBlockUsage {
  std::string location;
  uint64_t timestamp;
  CudaMemBlockUsageType type;
};
#endif

struct CudaMemBlock {
  int8_t *ptr;
  uint64_t size;
  cudaStream_t stream;
  uint32_t gpu_index;
  size_t thread_id;
  CudaAllocType alloc_type;
#ifdef DEBUG_MEMORY_MANAGER
  std::vector<CudaMemBlockUsage> usages;
#endif
};
class CudaMemoryManager {
  std::list<CudaMemBlock> cuda_allocs; // fresh allocs
  std::list<CudaMemBlock> cuda_freed;  // freed for good

  std::unordered_map<cudaStream_t,
                     std::unordered_map<uint64_t, std::deque<CudaMemBlock>>>
      cache; // freed and re-used
  uint64_t cache_size = 0, peak_cache_size = 0;

  std::mutex allocs_mutex;

#ifdef DEBUG_MEMORY_MANAGER
  std::string make_location(const char *file, int line) {
    std::stringstream sstr;
    sstr << file << ":" << line;
    return sstr.str();
  }
  uint64_t make_timestamp() {
    const std::chrono::time_point<std::chrono::system_clock> now =
        std::chrono::system_clock::now();

    auto us = std::chrono::duration_cast<std::chrono::microseconds>(
                  now.time_since_epoch())
                  .count() %
              1000000;
    return us;
  }

  void check_range_is_valid(CudaMemBlockUsageType usage_type, int8_t *dest,
                            uint64_t size, cudaStream_t stream,
                            const char *file, int line) {
    CudaMemBlockUsage usage = {make_location(file, line), make_timestamp(),
                               usage_type};

    const char *info = NULL;
    switch (usage_type) {
    case MEMSET:
      info = "memset";
      break;
    case MEMCPY_SRC:
      info = "memcpy source";
      break;
    case MEMCPY_DEST:
      info = "memcpy dest";
      break;
    default:
      info = "unknown";
    }

    auto device_id = cuda_get_device();

    bool found = false;
    for (auto it = cuda_allocs.begin(); it != cuda_allocs.end(); it++) {
      if (it->ptr == dest && it->gpu_index == device_id) {
        printf("%s with size tracking: found ptr %p\n", info, dest);
        if (size > it->size) {
          PANIC("%s OF %lu bytes TOO BIG TO %p OF SIZE %ld\n", info, size, dest,
                it->size);
        }
        it->usages.push_back(usage);
        found = true;
      } else {
        if (dest > it->ptr && dest < it->ptr + it->size &&
            it->gpu_index == device_id) {
          printf("%s with size tracking: indirect ptr %p in buffer %p\n", info,
                 dest, it->ptr);
          if (dest + size > it->ptr + it->size) {
            auto remain_bytes = it->ptr + it->size - dest;
            PANIC("%s OF %lu bytes TOO BIG TO %p WHICH HAS ROOM ONLY FOR %d\n",
                  info, size, dest, remain_bytes);
          }
          it->usages.push_back(usage);
          found = true;
        }
      }
    }
    if (!found) {
      PANIC("Cuda %s to %p of size %lu, unknown pointer", info, dest, size);
    }
  }
#endif
public:
  void alloc(void **ptr, uint64_t size, CudaAllocType alloc_type,
             uint32_t gpu_index, cudaStream_t stream, const char *file,
             int line) {
    std::lock_guard<std::mutex> guard(allocs_mutex);

    auto cache_of_stream = cache.find(stream);
    if (cache_of_stream != cache.end()) {
      auto cache_of_size = cache_of_stream->second.find(size);
      if (cache_of_size != cache_of_stream->second.end() &&
          !cache_of_size->second.empty()) {
        auto cached_alloc = cache_of_size->second.front();
        cache_of_size->second.pop_front();

        // move to active allocs
        cuda_allocs.push_back(cached_alloc);
        *ptr = cached_alloc.ptr;

        if (cache_size < size) {
          PANIC("INVALID CACHE USE!!");
        }

        cache_size -= size;

#ifdef DEBUG_MEMORY_MANAGER
        printf("Cuda Allocation serviced from cache: %p of size %lu on gpu %d "
               "in %s\n",
               ptr, size, gpu_index, "");
#endif
        return;
      }
    }

    cuda_set_device(gpu_index);
    if (alloc_type == SYNC) {
      check_cuda_error(cudaMalloc(ptr, size));
    } else if (alloc_type == ASYNC) {
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
#elif (CUDART_VERSION >= 11020)
      int support_async_alloc;
      check_cuda_error(cudaDeviceGetAttribute(
          &support_async_alloc, cudaDevAttrMemoryPoolsSupported, gpu_index));

      if (support_async_alloc) {
        check_cuda_error(cudaMallocAsync(ptr, size, stream));
      } else {
        check_cuda_error(cudaMalloc(ptr, size));
      }
#else
      check_cuda_error(cudaMalloc((void **)&ptr, size));
#endif
    } else {
      PANIC("Invalid allocation mode");
    }

    if (*ptr == nullptr) {
      if (size > 0) {
        PANIC("Allocation failed for %lu bytes, allocator returned %p", size,
              ptr);
      }
      return;
    }

    auto thread_id = std::hash<std::thread::id>{}(std::this_thread::get_id());
    CudaMemBlock block = {(int8_t *)*ptr, size,      stream,
                          gpu_index,      thread_id, alloc_type};
#ifdef DEBUG_MEMORY_MANAGER
    CudaMemBlockUsage usage = {make_location(file, line), make_timestamp(),
                               CUDA_ALLOC};
    block.usages.push_back(usage);

    printf("Cuda Allocated %p of size %lu on gpu %d in %s\n", ptr, size,
           gpu_index, usage.location.c_str());
#endif

    cuda_allocs.push_back(block);
  }
  void memset(int8_t *dest, uint64_t size, cudaStream_t stream,
              const char *file, int line) {
#ifdef DEBUG_MEMORY_MANAGER
    std::lock_guard<std::mutex> guard(allocs_mutex);

    check_range_is_valid(MEMSET, dest, size, stream, file, line);
#endif
  }

  void memcpy(int8_t *dest, int8_t *src, uint64_t size, cudaStream_t stream,
              const char *file, int line) {
#ifdef DEBUG_MEMORY_MANAGER
    std::lock_guard<std::mutex> guard(allocs_mutex);

    check_range_is_valid(MEMCPY_SRC, src, size, stream, file, line);
    check_range_is_valid(MEMCPY_DEST, src, size, stream, file, line);
#endif
  }
  void free(void *ptr, CudaAllocType alloc_type, uint32_t gpu_index,
            cudaStream_t stream, const char *file, int line) {
    if (ptr == nullptr)
      return;

    std::lock_guard<std::mutex> guard(allocs_mutex);

    bool found = false;
    bool must_free = false;

    for (auto it = cuda_allocs.begin(); it != cuda_allocs.end(); it++) {
      if (it->ptr == ptr && it->gpu_index == gpu_index) {
        found = true;

        if (cache_size + it->size < (MAX_CACHE_SIZE)) {
          cache[stream][it->size].push_back(*it);
          cache_size += it->size;
          if (peak_cache_size < cache_size) {
            peak_cache_size = cache_size;
          }
        } else {
          cuda_freed.push_back(*it);
          must_free = true;
        }
#ifdef DEBUG_MEMORY_MANAGER
        printf("cuda dropped buffer %p of size %lu on gpu %d\n", ptr, it->size,
               gpu_index);
#endif
        cuda_allocs.erase(it++);
      }
    }

    if (must_free) {
      cuda_set_device(gpu_index);
      if (alloc_type == SYNC) {
        check_cuda_error(cudaFree(ptr));
      } else if (alloc_type == ASYNC) {
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
#elif (CUDART_VERSION >= 11020)
        int support_async_alloc;
        check_cuda_error(cudaDeviceGetAttribute(
            &support_async_alloc, cudaDevAttrMemoryPoolsSupported, gpu_index));

        if (support_async_alloc) {
          check_cuda_error(cudaFreeAsync(ptr, stream));
        } else {
          check_cuda_error(cudaFree(ptr));
        }
#else
        check_cuda_error(cudaFree(ptr));
#endif
      }
    }

#ifdef DEBUG_MEMORY_MANAGER
    if (!found) {
      for (auto it = cuda_freed.begin(); it != cuda_freed.end(); it++) {
        if (it->ptr == ptr && it->gpu_index == gpu_index) {
          found = true;
          printf("Drop in %s: %d\n", file, line);
          printf("Alloc in %s\n", it->usages[0].location.c_str());
          PANIC("cuda drop already dropped buffer %p of size %lu on gpu %d\n",
                ptr, it->size, gpu_index);
        }
      }
    }

    if (!found) {
      PANIC("cuda drop unknown buffer %p\n", ptr);
    }
#endif
  }
  ~CudaMemoryManager() {
#ifdef DEBUG_MEMORY_MANAGER
    printf("%lu ALLOCATIONS AT PROGRAM EXIT\n", cuda_allocs.size());

    for (auto &cuda_alloc : cuda_allocs) {
      printf("%p of size %lu allocated at %s\n", cuda_alloc.ptr,
             cuda_alloc.size, cuda_alloc.usages[0].location.c_str());
    }

    printf("\n\n\n %llu PEAK CACHE SIZE\n", peak_cache_size);

    for (auto &cache_for_size : cache) {
      for (auto &cuda_alloc : cache_for_size.second) {
        printf("%p of size %lu cached at %s\n", cuda_alloc.ptr, cuda_alloc.size,
               cuda_alloc.usages[0].location.c_str());
      }
    }
#endif
  }
};
class CudaMultiGPUMemoryManager {
  std::unordered_map<uint32_t, CudaMemoryManager> gMemManagers;
  std::mutex gMemManagersMutex; // for creation of the mem managers
  std::atomic<uint32_t> gMemManagerExists = 0;

public:
  CudaMemoryManager &get(uint32_t gpu_index) {
    if (gMemManagerExists.load() & (1 << gpu_index)) {
      return gMemManagers[gpu_index];
    } else {
      std::lock_guard<std::mutex> guard(gMemManagersMutex);
      uint32_t exist_flags = gMemManagerExists.load();
      if (!(exist_flags & (1 << gpu_index))) {
        gMemManagers[gpu_index]; // create it
        gMemManagerExists.store(exist_flags | (1 << gpu_index));
      }
      return gMemManagers[gpu_index];
    }
  }
};

CudaMultiGPUMemoryManager gCudaMemoryManager;
#endif

cudaEvent_t cuda_create_event(uint32_t gpu_index) {
  cuda_set_device(gpu_index);
  cudaEvent_t event;
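CudaMultiGPUMemoryManager creates one manager per GPU lazily: a lock-free atomic bitmask is tested first, and only a miss takes the mutex and re-checks before inserting. A self-contained sketch of that double-checked pattern under stated assumptions (generic names, and a fixed 32-slot array standing in for the diff's map, to match the 32-bit mask):

#include <array>
#include <atomic>
#include <cstdint>
#include <memory>
#include <mutex>

struct PerDeviceState { /* anything that must exist exactly once per device */ };

class LazyPerDevice {
  std::array<std::unique_ptr<PerDeviceState>, 32> states; // one slot per mask bit
  std::mutex creation_mutex;
  std::atomic<uint32_t> created_mask{0}; // bit i set once device i is ready

public:
  PerDeviceState &get(uint32_t device) {
    // Fast path: a single atomic load, no lock, once the slot has been published.
    if (created_mask.load(std::memory_order_acquire) & (1u << device))
      return *states[device];

    // Slow path: serialize creation and re-check under the lock so two threads
    // racing on the same device build the state only once.
    std::lock_guard<std::mutex> guard(creation_mutex);
    uint32_t mask = created_mask.load(std::memory_order_relaxed);
    if (!(mask & (1u << device))) {
      states[device] = std::make_unique<PerDeviceState>();
      created_mask.store(mask | (1u << device), std::memory_order_release);
    }
    return *states[device];
  }
};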
@@ -221,24 +560,38 @@ uint32_t cuda_is_available() { return cudaSetDevice(0) == cudaSuccess; }
/// or if there's not enough memory. A safe wrapper around it must call
/// cuda_check_valid_malloc() first
void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
  void *ptr = nullptr;
#ifdef USE_MEMORY_MANAGER
  gCudaMemoryManager.get(gpu_index).alloc(&ptr, size, SYNC, gpu_index, 0,
                                          "rust_code", 0);
#else
  cuda_set_device(gpu_index);
  void *ptr;
  check_cuda_error(cudaMalloc((void **)&ptr, size));

#endif
  return ptr;
}

void *cuda_ext_malloc(uint64_t size, uint32_t gpu_index) {
  return cuda_malloc(size, gpu_index);
}

/// Allocates a size-byte array at the device memory. Tries to do it
/// asynchronously.
void *cuda_malloc_with_size_tracking_async(uint64_t size, cudaStream_t stream,
                                           uint32_t gpu_index,
                                           uint64_t &size_tracker,
                                           bool allocate_gpu_memory) {
void *cuda_intern_malloc_with_size_tracking_async(uint64_t size,
                                                  cudaStream_t stream,
                                                  uint32_t gpu_index,
                                                  uint64_t &size_tracker,
                                                  bool allocate_gpu_memory,
                                                  const char *file, int line) {
  size_tracker += size;
  void *ptr = nullptr;
  if (!allocate_gpu_memory)
    return ptr;

#ifdef USE_MEMORY_MANAGER
  gCudaMemoryManager.get(gpu_index).alloc(&ptr, size, ASYNC, gpu_index, stream,
                                          file, line);
#else
  cuda_set_device(gpu_index);

#ifndef CUDART_VERSION
@@ -256,16 +609,23 @@ void *cuda_malloc_with_size_tracking_async(uint64_t size, cudaStream_t stream,
#else
  check_cuda_error(cudaMalloc((void **)&ptr, size));
#endif
#endif

  return ptr;
}

/// Allocates a size-byte array at the device memory. Tries to do it
/// asynchronously.
void *cuda_malloc_async(uint64_t size, cudaStream_t stream,
                        uint32_t gpu_index) {
void *cuda_int_malloc_async(uint64_t size, cudaStream_t stream,
                            uint32_t gpu_index, const char *file, int line) {
  uint64_t size_tracker = 0;
  return cuda_malloc_with_size_tracking_async(size, stream, gpu_index,
                                              size_tracker, true);
  return cuda_intern_malloc_with_size_tracking_async(
      size, stream, gpu_index, size_tracker, true, file, line);
}

void *cuda_ext_malloc_async(uint64_t size, cudaStream_t stream,
                            uint32_t gpu_index) {
  return cuda_malloc_async(size, stream, gpu_index);
}

/// Check that allocation is valid
@@ -340,6 +700,11 @@ void cuda_memcpy_async_to_gpu(void *dest, const void *src, uint64_t size,
                                               gpu_index, true);
}

void cuda_ext_memcpy_async_to_gpu(void *dest, const void *src, uint64_t size,
                                  cudaStream_t stream, uint32_t gpu_index) {
  cuda_memcpy_async_to_gpu(dest, src, size, stream, gpu_index);
}

/// Copy memory within a GPU asynchronously
void cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
    void *dest, void const *src, uint64_t size, cudaStream_t stream,

@@ -372,6 +737,12 @@ void cuda_memcpy_async_gpu_to_gpu(void *dest, void const *src, uint64_t size,
                                                   gpu_index, true);
}

void cuda_ext_memcpy_async_gpu_to_gpu(void *dest, void const *src,
                                      uint64_t size, cudaStream_t stream,
                                      uint32_t gpu_index) {
  cuda_memcpy_async_gpu_to_gpu(dest, src, size, stream, gpu_index);
}

/// Copy memory within a GPU
void cuda_memcpy_gpu_to_gpu(void *dest, void const *src, uint64_t size,
                            uint32_t gpu_index) {

@@ -396,6 +767,11 @@ void cuda_memcpy_gpu_to_gpu(void *dest, void const *src, uint64_t size,
  }
}

void cuda_ext_memcpy_gpu_to_gpu(void *dest, void const *src, uint64_t size,
                                uint32_t gpu_index) {
  cuda_memcpy_gpu_to_gpu(dest, src, size, gpu_index);
}

/// Synchronizes device
void cuda_synchronize_device(uint32_t gpu_index) {
  cuda_set_device(gpu_index);
@@ -408,6 +784,7 @@ void cuda_memset_with_size_tracking_async(void *dest, uint64_t val,
                                          bool gpu_memory_allocated) {
  if (size == 0 || !gpu_memory_allocated)
    return;

  cudaPointerAttributes attr;
  check_cuda_error(cudaPointerGetAttributes(&attr, dest));
  if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {

@@ -415,6 +792,7 @@ void cuda_memset_with_size_tracking_async(void *dest, uint64_t val,
  }
  cuda_set_device(gpu_index);
  check_cuda_error(cudaMemsetAsync(dest, val, size, stream));
  gCudaMemoryManager.get(gpu_index).memset((int8_t *)dest, size, stream, "", 0);
}

void cuda_memset_async(void *dest, uint64_t val, uint64_t size,

@@ -423,6 +801,11 @@ void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
                                       true);
}

void cuda_ext_memset_async(void *dest, uint64_t val, uint64_t size,
                           cudaStream_t stream, uint32_t gpu_index) {
  cuda_memset_async(dest, val, size, stream, gpu_index);
}

template <typename Torus>
__global__ void cuda_set_value_kernel(Torus *array, Torus value, Torus n) {
  int index = threadIdx.x + blockIdx.x * blockDim.x;

@@ -474,6 +857,11 @@ void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
      cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, stream));
}

void cuda_ext_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
                                  cudaStream_t stream, uint32_t gpu_index) {
  cuda_memcpy_async_to_cpu(dest, src, size, stream, gpu_index);
}

/// Return number of GPUs available
int cuda_get_number_of_gpus() {
  int num_gpus;
@@ -489,19 +877,31 @@ int cuda_get_number_of_sms() {
}

/// Drop a cuda array
void cuda_drop(void *ptr, uint32_t gpu_index) {
void cuda_int_drop(void *ptr, uint32_t gpu_index, const char *file, int line) {
#ifdef USE_MEMORY_MANAGER
  gCudaMemoryManager.get(gpu_index).free(ptr, SYNC, gpu_index, 0, file, line);
#else
  cuda_set_device(gpu_index);
  check_cuda_error(cudaFree(ptr));
#endif
}

void cuda_ext_drop(void *ptr, uint32_t gpu_index) { cuda_drop(ptr, gpu_index); }

/// Drop a cuda array asynchronously, if the data was allocated & it's supported
/// on the device
void cuda_drop_with_size_tracking_async(void *ptr, cudaStream_t stream,
                                        uint32_t gpu_index,
                                        bool gpu_memory_allocated) {
void cuda_int_drop_with_size_tracking_async(void *ptr, cudaStream_t stream,
                                            uint32_t gpu_index,
                                            bool gpu_memory_allocated,
                                            const char *file, int line) {

  if (!gpu_memory_allocated)
    return;

#ifdef USE_MEMORY_MANAGER
  gCudaMemoryManager.get(gpu_index).free(ptr, ASYNC, gpu_index, stream, file,
                                         line);
#else
  cuda_set_device(gpu_index);
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!

@@ -518,11 +918,14 @@ void cuda_drop_with_size_tracking_async(void *ptr, cudaStream_t stream,
#else
  check_cuda_error(cudaFree(ptr));
#endif
#endif
}

/// Drop a cuda array asynchronously, if supported on the device
void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index) {
  cuda_drop_with_size_tracking_async(ptr, stream, gpu_index, true);
void cuda_int_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index,
                         const char *file, int line) {
  cuda_int_drop_with_size_tracking_async(ptr, stream, gpu_index, true, file,
                                         line);
}

/// Get the maximum size for the shared memory per streaming multiprocessors
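With the manager enabled, cuda_malloc/cuda_drop become allocate-from-cache-or-device and return-to-cache operations. A hedged usage sketch of the observable behavior, building only on the cuda_ext_* entry points declared in this diff; pointer reuse is the expected outcome when the freed size matches and the cache cap is not exceeded, not a guarantee:

#include <cstdint>
#include <cstdio>

// Assumed declarations, matching the header in this diff (normally one would
// include the backend's device header instead of re-declaring them here).
extern "C" {
void *cuda_ext_malloc(uint64_t size, uint32_t gpu_index);
void cuda_ext_drop(void *ptr, uint32_t gpu_index);
}

int main() {
  const uint64_t size = 1 << 20;      // 1 MiB
  void *a = cuda_ext_malloc(size, 0); // first request: a real cudaMalloc on GPU 0
  cuda_ext_drop(a, 0);                // block is parked in the manager's cache
  void *b = cuda_ext_malloc(size, 0); // same size and same (default) stream key:
                                      // typically served from the cache, so b
                                      // often equals a with no driver call made
  std::printf("a=%p b=%p\n", a, b);
  cuda_ext_drop(b, 0);
  return 0;
}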
@@ -11,21 +11,13 @@ extern "C" {

    pub fn cuda_is_available() -> u32;

    pub fn cuda_malloc(size: u64, gpu_index: u32) -> *mut c_void;
    pub fn cuda_ext_malloc(size: u64, gpu_index: u32) -> *mut c_void;

    pub fn cuda_malloc_with_size_tracking_async(
        size: u64,
        stream: *mut c_void,
        gpu_index: u32,
        size_tracker: *mut u64,
        allocate_gpu_memory: bool,
    ) -> *mut c_void;

    pub fn cuda_malloc_async(size: u64, stream: *mut c_void, gpu_index: u32) -> *mut c_void;
    pub fn cuda_ext_malloc_async(size: u64, stream: *mut c_void, gpu_index: u32) -> *mut c_void;
    pub fn cuda_check_valid_malloc(size: u64, gpu_index: u32) -> bool;
    pub fn cuda_device_total_memory(gpu_index: u32) -> u64;

    pub fn cuda_memcpy_with_size_tracking_async_to_gpu(
    pub fn cuda_ext_memcpy_with_size_tracking_async_to_gpu(
        dest: *mut c_void,
        src: *const c_void,
        size: u64,
@@ -34,7 +26,7 @@ extern "C" {
        gpu_memory_allocated: bool,
    );

    pub fn cuda_memcpy_async_to_gpu(
    pub fn cuda_ext_memcpy_async_to_gpu(
        dest: *mut c_void,
        src: *const c_void,
        size: u64,

@@ -42,9 +34,14 @@ extern "C" {
        gpu_index: u32,
    );

    pub fn cuda_memcpy_gpu_to_gpu(dest: *mut c_void, src: *const c_void, size: u64, gpu_index: u32);
    pub fn cuda_ext_memcpy_gpu_to_gpu(
        dest: *mut c_void,
        src: *const c_void,
        size: u64,
        gpu_index: u32,
    );

    pub fn cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
    pub fn cuda_ext_memcpy_with_size_tracking_async_gpu_to_gpu(
        dest: *mut c_void,
        src: *const c_void,
        size: u64,

@@ -53,7 +50,7 @@ extern "C" {
        gpu_memory_allocated: bool,
    );

    pub fn cuda_memcpy_async_gpu_to_gpu(
    pub fn cuda_ext_memcpy_async_gpu_to_gpu(
        dest: *mut c_void,
        src: *const c_void,
        size: u64,

@@ -61,7 +58,7 @@ extern "C" {
        gpu_index: u32,
    );

    pub fn cuda_memcpy_async_to_cpu(
    pub fn cuda_ext_memcpy_async_to_cpu(
        dest: *mut c_void,
        src: *const c_void,
        size: u64,

@@ -78,7 +75,7 @@ extern "C" {
        gpu_memory_allocated: bool,
    );

    pub fn cuda_memset_async(
    pub fn cuda_ext_memset_async(
        dest: *mut c_void,
        val: u64,
        size: u64,

@@ -92,7 +89,7 @@ extern "C" {

    pub fn cuda_synchronize_device(gpu_index: u32);

    pub fn cuda_drop(ptr: *mut c_void, gpu_index: u32);
    pub fn cuda_ext_drop(ptr: *mut c_void, gpu_index: u32);

    pub fn cuda_drop_with_size_tracking_async(
        ptr: *mut c_void,

@@ -102,7 +99,7 @@ extern "C" {
        allocate_gpu_memory: bool,
    );

    pub fn cuda_drop_async(ptr: *mut c_void, stream: *mut c_void, gpu_index: u32);
    pub fn cuda_ext_drop_async(ptr: *mut c_void, stream: *mut c_void, gpu_index: u32);

    pub fn cuda_setup_multi_gpu(gpu_index: u32) -> i32;
@@ -4,7 +4,7 @@ use crate::core_crypto::prelude::{
    CiphertextModulus, Container, LweCiphertext, LweCiphertextCount, LweCiphertextList,
    LweDimension, LweSize, UnsignedInteger,
};
use tfhe_cuda_backend::cuda_bind::cuda_memcpy_async_gpu_to_gpu;
use tfhe_cuda_backend::cuda_bind::cuda_ext_memcpy_async_gpu_to_gpu;

/// A structure representing a vector of LWE ciphertexts with 64 bits of precision on the GPU.
#[derive(Clone, Debug)]

@@ -123,7 +123,7 @@ impl<T: UnsignedInteger> CudaLweCiphertextList<T> {
            * std::mem::size_of::<T>();
        // Concatenate gpu_index memory
        unsafe {
            cuda_memcpy_async_gpu_to_gpu(
            cuda_ext_memcpy_async_gpu_to_gpu(
                ptr,
                first_item.0.d_vec.as_c_ptr(0),
                size as u64,

@@ -132,7 +132,7 @@ impl<T: UnsignedInteger> CudaLweCiphertextList<T> {
            );
            ptr = ptr.wrapping_byte_add(size);
            for list in cuda_ciphertexts_list_vec {
                cuda_memcpy_async_gpu_to_gpu(
                cuda_ext_memcpy_async_gpu_to_gpu(
                    ptr,
                    list.0.d_vec.as_c_ptr(0),
                    size as u64,
@@ -3,7 +3,9 @@ use crate::core_crypto::gpu::CudaStreams;
use crate::core_crypto::prelude::Numeric;
use std::ffi::c_void;
use std::marker::PhantomData;
use tfhe_cuda_backend::cuda_bind::{cuda_memcpy_async_gpu_to_gpu, cuda_memcpy_async_to_cpu};
use tfhe_cuda_backend::cuda_bind::{
    cuda_ext_memcpy_async_gpu_to_gpu, cuda_ext_memcpy_async_to_cpu,
};

#[derive(Debug, Clone)]
pub struct CudaSlice<'a, T: Numeric> {

@@ -98,7 +100,7 @@ where
        let size = src.len(index) * std::mem::size_of::<T>();
        // We check that src is not empty to avoid invalid pointers
        if size > 0 {
            cuda_memcpy_async_gpu_to_gpu(
            cuda_ext_memcpy_async_gpu_to_gpu(
                self.as_mut_c_ptr(index),
                src.as_c_ptr(index),
                size as u64,

@@ -123,7 +125,7 @@ where
        let size = self.len(index) * std::mem::size_of::<T>();
        // We check that src is not empty to avoid invalid pointers
        if size > 0 {
            cuda_memcpy_async_to_cpu(
            cuda_ext_memcpy_async_to_cpu(
                dest.as_mut_ptr().cast::<c_void>(),
                self.as_c_ptr(index),
                size as u64,
@@ -6,9 +6,9 @@ use std::collections::Bound::{Excluded, Included, Unbounded};
use std::ffi::c_void;
use std::marker::PhantomData;
use tfhe_cuda_backend::cuda_bind::{
    cuda_drop, cuda_malloc, cuda_malloc_async, cuda_memcpy_async_gpu_to_gpu,
    cuda_memcpy_async_to_cpu, cuda_memcpy_async_to_gpu, cuda_memcpy_gpu_to_gpu, cuda_memset_async,
    cuda_synchronize_device,
    cuda_ext_drop, cuda_ext_malloc, cuda_ext_malloc_async, cuda_ext_memcpy_async_gpu_to_gpu,
    cuda_ext_memcpy_async_to_cpu, cuda_ext_memcpy_async_to_gpu, cuda_ext_memcpy_gpu_to_gpu,
    cuda_ext_memset_async, cuda_synchronize_device,
};

#[derive(Clone, Copy, Debug, PartialEq, Eq)]

@@ -74,8 +74,8 @@ impl<T: Numeric> Clone for CudaVec<T> {
        for (index, &gpu_index) in self.gpu_indexes.iter().enumerate() {
            unsafe {
                cuda_synchronize_device(gpu_index.0);
                let ptr = cuda_malloc(size, gpu_index.0);
                cuda_memcpy_gpu_to_gpu(ptr, self.ptr[index], size, gpu_index.0);
                let ptr = cuda_ext_malloc(size, gpu_index.0);
                cuda_ext_memcpy_gpu_to_gpu(ptr, self.ptr[index], size, gpu_index.0);
                cloned_vec.push(ptr);
            }
        }

@@ -101,12 +101,12 @@ impl<T: Numeric> CudaVec<T> {
    /// - `streams` __must__ be synchronized to guarantee computation has finished
    pub unsafe fn new_async(len: usize, streams: &CudaStreams, stream_index: u32) -> Self {
        let size = len as u64 * std::mem::size_of::<T>() as u64;
        let ptr = cuda_malloc_async(
        let ptr = cuda_ext_malloc_async(
            size,
            streams.ptr[stream_index as usize],
            streams.gpu_indexes[stream_index as usize].0,
        );
        cuda_memset_async(
        cuda_ext_memset_async(
            ptr,
            0u64,
            size,

@@ -129,9 +129,9 @@ impl<T: Numeric> CudaVec<T> {
        let mut ptrs = Vec::with_capacity(streams.len());
        for (i, &stream_ptr) in streams.ptr.iter().enumerate() {
            let gpu_index = streams.gpu_indexes[i];
            let ptr = unsafe { cuda_malloc_async(size, stream_ptr, gpu_index.0) };
            let ptr = unsafe { cuda_ext_malloc_async(size, stream_ptr, gpu_index.0) };
            unsafe {
                cuda_memset_async(ptr, 0u64, size, stream_ptr, gpu_index.0);
                cuda_ext_memset_async(ptr, 0u64, size, stream_ptr, gpu_index.0);
            }
            streams.synchronize_one(i as u32);
            ptrs.push(ptr);

@@ -179,7 +179,7 @@ impl<T: Numeric> CudaVec<T> {
        let size = self.len() * std::mem::size_of::<T>();
        // We check that self is not empty to avoid invalid pointers
        if size > 0 {
            cuda_memset_async(
            cuda_ext_memset_async(
                self.as_mut_c_ptr(stream_index),
                value,
                size as u64,

@@ -209,7 +209,7 @@ impl<T: Numeric> CudaVec<T> {
        // We have to check that src is not empty, because Rust slice with size 0 results in an
        // invalid pointer being passed to copy_to_gpu_async
        if size > 0 {
            cuda_memcpy_async_to_gpu(
            cuda_ext_memcpy_async_to_gpu(
                self.as_mut_c_ptr(stream_index),
                src.as_ptr().cast(),
                size as u64,

@@ -237,7 +237,7 @@ impl<T: Numeric> CudaVec<T> {
        // invalid pointer being passed to copy_to_gpu_async
        if size > 0 {
            let gpu_index = streams.gpu_indexes[i];
            cuda_memcpy_async_to_gpu(
            cuda_ext_memcpy_async_to_gpu(
                self.get_mut_c_ptr(i as u32),
                src.as_ptr().cast(),
                size as u64,

@@ -266,7 +266,7 @@ impl<T: Numeric> CudaVec<T> {
        let size = src.len() * std::mem::size_of::<T>();
        // We check that src is not empty to avoid invalid pointers
        if size > 0 {
            cuda_memcpy_async_gpu_to_gpu(
            cuda_ext_memcpy_async_gpu_to_gpu(
                self.as_mut_c_ptr(stream_index),
                src.as_c_ptr(stream_index),
                size as u64,

@@ -305,7 +305,7 @@ impl<T: Numeric> CudaVec<T> {
                .as_c_ptr(stream_index)
                .add(start * std::mem::size_of::<T>());
            let size = (end - start + 1) * std::mem::size_of::<T>();
            cuda_memcpy_async_gpu_to_gpu(
            cuda_ext_memcpy_async_gpu_to_gpu(
                self.as_mut_c_ptr(stream_index),
                src_ptr,
                size as u64,

@@ -342,7 +342,7 @@ impl<T: Numeric> CudaVec<T> {
                .as_mut_c_ptr(stream_index)
                .add(start * std::mem::size_of::<T>());
            let size = (end - start + 1) * std::mem::size_of::<T>();
            cuda_memcpy_async_gpu_to_gpu(
            cuda_ext_memcpy_async_gpu_to_gpu(
                dest_ptr,
                src.as_c_ptr(stream_index),
                size as u64,

@@ -366,7 +366,7 @@ impl<T: Numeric> CudaVec<T> {
        // We have to check that self is not empty, because Rust slice with size 0 results in an
        // invalid pointer being passed to copy_to_cpu_async
        if size > 0 {
            cuda_memcpy_async_to_cpu(
            cuda_ext_memcpy_async_to_cpu(
                dest.as_mut_ptr().cast(),
                self.as_c_ptr(stream_index),
                size as u64,

@@ -484,7 +484,7 @@ impl<T: Numeric> Drop for CudaVec<T> {
            // Synchronizes the device to be sure no stream is still using this pointer
            let gpu_index = self.gpu_indexes[i];
            synchronize_device(gpu_index.0);
            unsafe { cuda_drop(ptr, gpu_index.0) };
            unsafe { cuda_ext_drop(ptr, gpu_index.0) };
        }
    }
}
@@ -20,7 +20,7 @@ use crate::shortint::{AtomicPatternKind, CarryModulus, Ciphertext, MessageModulu
use crate::GpuIndex;
use itertools::Itertools;
use serde::Deserializer;
use tfhe_cuda_backend::cuda_bind::cuda_memcpy_async_to_gpu;
use tfhe_cuda_backend::cuda_bind::cuda_ext_memcpy_async_to_gpu;

#[derive(Clone)]
pub struct CudaCompactCiphertextListInfo {

@@ -271,7 +271,7 @@ impl CudaFlattenedVecCompactCiphertextList {
                let dest_ptr = d_flattened_d_vec
                    .as_mut_c_ptr(0)
                    .add(offset * std::mem::size_of::<u64>());
                cuda_memcpy_async_to_gpu(
                cuda_ext_memcpy_async_to_gpu(
                    dest_ptr,
                    container.as_ptr().cast(),
                    (expected_length * std::mem::size_of::<u64>()) as u64,

@@ -15,7 +15,7 @@ use crate::integer::gpu::ciphertext::squashed_noise::CudaSquashedNoiseRadixCiphe
use crate::integer::gpu::ciphertext::CudaRadixCiphertext;
use crate::integer::gpu::server_key::CudaBootstrappingKey;
use crate::integer::gpu::{
    compress_integer_radix_async, cuda_memcpy_async_gpu_to_gpu, decompress_integer_radix_async_64,
    compress_integer_radix_async, cuda_ext_memcpy_async_gpu_to_gpu, decompress_integer_radix_async_64,
    get_compression_size_on_gpu, get_decompression_size_on_gpu,
};
use crate::prelude::CastInto;

@@ -248,7 +248,7 @@ impl CudaCompressionKey {
                    .as_mut_c_ptr(0)
                    .add(offset * std::mem::size_of::<u64>());
                let size = ciphertext.d_blocks.0.d_vec.len * std::mem::size_of::<u64>();
                cuda_memcpy_async_gpu_to_gpu(
                cuda_ext_memcpy_async_gpu_to_gpu(
                    dest_ptr,
                    ciphertext.d_blocks.0.d_vec.as_c_ptr(0),
                    size as u64,

@@ -577,7 +577,7 @@ impl CudaNoiseSquashingCompressionKey {
                    .as_mut_c_ptr(0)
                    .add(offset * std::mem::size_of::<u128>());
                let size = ciphertext.packed_d_blocks.0.d_vec.len * std::mem::size_of::<u128>();
                cuda_memcpy_async_gpu_to_gpu(
                cuda_ext_memcpy_async_gpu_to_gpu(
                    dest_ptr,
                    ciphertext.packed_d_blocks.0.d_vec.as_c_ptr(0),
                    size as u64,