Mirror of https://github.com/zama-ai/tfhe-rs.git — synced 2026-01-08 22:28:01 -05:00
feat(gpu): add memory management
Committed by Andrei Stoian
parent 70fa68bf52 · commit 846eed184e

Diffstat: Makefile — 13 lines changed
@@ -994,6 +994,19 @@ test_high_level_api: install_rs_build_toolchain
 		--features=boolean,shortint,integer,internal-keycache,zk-pok,strings -p tfhe \
 		-- high_level_api::
 
+test_high_level_api_gpu_one: install_rs_build_toolchain install_cargo_nextest
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test \
+		--features=integer,internal-keycache,gpu,zk-pok -p tfhe \
+		-- --nocapture high_level_api::array::tests::booleans::test_gpu_only_bitand
+
+#
+test_high_level_api_gpu_mul: install_rs_build_toolchain install_cargo_nextest
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test \
+		--features=integer,internal-keycache,gpu-debug --profile release \
+		-p tfhe \
+		-- --nocapture integer::gpu::server_key::radix::tests_unsigned::test_mul:: \
+		--test-threads=6
+
 test_high_level_api_gpu: install_rs_build_toolchain install_cargo_nextest
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \
 		--test-threads=4 --features=integer,internal-keycache,gpu,zk-pok -p tfhe \
@@ -5,6 +5,8 @@
 #include <cstdio>
 #include <cstdlib>
 #include <cuda_runtime.h>
+#include <fstream>
+#include <vector>
 
 #define CUDA_STREAM_POOL
 
@@ -83,13 +85,27 @@ void cuda_synchronize_stream(cudaStream_t stream, uint32_t gpu_index);
 uint32_t cuda_is_available();
 
 void *cuda_malloc(uint64_t size, uint32_t gpu_index);
+void *cuda_ext_malloc(uint64_t size, uint32_t gpu_index);
 
-void *cuda_malloc_with_size_tracking_async(uint64_t size, cudaStream_t stream,
-                                           uint32_t gpu_index,
-                                           uint64_t &size_tracker,
-                                           bool allocate_gpu_memory);
+void *cuda_intern_malloc_with_size_tracking_async(uint64_t size,
+                                                  cudaStream_t stream,
+                                                  uint32_t gpu_index,
+                                                  uint64_t &size_tracker,
+                                                  bool allocate_gpu_memory,
+                                                  const char *file, int line);
 
-void *cuda_malloc_async(uint64_t size, cudaStream_t stream, uint32_t gpu_index);
+#define cuda_malloc_with_size_tracking_async(                                 \
+    size, stream, gpu_index, size_tracker, allocate_gpu_memory)               \
+  cuda_intern_malloc_with_size_tracking_async(                                \
+      size, stream, gpu_index, size_tracker, allocate_gpu_memory, __FILE__,   \
+      __LINE__)
+
+void *cuda_int_malloc_async(uint64_t size, cudaStream_t stream,
+                            uint32_t gpu_index, const char *file, int line);
+#define cuda_malloc_async(size, stream, gpu_index)                            \
+  cuda_int_malloc_async(size, stream, gpu_index, __FILE__, __LINE__)
+
+void *cuda_ext_malloc_async(uint64_t size, cudaStream_t stream,
+                            uint32_t gpu_index);
 
 bool cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
 uint64_t cuda_device_total_memory(uint32_t gpu_index);
@@ -103,18 +119,28 @@ void cuda_memcpy_with_size_tracking_async_to_gpu(void *dest, const void *src,
 void cuda_memcpy_async_to_gpu(void *dest, const void *src, uint64_t size,
                               cudaStream_t stream, uint32_t gpu_index);
 
+void cuda_ext_memcpy_async_to_gpu(void *dest, const void *src, uint64_t size,
+                                  cudaStream_t stream, uint32_t gpu_index);
+
 void cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
     void *dest, void const *src, uint64_t size, cudaStream_t stream,
     uint32_t gpu_index, bool gpu_memory_allocated);
 
 void cuda_memcpy_async_gpu_to_gpu(void *dest, void const *src, uint64_t size,
                                   cudaStream_t stream, uint32_t gpu_index);
+void cuda_ext_memcpy_async_gpu_to_gpu(void *dest, void const *src,
+                                      uint64_t size, cudaStream_t stream,
+                                      uint32_t gpu_index);
 
 void cuda_memcpy_gpu_to_gpu(void *dest, void const *src, uint64_t size,
                             uint32_t gpu_index);
+void cuda_ext_memcpy_gpu_to_gpu(void *dest, void const *src, uint64_t size,
+                                uint32_t gpu_index);
 
 void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
                               cudaStream_t stream, uint32_t gpu_index);
+void cuda_ext_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
+                                  cudaStream_t stream, uint32_t gpu_index);
 
 void cuda_memset_with_size_tracking_async(void *dest, uint64_t val,
                                           uint64_t size, cudaStream_t stream,
@@ -123,6 +149,8 @@ void cuda_memset_with_size_tracking_async(void *dest, uint64_t val,
 
 void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
                        cudaStream_t stream, uint32_t gpu_index);
+void cuda_ext_memset_async(void *dest, uint64_t val, uint64_t size,
+                           cudaStream_t stream, uint32_t gpu_index);
 
 int cuda_get_number_of_gpus();
 
@@ -130,13 +158,26 @@ int cuda_get_number_of_sms();
 
 void cuda_synchronize_device(uint32_t gpu_index);
 
-void cuda_drop(void *ptr, uint32_t gpu_index);
+void cuda_int_drop(void *ptr, uint32_t gpu_index, const char *file, int line);
+#define cuda_drop(ptr, gpu_index)                                             \
+  cuda_int_drop(ptr, gpu_index, __FILE__, __LINE__)
+
+void cuda_ext_drop(void *ptr, uint32_t gpu_index);
 
-void cuda_drop_with_size_tracking_async(void *ptr, cudaStream_t stream,
-                                        uint32_t gpu_index,
-                                        bool gpu_memory_allocated);
+void cuda_int_drop_with_size_tracking_async(void *ptr, cudaStream_t stream,
+                                            uint32_t gpu_index,
+                                            bool gpu_memory_allocated,
+                                            const char *file, int line);
 
-void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index);
+#define cuda_drop_with_size_tracking_async(ptr, stream, gpu_index,            \
+                                           gpu_memory_allocated)              \
+  cuda_int_drop_with_size_tracking_async(                                     \
+      ptr, stream, gpu_index, gpu_memory_allocated, __FILE__, __LINE__)
+
+void cuda_int_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index,
+                         const char *file, int line);
+#define cuda_drop_async(ptr, stream, gpu_index)                               \
+  cuda_int_drop_async(ptr, stream, gpu_index, __FILE__, __LINE__)
 }
 
 uint32_t cuda_get_max_shared_memory(uint32_t gpu_index);
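A note on the header changes above: the existing entry points (cuda_malloc_async, cuda_drop, cuda_drop_with_size_tracking_async, ...) become macros that expand to cuda_int_*/cuda_intern_* functions taking the caller's __FILE__ and __LINE__, so every existing C++ call site keeps compiling while the memory manager learns where each allocation or free was issued; the new cuda_ext_* declarations stay plain functions so they can still be bound from Rust. Below is a minimal, self-contained sketch of the same call-site-capture idea — the names are illustrative only and are not part of the tfhe-rs API.

#include <cstdio>
#include <cstdlib>

// The real implementation receives the caller's file/line for bookkeeping.
void *tracked_alloc_impl(size_t size, const char *file, int line) {
  std::printf("alloc of %zu bytes requested at %s:%d\n", size, file, line);
  return std::malloc(size);
}

// Callers keep writing `tracked_alloc(n)`; the macro injects the call site.
#define tracked_alloc(size) tracked_alloc_impl((size), __FILE__, __LINE__)

int main() {
  void *p = tracked_alloc(128); // reported as coming from this file and line
  std::free(p);
}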
@@ -3,18 +3,33 @@
 #include <atomic>
 #include <cstdint>
 #include <cuda_runtime.h>
+#include <deque>
+#include <unordered_map>
 #include <mutex>
 
 #ifdef USE_NVTOOLS
 #include <cuda_profiler_api.h>
 #endif
 
 #ifdef CUDA_STREAM_POOL
 #include <deque>
-#include <mutex>
 #include <vector>
 #include <unordered_map>
 #endif
 
+#include <bits/this_thread_sleep.h>
+
+#define USE_MEMORY_MANAGER
+// #define DEBUG_MEMORY_MANAGER
+#define MAX_CACHE_SIZE (1 << 30)
+
+#ifdef USE_MEMORY_MANAGER
+#include <list>
+#include <sstream>
+#include <string>
+#include <thread>
+#endif
+
 uint32_t cuda_get_device() {
   int device;
   check_cuda_error(cudaGetDevice(&device));
@@ -99,6 +114,330 @@ void cuda_set_device(uint32_t gpu_index) {
 #endif
 }
 
+#ifdef USE_MEMORY_MANAGER
+
+enum CudaMemBlockUsageType { CUDA_ALLOC = 0, MEMSET, MEMCPY_SRC, MEMCPY_DEST, FREE };
+
+enum CudaAllocType { SYNC = 0, ASYNC };
+
+#ifdef DEBUG_MEMORY_MANAGER
+struct CudaMemBlockUsage {
+  std::string location;
+  uint64_t timestamp;
+  CudaMemBlockUsageType type;
+};
+#endif
+
+struct CudaMemBlock {
+  int8_t *ptr;
+  uint64_t size;
+  cudaStream_t stream;
+  uint32_t gpu_index;
+  size_t thread_id;
+  CudaAllocType alloc_type;
+#ifdef DEBUG_MEMORY_MANAGER
+  std::vector<CudaMemBlockUsage> usages;
+#endif
+};
+
+class CudaMemoryManager {
+  std::list<CudaMemBlock> cuda_allocs; // fresh allocs
+  std::list<CudaMemBlock> cuda_freed;  // freed for good
+
+  std::unordered_map<cudaStream_t,
+                     std::unordered_map<uint64_t, std::deque<CudaMemBlock>>>
+      cache; // freed and re-used
+  uint64_t cache_size = 0, peak_cache_size = 0;
+
+  std::mutex allocs_mutex;
+
+#ifdef DEBUG_MEMORY_MANAGER
+  std::string make_location(const char *file, int line) {
+    std::stringstream sstr;
+    sstr << file << ":" << line;
+    return sstr.str();
+  }
+  uint64_t make_timestamp() {
+    const std::chrono::time_point<std::chrono::system_clock> now =
+        std::chrono::system_clock::now();
+
+    auto us = std::chrono::duration_cast<std::chrono::microseconds>(
+                  now.time_since_epoch())
+                  .count() %
+              1000000;
+    return us;
+  }
+
+  void check_range_is_valid(CudaMemBlockUsageType usage_type, int8_t *dest,
+                            uint64_t size, cudaStream_t stream,
+                            const char *file, int line) {
+    CudaMemBlockUsage usage = {make_location(file, line), make_timestamp(),
+                               usage_type};
+
+    const char *info = NULL;
+    switch (usage_type) {
+    case MEMSET:
+      info = "memset";
+      break;
+    case MEMCPY_SRC:
+      info = "memcpy source";
+      break;
+    case MEMCPY_DEST:
+      info = "memcpy dest";
+      break;
+    default:
+      info = "unknown";
+    }
+
+    auto device_id = cuda_get_device();
+
+    bool found = false;
+    for (auto it = cuda_allocs.begin(); it != cuda_allocs.end(); it++) {
+      if (it->ptr == dest && it->gpu_index == device_id) {
+        printf("%s with size tracking: found ptr %p\n", info, dest);
+        if (size > it->size) {
+          PANIC("%s OF %lu bytes TOO BIG TO %p OF SIZE %ld\n", info, size, dest,
+                it->size);
+        }
+        it->usages.push_back(usage);
+        found = true;
+      } else {
+        if (dest > it->ptr && dest < it->ptr + it->size &&
+            it->gpu_index == device_id) {
+          printf("%s with size tracking: indirect ptr %p in buffer %p\n", info,
+                 dest, it->ptr);
+          if (dest + size > it->ptr + it->size) {
+            auto remain_bytes = it->ptr + it->size - dest;
+            PANIC("%s OF %lu bytes TOO BIG TO %p WHICH HAS ROOM ONLY FOR %d\n",
+                  info, size, dest, remain_bytes);
+          }
+          it->usages.push_back(usage);
+          found = true;
+        }
+      }
+    }
+    if (!found) {
+      PANIC("Cuda %s to %p of size %lu, unknown pointer", info, dest, size);
+    }
+  }
+#endif
+
+public:
+  void alloc(void **ptr, uint64_t size, CudaAllocType alloc_type,
+             uint32_t gpu_index, cudaStream_t stream, const char *file,
+             int line) {
+    std::lock_guard<std::mutex> guard(allocs_mutex);
+
+    auto cache_of_stream = cache.find(stream);
+    if (cache_of_stream != cache.end()) {
+      auto cache_of_size = cache_of_stream->second.find(size);
+      if (cache_of_size != cache_of_stream->second.end() &&
+          !cache_of_size->second.empty()) {
+        auto cached_alloc = cache_of_size->second.front();
+        cache_of_size->second.pop_front();
+
+        // move to active allocs
+        cuda_allocs.push_back(cached_alloc);
+        *ptr = cached_alloc.ptr;
+
+        if (cache_size < size) {
+          PANIC("INVALID CACHE USE!!");
+        }
+
+        cache_size -= size;
+
+#ifdef DEBUG_MEMORY_MANAGER
+        printf("Cuda Allocation serviced from cache: %p of size %lu on gpu %d "
+               "in %s\n",
+               ptr, size, gpu_index, "");
+#endif
+        return;
+      }
+    }
+
+    cuda_set_device(gpu_index);
+    if (alloc_type == SYNC) {
+      check_cuda_error(cudaMalloc(ptr, size));
+    } else if (alloc_type == ASYNC) {
+#ifndef CUDART_VERSION
+#error CUDART_VERSION Undefined!
+#elif (CUDART_VERSION >= 11020)
+      int support_async_alloc;
+      check_cuda_error(cudaDeviceGetAttribute(
+          &support_async_alloc, cudaDevAttrMemoryPoolsSupported, gpu_index));
+
+      if (support_async_alloc) {
+        check_cuda_error(cudaMallocAsync(ptr, size, stream));
+      } else {
+        check_cuda_error(cudaMalloc(ptr, size));
+      }
+#else
+      check_cuda_error(cudaMalloc((void **)&ptr, size));
+#endif
+    } else {
+      PANIC("Invalid allocation mode");
+    }
+
+    if (*ptr == nullptr) {
+      if (size > 0) {
+        PANIC("Allocation failed for %lu bytes, allocator returned %p", size,
+              ptr);
+      }
+      return;
+    }
+
+    auto thread_id = std::hash<std::thread::id>{}(std::this_thread::get_id());
+    CudaMemBlock block = {(int8_t *)*ptr, size,      stream,
+                          gpu_index,      thread_id, alloc_type};
+#ifdef DEBUG_MEMORY_MANAGER
+    CudaMemBlockUsage usage = {make_location(file, line), make_timestamp(),
+                               CUDA_ALLOC};
+    block.usages.push_back(usage);
+
+    printf("Cuda Allocated %p of size %lu on gpu %d in %s\n", ptr, size,
+           gpu_index, usage.location.c_str());
+#endif
+
+    cuda_allocs.push_back(block);
+  }
+  void memset(int8_t *dest, uint64_t size, cudaStream_t stream,
+              const char *file, int line) {
+#ifdef DEBUG_MEMORY_MANAGER
+    std::lock_guard<std::mutex> guard(allocs_mutex);
+
+    check_range_is_valid(MEMSET, dest, size, stream, file, line);
+#endif
+  }
+
+  void memcpy(int8_t *dest, int8_t *src, uint64_t size, cudaStream_t stream,
+              const char *file, int line) {
+#ifdef DEBUG_MEMORY_MANAGER
+    std::lock_guard<std::mutex> guard(allocs_mutex);
+
+    check_range_is_valid(MEMCPY_SRC, src, size, stream, file, line);
+    check_range_is_valid(MEMCPY_DEST, src, size, stream, file, line);
+#endif
+  }
+
+  void free(void *ptr, CudaAllocType alloc_type, uint32_t gpu_index,
+            cudaStream_t stream, const char *file, int line) {
+    if (ptr == nullptr)
+      return;
+
+    std::lock_guard<std::mutex> guard(allocs_mutex);
+
+    bool found = false;
+    bool must_free = false;
+
+    for (auto it = cuda_allocs.begin(); it != cuda_allocs.end(); it++) {
+      if (it->ptr == ptr && it->gpu_index == gpu_index) {
+        found = true;
+
+        if (cache_size + it->size < (MAX_CACHE_SIZE)) {
+          cache[stream][it->size].push_back(*it);
+          cache_size += it->size;
+          if (peak_cache_size < cache_size) {
+            peak_cache_size = cache_size;
+          }
+        } else {
+          cuda_freed.push_back(*it);
+          must_free = true;
+        }
+#ifdef DEBUG_MEMORY_MANAGER
+        printf("cuda dropped buffer %p of size %lu on gpu %d\n", ptr, it->size,
+               gpu_index);
+#endif
+        cuda_allocs.erase(it++);
+      }
+    }
+
+    if (must_free) {
+      cuda_set_device(gpu_index);
+      if (alloc_type == SYNC) {
+        check_cuda_error(cudaFree(ptr));
+      } else if (alloc_type == ASYNC) {
+#ifndef CUDART_VERSION
+#error CUDART_VERSION Undefined!
+#elif (CUDART_VERSION >= 11020)
+        int support_async_alloc;
+        check_cuda_error(cudaDeviceGetAttribute(
+            &support_async_alloc, cudaDevAttrMemoryPoolsSupported, gpu_index));
+
+        if (support_async_alloc) {
+          check_cuda_error(cudaFreeAsync(ptr, stream));
+        } else {
+          check_cuda_error(cudaFree(ptr));
+        }
+#else
+        check_cuda_error(cudaFree(ptr));
+#endif
+      }
+    }
+
+#ifdef DEBUG_MEMORY_MANAGER
+    if (!found) {
+      for (auto it = cuda_freed.begin(); it != cuda_freed.end(); it++) {
+        if (it->ptr == ptr && it->gpu_index == gpu_index) {
+          found = true;
+          printf("Drop in %s: %d\n", file, line);
+          printf("Alloc in %s\n", it->usages[0].location.c_str());
+          PANIC("cuda drop already dropped buffer %p of size %lu on gpu %d\n",
+                ptr, it->size, gpu_index);
+        }
+      }
+    }
+
+    if (!found) {
+      PANIC("cuda drop unknown buffer %p\n", ptr);
+    }
+#endif
+  }
+
+  ~CudaMemoryManager() {
+#ifdef DEBUG_MEMORY_MANAGER
+    printf("%lu ALLOCATIONS AT PROGRAM EXIT\n", cuda_allocs.size());
+
+    for (auto &cuda_alloc : cuda_allocs) {
+      printf("%p of size %lu allocated at %s\n", cuda_alloc.ptr,
+             cuda_alloc.size, cuda_alloc.usages[0].location.c_str());
+    }
+
+    printf("\n\n\n %llu PEAK CACHE SIZE\n", peak_cache_size);
+
+    for (auto &cache_for_size : cache) {
+      for (auto &cuda_alloc : cache_for_size.second) {
+        printf("%p of size %lu cached at %s\n", cuda_alloc.ptr, cuda_alloc.size,
+               cuda_alloc.usages[0].location.c_str());
+      }
+    }
+#endif
+  }
+};
+
+class CudaMultiGPUMemoryManager {
+  std::unordered_map<uint32_t, CudaMemoryManager> gMemManagers;
+  std::mutex gMemManagersMutex; // for creation of the mem managers
+  std::atomic<uint32_t> gMemManagerExists = 0;
+
+public:
+  CudaMemoryManager &get(uint32_t gpu_index) {
+    if (gMemManagerExists.load() & (1 << gpu_index)) {
+      return gMemManagers[gpu_index];
+    } else {
+      std::lock_guard<std::mutex> guard(gMemManagersMutex);
+      uint32_t exist_flags = gMemManagerExists.load();
+      if (!(exist_flags & (1 << gpu_index))) {
+        gMemManagers[gpu_index]; // create it
+        gMemManagerExists.store(exist_flags | (1 << gpu_index));
+      }
+      return gMemManagers[gpu_index];
+    }
+  }
+};
+
+CudaMultiGPUMemoryManager gCudaMemoryManager;
+#endif
+
 cudaEvent_t cuda_create_event(uint32_t gpu_index) {
   cuda_set_device(gpu_index);
   cudaEvent_t event;
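The CudaMemoryManager above keeps freed blocks in a cache keyed first by stream and then by exact allocation size, bounded by MAX_CACHE_SIZE (1 GiB); an allocation request is first served from that cache and only falls back to the CUDA allocator on a miss. A stripped-down, CUDA-free sketch of that lookup/recycle structure follows (the BlockCache type is hypothetical and purely illustrative):

#include <cstdint>
#include <deque>
#include <unordered_map>

// Minimal sketch of the (stream, size) -> free-block cache used above.
// `Stream` stands in for cudaStream_t; blocks are plain pointers here.
using Stream = void *;

struct BlockCache {
  std::unordered_map<Stream, std::unordered_map<uint64_t, std::deque<void *>>>
      cache;
  uint64_t cache_size = 0;
  static constexpr uint64_t kMaxCacheSize = 1ull << 30; // mirrors MAX_CACHE_SIZE

  // Returns a recycled block of exactly `size` bytes for `stream`, or nullptr.
  void *take(Stream stream, uint64_t size) {
    auto s = cache.find(stream);
    if (s == cache.end())
      return nullptr;
    auto b = s->second.find(size);
    if (b == s->second.end() || b->second.empty())
      return nullptr;
    void *ptr = b->second.front();
    b->second.pop_front();
    cache_size -= size;
    return ptr;
  }

  // Parks a freed block for reuse; returns false when the cache is full and
  // the caller should really release the memory.
  bool put(Stream stream, uint64_t size, void *ptr) {
    if (cache_size + size >= kMaxCacheSize)
      return false;
    cache[stream][size].push_back(ptr);
    cache_size += size;
    return true;
  }
};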
@@ -221,24 +560,38 @@ uint32_t cuda_is_available() { return cudaSetDevice(0) == cudaSuccess; }
 /// or if there's not enough memory. A safe wrapper around it must call
 /// cuda_check_valid_malloc() first
 void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
+  void *ptr = nullptr;
+#ifdef USE_MEMORY_MANAGER
+  gCudaMemoryManager.get(gpu_index).alloc(&ptr, size, SYNC, gpu_index, 0,
+                                          "rust_code", 0);
+#else
   cuda_set_device(gpu_index);
-  void *ptr;
   check_cuda_error(cudaMalloc((void **)&ptr, size));
+#endif
   return ptr;
 }
 
+void *cuda_ext_malloc(uint64_t size, uint32_t gpu_index) {
+  return cuda_malloc(size, gpu_index);
+}
+
 /// Allocates a size-byte array at the device memory. Tries to do it
 /// asynchronously.
-void *cuda_malloc_with_size_tracking_async(uint64_t size, cudaStream_t stream,
-                                           uint32_t gpu_index,
-                                           uint64_t &size_tracker,
-                                           bool allocate_gpu_memory) {
+void *cuda_intern_malloc_with_size_tracking_async(uint64_t size,
+                                                  cudaStream_t stream,
+                                                  uint32_t gpu_index,
+                                                  uint64_t &size_tracker,
+                                                  bool allocate_gpu_memory,
+                                                  const char *file, int line) {
   size_tracker += size;
   void *ptr = nullptr;
   if (!allocate_gpu_memory)
     return ptr;
+
+#ifdef USE_MEMORY_MANAGER
+  gCudaMemoryManager.get(gpu_index).alloc(&ptr, size, ASYNC, gpu_index, stream,
+                                          file, line);
+#else
   cuda_set_device(gpu_index);
 
 #ifndef CUDART_VERSION
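On the ASYNC path, both the manager and the pre-existing fallback kept under #else probe cudaDevAttrMemoryPoolsSupported and use cudaMallocAsync on the caller's stream when the toolkit is at least 11.2, otherwise they fall back to a synchronous cudaMalloc. A standalone sketch of that probe-and-fallback, with simplified error handling and an illustrative function name, is shown here:

#include <cuda_runtime.h>

// Sketch of the pool-support probe used by the ASYNC allocation path.
// Returns nullptr on failure instead of panicking, to keep the example small.
void *alloc_async_or_fallback(size_t size, cudaStream_t stream, int gpu_index) {
  void *ptr = nullptr;
#if CUDART_VERSION >= 11020
  int pools_supported = 0;
  cudaDeviceGetAttribute(&pools_supported, cudaDevAttrMemoryPoolsSupported,
                         gpu_index);
  if (pools_supported) {
    if (cudaMallocAsync(&ptr, size, stream) != cudaSuccess)
      return nullptr;
    return ptr;
  }
#endif
  // Synchronous fallback when stream-ordered allocation is unavailable.
  if (cudaMalloc(&ptr, size) != cudaSuccess)
    return nullptr;
  return ptr;
}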
@@ -256,16 +609,23 @@ void *cuda_malloc_with_size_tracking_async(uint64_t size, cudaStream_t stream,
 #else
   check_cuda_error(cudaMalloc((void **)&ptr, size));
 #endif
+#endif
+
   return ptr;
 }
 
 /// Allocates a size-byte array at the device memory. Tries to do it
 /// asynchronously.
-void *cuda_malloc_async(uint64_t size, cudaStream_t stream,
-                        uint32_t gpu_index) {
+void *cuda_int_malloc_async(uint64_t size, cudaStream_t stream,
+                            uint32_t gpu_index, const char *file, int line) {
   uint64_t size_tracker = 0;
-  return cuda_malloc_with_size_tracking_async(size, stream, gpu_index,
-                                              size_tracker, true);
+  return cuda_intern_malloc_with_size_tracking_async(
+      size, stream, gpu_index, size_tracker, true, file, line);
+}
+
+void *cuda_ext_malloc_async(uint64_t size, cudaStream_t stream,
+                            uint32_t gpu_index) {
+  return cuda_malloc_async(size, stream, gpu_index);
 }
 
 /// Check that allocation is valid
@@ -340,6 +700,11 @@ void cuda_memcpy_async_to_gpu(void *dest, const void *src, uint64_t size,
                                              gpu_index, true);
 }
 
+void cuda_ext_memcpy_async_to_gpu(void *dest, const void *src, uint64_t size,
+                                  cudaStream_t stream, uint32_t gpu_index) {
+  cuda_memcpy_async_to_gpu(dest, src, size, stream, gpu_index);
+}
+
 /// Copy memory within a GPU asynchronously
 void cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
     void *dest, void const *src, uint64_t size, cudaStream_t stream,
@@ -372,6 +737,12 @@ void cuda_memcpy_async_gpu_to_gpu(void *dest, void const *src, uint64_t size,
                                                gpu_index, true);
 }
 
+void cuda_ext_memcpy_async_gpu_to_gpu(void *dest, void const *src,
+                                      uint64_t size, cudaStream_t stream,
+                                      uint32_t gpu_index) {
+  cuda_memcpy_async_gpu_to_gpu(dest, src, size, stream, gpu_index);
+}
+
 /// Copy memory within a GPU
 void cuda_memcpy_gpu_to_gpu(void *dest, void const *src, uint64_t size,
                             uint32_t gpu_index) {
@@ -396,6 +767,11 @@ void cuda_memcpy_gpu_to_gpu(void *dest, void const *src, uint64_t size,
   }
 }
 
+void cuda_ext_memcpy_gpu_to_gpu(void *dest, void const *src, uint64_t size,
+                                uint32_t gpu_index) {
+  cuda_memcpy_gpu_to_gpu(dest, src, size, gpu_index);
+}
+
 /// Synchronizes device
 void cuda_synchronize_device(uint32_t gpu_index) {
   cuda_set_device(gpu_index);
@@ -408,6 +784,7 @@ void cuda_memset_with_size_tracking_async(void *dest, uint64_t val,
                                           bool gpu_memory_allocated) {
   if (size == 0 || !gpu_memory_allocated)
     return;
+
   cudaPointerAttributes attr;
   check_cuda_error(cudaPointerGetAttributes(&attr, dest));
   if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
@@ -415,6 +792,7 @@ void cuda_memset_with_size_tracking_async(void *dest, uint64_t val,
   }
   cuda_set_device(gpu_index);
   check_cuda_error(cudaMemsetAsync(dest, val, size, stream));
+  gCudaMemoryManager.get(gpu_index).memset((int8_t *)dest, size, stream, "", 0);
 }
 
 void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
@@ -423,6 +801,11 @@ void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
                                         true);
 }
 
+void cuda_ext_memset_async(void *dest, uint64_t val, uint64_t size,
+                           cudaStream_t stream, uint32_t gpu_index) {
+  cuda_memset_async(dest, val, size, stream, gpu_index);
+}
+
 template <typename Torus>
 __global__ void cuda_set_value_kernel(Torus *array, Torus value, Torus n) {
   int index = threadIdx.x + blockIdx.x * blockDim.x;
@@ -474,6 +857,11 @@ void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
       cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, stream));
 }
 
+void cuda_ext_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
+                                  cudaStream_t stream, uint32_t gpu_index) {
+  cuda_memcpy_async_to_cpu(dest, src, size, stream, gpu_index);
+}
+
 /// Return number of GPUs available
 int cuda_get_number_of_gpus() {
   int num_gpus;
@@ -489,19 +877,31 @@ int cuda_get_number_of_sms() {
 }
 
 /// Drop a cuda array
-void cuda_drop(void *ptr, uint32_t gpu_index) {
+void cuda_int_drop(void *ptr, uint32_t gpu_index, const char *file, int line) {
+#ifdef USE_MEMORY_MANAGER
+  gCudaMemoryManager.get(gpu_index).free(ptr, SYNC, gpu_index, 0, file, line);
+#else
   cuda_set_device(gpu_index);
   check_cuda_error(cudaFree(ptr));
+#endif
 }
 
+void cuda_ext_drop(void *ptr, uint32_t gpu_index) { cuda_drop(ptr, gpu_index); }
+
 /// Drop a cuda array asynchronously, if the data was allocated & it's supported
 /// on the device
-void cuda_drop_with_size_tracking_async(void *ptr, cudaStream_t stream,
-                                        uint32_t gpu_index,
-                                        bool gpu_memory_allocated) {
+void cuda_int_drop_with_size_tracking_async(void *ptr, cudaStream_t stream,
+                                            uint32_t gpu_index,
+                                            bool gpu_memory_allocated,
+                                            const char *file, int line) {
+
   if (!gpu_memory_allocated)
     return;
+
+#ifdef USE_MEMORY_MANAGER
+  gCudaMemoryManager.get(gpu_index).free(ptr, ASYNC, gpu_index, stream, file,
+                                         line);
+#else
   cuda_set_device(gpu_index);
 #ifndef CUDART_VERSION
 #error CUDART_VERSION Undefined!
@@ -518,11 +918,14 @@ void cuda_drop_with_size_tracking_async(void *ptr, cudaStream_t stream,
 #else
   check_cuda_error(cudaFree(ptr));
 #endif
+#endif
 }
 
 /// Drop a cuda array asynchronously, if supported on the device
-void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index) {
-  cuda_drop_with_size_tracking_async(ptr, stream, gpu_index, true);
+void cuda_int_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index,
+                         const char *file, int line) {
+  cuda_int_drop_with_size_tracking_async(ptr, stream, gpu_index, true, file,
+                                         line);
 }
 
 /// Get the maximum size for the shared memory per streaming multiprocessors
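Per-GPU managers are created lazily: CudaMultiGPUMemoryManager::get checks an atomic bitmask (one bit per GPU index) and only takes the creation mutex when the bit is not yet set. A condensed sketch of that double-checked initialization follows; the PerGpuRegistry type is illustrative only, and, as in the diff, the lock-free fast path assumes the map is no longer mutated once a manager's bit has been published.

#include <atomic>
#include <cstdint>
#include <mutex>
#include <unordered_map>

struct Manager { /* per-GPU state */ };

class PerGpuRegistry {
  std::unordered_map<uint32_t, Manager> managers;
  std::mutex creation_mutex;
  std::atomic<uint32_t> exists_bits{0};

public:
  Manager &get(uint32_t gpu_index) {
    if (exists_bits.load() & (1u << gpu_index))
      return managers[gpu_index]; // fast path: manager already created
    std::lock_guard<std::mutex> guard(creation_mutex);
    uint32_t flags = exists_bits.load();
    if (!(flags & (1u << gpu_index))) {
      managers[gpu_index]; // default-construct the manager for this GPU
      exists_bits.store(flags | (1u << gpu_index));
    }
    return managers[gpu_index];
  }
};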
@@ -11,21 +11,13 @@ extern "C" {
 
     pub fn cuda_is_available() -> u32;
 
-    pub fn cuda_malloc(size: u64, gpu_index: u32) -> *mut c_void;
+    pub fn cuda_ext_malloc(size: u64, gpu_index: u32) -> *mut c_void;
 
-    pub fn cuda_malloc_with_size_tracking_async(
-        size: u64,
-        stream: *mut c_void,
-        gpu_index: u32,
-        size_tracker: *mut u64,
-        allocate_gpu_memory: bool,
-    ) -> *mut c_void;
-
-    pub fn cuda_malloc_async(size: u64, stream: *mut c_void, gpu_index: u32) -> *mut c_void;
+    pub fn cuda_ext_malloc_async(size: u64, stream: *mut c_void, gpu_index: u32) -> *mut c_void;
 
     pub fn cuda_check_valid_malloc(size: u64, gpu_index: u32) -> bool;
     pub fn cuda_device_total_memory(gpu_index: u32) -> u64;
 
-    pub fn cuda_memcpy_with_size_tracking_async_to_gpu(
+    pub fn cuda_ext_memcpy_with_size_tracking_async_to_gpu(
         dest: *mut c_void,
         src: *const c_void,
         size: u64,
@@ -34,7 +26,7 @@ extern "C" {
         gpu_memory_allocated: bool,
     );
 
-    pub fn cuda_memcpy_async_to_gpu(
+    pub fn cuda_ext_memcpy_async_to_gpu(
         dest: *mut c_void,
         src: *const c_void,
         size: u64,
@@ -42,9 +34,14 @@ extern "C" {
         gpu_index: u32,
     );
 
-    pub fn cuda_memcpy_gpu_to_gpu(dest: *mut c_void, src: *const c_void, size: u64, gpu_index: u32);
+    pub fn cuda_ext_memcpy_gpu_to_gpu(
+        dest: *mut c_void,
+        src: *const c_void,
+        size: u64,
+        gpu_index: u32,
+    );
 
-    pub fn cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
+    pub fn cuda_ext_memcpy_with_size_tracking_async_gpu_to_gpu(
         dest: *mut c_void,
         src: *const c_void,
         size: u64,
@@ -53,7 +50,7 @@ extern "C" {
        gpu_memory_allocated: bool,
    );
 
-    pub fn cuda_memcpy_async_gpu_to_gpu(
+    pub fn cuda_ext_memcpy_async_gpu_to_gpu(
        dest: *mut c_void,
        src: *const c_void,
        size: u64,
@@ -61,7 +58,7 @@ extern "C" {
        gpu_index: u32,
    );
 
-    pub fn cuda_memcpy_async_to_cpu(
+    pub fn cuda_ext_memcpy_async_to_cpu(
        dest: *mut c_void,
        src: *const c_void,
        size: u64,
@@ -78,7 +75,7 @@ extern "C" {
        gpu_memory_allocated: bool,
    );
 
-    pub fn cuda_memset_async(
+    pub fn cuda_ext_memset_async(
        dest: *mut c_void,
        val: u64,
        size: u64,
@@ -92,7 +89,7 @@ extern "C" {
 
     pub fn cuda_synchronize_device(gpu_index: u32);
 
-    pub fn cuda_drop(ptr: *mut c_void, gpu_index: u32);
+    pub fn cuda_ext_drop(ptr: *mut c_void, gpu_index: u32);
 
     pub fn cuda_drop_with_size_tracking_async(
        ptr: *mut c_void,
@@ -102,7 +99,7 @@ extern "C" {
        allocate_gpu_memory: bool,
    );
 
-    pub fn cuda_drop_async(ptr: *mut c_void, stream: *mut c_void, gpu_index: u32);
+    pub fn cuda_ext_drop_async(ptr: *mut c_void, stream: *mut c_void, gpu_index: u32);
 
     pub fn cuda_setup_multi_gpu(gpu_index: u32) -> i32;
 
@@ -4,7 +4,7 @@ use crate::core_crypto::prelude::{
     CiphertextModulus, Container, LweCiphertext, LweCiphertextCount, LweCiphertextList,
     LweDimension, LweSize, UnsignedInteger,
 };
-use tfhe_cuda_backend::cuda_bind::cuda_memcpy_async_gpu_to_gpu;
+use tfhe_cuda_backend::cuda_bind::cuda_ext_memcpy_async_gpu_to_gpu;
 
 /// A structure representing a vector of LWE ciphertexts with 64 bits of precision on the GPU.
 #[derive(Clone, Debug)]
@@ -123,7 +123,7 @@ impl<T: UnsignedInteger> CudaLweCiphertextList<T> {
             * std::mem::size_of::<T>();
         // Concatenate gpu_index memory
         unsafe {
-            cuda_memcpy_async_gpu_to_gpu(
+            cuda_ext_memcpy_async_gpu_to_gpu(
                 ptr,
                 first_item.0.d_vec.as_c_ptr(0),
                 size as u64,
@@ -132,7 +132,7 @@ impl<T: UnsignedInteger> CudaLweCiphertextList<T> {
             );
             ptr = ptr.wrapping_byte_add(size);
             for list in cuda_ciphertexts_list_vec {
-                cuda_memcpy_async_gpu_to_gpu(
+                cuda_ext_memcpy_async_gpu_to_gpu(
                     ptr,
                     list.0.d_vec.as_c_ptr(0),
                     size as u64,
@@ -3,7 +3,9 @@ use crate::core_crypto::gpu::CudaStreams;
 use crate::core_crypto::prelude::Numeric;
 use std::ffi::c_void;
 use std::marker::PhantomData;
-use tfhe_cuda_backend::cuda_bind::{cuda_memcpy_async_gpu_to_gpu, cuda_memcpy_async_to_cpu};
+use tfhe_cuda_backend::cuda_bind::{
+    cuda_ext_memcpy_async_gpu_to_gpu, cuda_ext_memcpy_async_to_cpu,
+};
 
 #[derive(Debug, Clone)]
 pub struct CudaSlice<'a, T: Numeric> {
@@ -98,7 +100,7 @@ where
         let size = src.len(index) * std::mem::size_of::<T>();
         // We check that src is not empty to avoid invalid pointers
         if size > 0 {
-            cuda_memcpy_async_gpu_to_gpu(
+            cuda_ext_memcpy_async_gpu_to_gpu(
                 self.as_mut_c_ptr(index),
                 src.as_c_ptr(index),
                 size as u64,
@@ -123,7 +125,7 @@ where
         let size = self.len(index) * std::mem::size_of::<T>();
         // We check that src is not empty to avoid invalid pointers
         if size > 0 {
-            cuda_memcpy_async_to_cpu(
+            cuda_ext_memcpy_async_to_cpu(
                 dest.as_mut_ptr().cast::<c_void>(),
                 self.as_c_ptr(index),
                 size as u64,
@@ -6,9 +6,9 @@ use std::collections::Bound::{Excluded, Included, Unbounded};
 use std::ffi::c_void;
 use std::marker::PhantomData;
 use tfhe_cuda_backend::cuda_bind::{
-    cuda_drop, cuda_malloc, cuda_malloc_async, cuda_memcpy_async_gpu_to_gpu,
-    cuda_memcpy_async_to_cpu, cuda_memcpy_async_to_gpu, cuda_memcpy_gpu_to_gpu, cuda_memset_async,
-    cuda_synchronize_device,
+    cuda_ext_drop, cuda_ext_malloc, cuda_ext_malloc_async, cuda_ext_memcpy_async_gpu_to_gpu,
+    cuda_ext_memcpy_async_to_cpu, cuda_ext_memcpy_async_to_gpu, cuda_ext_memcpy_gpu_to_gpu,
+    cuda_ext_memset_async, cuda_synchronize_device,
 };
 
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
@@ -74,8 +74,8 @@ impl<T: Numeric> Clone for CudaVec<T> {
         for (index, &gpu_index) in self.gpu_indexes.iter().enumerate() {
             unsafe {
                 cuda_synchronize_device(gpu_index.0);
-                let ptr = cuda_malloc(size, gpu_index.0);
-                cuda_memcpy_gpu_to_gpu(ptr, self.ptr[index], size, gpu_index.0);
+                let ptr = cuda_ext_malloc(size, gpu_index.0);
+                cuda_ext_memcpy_gpu_to_gpu(ptr, self.ptr[index], size, gpu_index.0);
                 cloned_vec.push(ptr);
             }
         }
@@ -101,12 +101,12 @@ impl<T: Numeric> CudaVec<T> {
    /// - `streams` __must__ be synchronized to guarantee computation has finished
    pub unsafe fn new_async(len: usize, streams: &CudaStreams, stream_index: u32) -> Self {
        let size = len as u64 * std::mem::size_of::<T>() as u64;
-        let ptr = cuda_malloc_async(
+        let ptr = cuda_ext_malloc_async(
            size,
            streams.ptr[stream_index as usize],
            streams.gpu_indexes[stream_index as usize].0,
        );
-        cuda_memset_async(
+        cuda_ext_memset_async(
            ptr,
            0u64,
            size,
@@ -129,9 +129,9 @@ impl<T: Numeric> CudaVec<T> {
        let mut ptrs = Vec::with_capacity(streams.len());
        for (i, &stream_ptr) in streams.ptr.iter().enumerate() {
            let gpu_index = streams.gpu_indexes[i];
-            let ptr = unsafe { cuda_malloc_async(size, stream_ptr, gpu_index.0) };
+            let ptr = unsafe { cuda_ext_malloc_async(size, stream_ptr, gpu_index.0) };
            unsafe {
-                cuda_memset_async(ptr, 0u64, size, stream_ptr, gpu_index.0);
+                cuda_ext_memset_async(ptr, 0u64, size, stream_ptr, gpu_index.0);
            }
            streams.synchronize_one(i as u32);
            ptrs.push(ptr);
@@ -179,7 +179,7 @@ impl<T: Numeric> CudaVec<T> {
        let size = self.len() * std::mem::size_of::<T>();
        // We check that self is not empty to avoid invalid pointers
        if size > 0 {
-            cuda_memset_async(
+            cuda_ext_memset_async(
                self.as_mut_c_ptr(stream_index),
                value,
                size as u64,
@@ -209,7 +209,7 @@ impl<T: Numeric> CudaVec<T> {
        // We have to check that src is not empty, because Rust slice with size 0 results in an
        // invalid pointer being passed to copy_to_gpu_async
        if size > 0 {
-            cuda_memcpy_async_to_gpu(
+            cuda_ext_memcpy_async_to_gpu(
                self.as_mut_c_ptr(stream_index),
                src.as_ptr().cast(),
                size as u64,
@@ -237,7 +237,7 @@ impl<T: Numeric> CudaVec<T> {
        // invalid pointer being passed to copy_to_gpu_async
        if size > 0 {
            let gpu_index = streams.gpu_indexes[i];
-            cuda_memcpy_async_to_gpu(
+            cuda_ext_memcpy_async_to_gpu(
                self.get_mut_c_ptr(i as u32),
                src.as_ptr().cast(),
                size as u64,
@@ -266,7 +266,7 @@ impl<T: Numeric> CudaVec<T> {
        let size = src.len() * std::mem::size_of::<T>();
        // We check that src is not empty to avoid invalid pointers
        if size > 0 {
-            cuda_memcpy_async_gpu_to_gpu(
+            cuda_ext_memcpy_async_gpu_to_gpu(
                self.as_mut_c_ptr(stream_index),
                src.as_c_ptr(stream_index),
                size as u64,
@@ -305,7 +305,7 @@ impl<T: Numeric> CudaVec<T> {
                .as_c_ptr(stream_index)
                .add(start * std::mem::size_of::<T>());
            let size = (end - start + 1) * std::mem::size_of::<T>();
-            cuda_memcpy_async_gpu_to_gpu(
+            cuda_ext_memcpy_async_gpu_to_gpu(
                self.as_mut_c_ptr(stream_index),
                src_ptr,
                size as u64,
@@ -342,7 +342,7 @@ impl<T: Numeric> CudaVec<T> {
                .as_mut_c_ptr(stream_index)
                .add(start * std::mem::size_of::<T>());
            let size = (end - start + 1) * std::mem::size_of::<T>();
-            cuda_memcpy_async_gpu_to_gpu(
+            cuda_ext_memcpy_async_gpu_to_gpu(
                dest_ptr,
                src.as_c_ptr(stream_index),
                size as u64,
@@ -366,7 +366,7 @@ impl<T: Numeric> CudaVec<T> {
        // We have to check that self is not empty, because Rust slice with size 0 results in an
        // invalid pointer being passed to copy_to_cpu_async
        if size > 0 {
-            cuda_memcpy_async_to_cpu(
+            cuda_ext_memcpy_async_to_cpu(
                dest.as_mut_ptr().cast(),
                self.as_c_ptr(stream_index),
                size as u64,
@@ -484,7 +484,7 @@ impl<T: Numeric> Drop for CudaVec<T> {
            // Synchronizes the device to be sure no stream is still using this pointer
            let gpu_index = self.gpu_indexes[i];
            synchronize_device(gpu_index.0);
-            unsafe { cuda_drop(ptr, gpu_index.0) };
+            unsafe { cuda_ext_drop(ptr, gpu_index.0) };
        }
    }
 }
@@ -20,7 +20,7 @@ use crate::shortint::{AtomicPatternKind, CarryModulus, Ciphertext, MessageModulu
 use crate::GpuIndex;
 use itertools::Itertools;
 use serde::Deserializer;
-use tfhe_cuda_backend::cuda_bind::cuda_memcpy_async_to_gpu;
+use tfhe_cuda_backend::cuda_bind::cuda_ext_memcpy_async_to_gpu;
 
 #[derive(Clone)]
 pub struct CudaCompactCiphertextListInfo {
@@ -271,7 +271,7 @@ impl CudaFlattenedVecCompactCiphertextList {
            let dest_ptr = d_flattened_d_vec
                .as_mut_c_ptr(0)
                .add(offset * std::mem::size_of::<u64>());
-            cuda_memcpy_async_to_gpu(
+            cuda_ext_memcpy_async_to_gpu(
                dest_ptr,
                container.as_ptr().cast(),
                (expected_length * std::mem::size_of::<u64>()) as u64,
@@ -15,7 +15,7 @@ use crate::integer::gpu::ciphertext::squashed_noise::CudaSquashedNoiseRadixCiphe
 use crate::integer::gpu::ciphertext::CudaRadixCiphertext;
 use crate::integer::gpu::server_key::CudaBootstrappingKey;
 use crate::integer::gpu::{
-    compress_integer_radix_async, cuda_memcpy_async_gpu_to_gpu, decompress_integer_radix_async_64,
+    compress_integer_radix_async, cuda_ext_memcpy_async_gpu_to_gpu, decompress_integer_radix_async_64,
     get_compression_size_on_gpu, get_decompression_size_on_gpu,
 };
 use crate::prelude::CastInto;
@@ -248,7 +248,7 @@ impl CudaCompressionKey {
                .as_mut_c_ptr(0)
                .add(offset * std::mem::size_of::<u64>());
            let size = ciphertext.d_blocks.0.d_vec.len * std::mem::size_of::<u64>();
-            cuda_memcpy_async_gpu_to_gpu(
+            cuda_ext_memcpy_async_gpu_to_gpu(
                dest_ptr,
                ciphertext.d_blocks.0.d_vec.as_c_ptr(0),
                size as u64,
@@ -577,7 +577,7 @@ impl CudaNoiseSquashingCompressionKey {
                .as_mut_c_ptr(0)
                .add(offset * std::mem::size_of::<u128>());
            let size = ciphertext.packed_d_blocks.0.d_vec.len * std::mem::size_of::<u128>();
-            cuda_memcpy_async_gpu_to_gpu(
+            cuda_ext_memcpy_async_gpu_to_gpu(
                dest_ptr,
                ciphertext.packed_d_blocks.0.d_vec.as_c_ptr(0),
                size as u64,