Mirror of https://github.com/zama-ai/tfhe-rs.git

Compare commits: al/debug_l ... al/stf_exp (3 commits)

| Author | SHA1 | Date |
|---|---|---|
|  | 6a42ecb970 |  |
|  | d51fba12d4 |  |
|  | 27f44e897d |  |
```diff
@@ -61,6 +61,7 @@ fn main() {
     }
     println!("cargo:rustc-link-lib=gomp");
     println!("cargo:rustc-link-lib=cudart");
+    println!("cargo:rustc-link-lib=cuda");
    println!("cargo:rustc-link-search=native=/usr/lib/x86_64-linux-gnu/");
    println!("cargo:rustc-link-lib=stdc++");
```
```diff
@@ -88,11 +88,34 @@ else()
   set(OPTIMIZATION_FLAGS "${OPTIMIZATION_FLAGS} -O3")
 endif()
 
+# Fetch CPM.cmake directly from GitHub if not already present
+include(FetchContent)
+FetchContent_Declare(
+  CPM
+  GIT_REPOSITORY https://github.com/cpm-cmake/CPM.cmake
+  GIT_TAG v0.38.5 # replace with the desired version or main for latest
+)
+FetchContent_MakeAvailable(CPM)
+
+include(${cpm_SOURCE_DIR}/cmake/CPM.cmake)
+
+# This will automatically clone CCCL from GitHub and make the exported cmake targets available
+cpmaddpackage(
+  NAME
+  CCCL
+  GITHUB_REPOSITORY
+  "nvidia/cccl"
+  GIT_TAG
+  "main"
+  # The following is required to make the `CCCL::cudax` target available:
+  OPTIONS
+  "CCCL_ENABLE_UNSTABLE ON")
+
 # in production, should use -arch=sm_70 --ptxas-options=-v to see register spills -lineinfo for better debugging
 set(CMAKE_CUDA_FLAGS
     "${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} ${OPTIMIZATION_FLAGS}\
     -std=c++17 --no-exceptions --expt-relaxed-constexpr -rdc=true \
-    --use_fast_math -Xcompiler -fPIC")
+    --use_fast_math -Xcompiler -fPIC -DCCCL_DISABLE_EXCEPTIONS -DCUDASTF_DISABLE_CODE_GENERATION")
 
 set(INCLUDE_DIR include)
```
```diff
@@ -101,6 +124,8 @@ enable_testing()
 add_subdirectory(tests_and_benchmarks)
 target_include_directories(tfhe_cuda_backend PRIVATE ${INCLUDE_DIR})
 
+target_link_libraries(tfhe_cuda_backend PRIVATE CCCL::CCCL CCCL::cudax cuda)
+
 # This is required for rust cargo build
 install(TARGETS tfhe_cuda_backend DESTINATION .)
```
```diff
@@ -1,5 +1,5 @@
 file(GLOB_RECURSE SOURCES "*.cu")
 add_library(tfhe_cuda_backend STATIC ${SOURCES})
 set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
-target_link_libraries(tfhe_cuda_backend PUBLIC cudart OpenMP::OpenMP_CXX)
+target_link_libraries(tfhe_cuda_backend PUBLIC cudart OpenMP::OpenMP_CXX cuda)
 target_include_directories(tfhe_cuda_backend PRIVATE .)
```
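The build-system hunks above only make the `CCCL::cudax` targets, the CUDA driver library, and the `<cuda/experimental/stf.cuh>` header available to `tfhe_cuda_backend`. As a minimal sketch (not part of the change set) of a translation unit that exercises that wiring, the following could be compiled against the same targets; `noop_kernel` and `main` are illustrative placeholders, while the context/token/task calls mirror the usage visible in the source hunks below.

```cpp
// Minimal smoke-test sketch (not part of the diff): checks that the
// CCCL/cudax wiring added above compiles and links. The kernel is a
// placeholder; the API calls mirror the ones used in the hunks below.
#include <cuda/experimental/stf.cuh>
#include <cstdio>

namespace cudastf = cuda::experimental::stf;

__global__ void noop_kernel() {}

int main() {
  cudastf::context ctx;              // stream-backed context by default
  auto token = ctx.logical_token();  // synchronization-only logical object
  ctx.task(token.write()).set_symbol("noop")->*[](cudaStream_t s) {
    noop_kernel<<<1, 1, 0, s>>>();   // work runs on the task's stream
  };
  ctx.finalize();                    // waits for all submitted tasks
  std::printf("CUDASTF context finalized\n");
  return 0;
}
```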
```diff
@@ -16,8 +16,11 @@
 #include "programmable_bootstrap.cuh"
 #include "programmable_bootstrap_multibit.cuh"
 #include "types/complex/operations.cuh"
+#include <cuda/experimental/stf.cuh>
 #include <vector>
 
+namespace cudastf = cuda::experimental::stf;
+
 template <typename Torus, class params, sharedMemDegree SMD>
 __global__ void __launch_bounds__(params::degree / params::opt)
     device_multi_bit_programmable_bootstrap_cg_accumulate(
```
```diff
@@ -384,25 +387,49 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
     uint32_t base_log, uint32_t level_count, uint32_t num_samples,
     uint32_t num_many_lut, uint32_t lut_stride) {
 
+  // Generate a CUDA graph if the USE_CUDA_GRAPH is set to a non-null value
+  const char *use_graph_env = getenv("USE_CUDA_GRAPH");
+
+  cudastf::context ctx(stream);
+  if (use_graph_env && atoi(use_graph_env) != 0) {
+    ctx = cudastf::graph_ctx(stream);
+  }
+
   auto lwe_chunk_size = buffer->lwe_chunk_size;
 
+  auto buffer_token = ctx.logical_token();
+
   for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
        lwe_offset += lwe_chunk_size) {
 
+    auto key_token = ctx.logical_token();
+    auto result_token = ctx.logical_token();
+
     // Compute a keybundle
-    execute_compute_keybundle<Torus, params>(
-        stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
-        buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-        grouping_factor, level_count, lwe_offset);
+    ctx.task(key_token.write(), buffer_token.write())
+            .set_symbol("compute_keybundle")
+            ->*[&](cudaStream_t stf_stream) {
+              execute_compute_keybundle<Torus, params>(
+                  stf_stream, gpu_index, lwe_array_in, lwe_input_indexes,
+                  bootstrapping_key, buffer, num_samples, lwe_dimension,
+                  glwe_dimension, polynomial_size, grouping_factor,
+                  level_count, lwe_offset);
+            };
 
     // Accumulate
-    execute_cg_external_product_loop<Torus, params>(
-        stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
-        lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
-        num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-        grouping_factor, base_log, level_count, lwe_offset, num_many_lut,
-        lut_stride);
+    ctx.task(key_token.read(), buffer_token.rw(), result_token.write())
+            .set_symbol("accumulate")
+            ->*
+        [&](cudaStream_t stf_stream) {
+          execute_cg_external_product_loop<Torus, params>(
+              stf_stream, gpu_index, lut_vector, lut_vector_indexes,
+              lwe_array_in, lwe_input_indexes, lwe_array_out,
+              lwe_output_indexes, buffer, num_samples, lwe_dimension,
+              glwe_dimension, polynomial_size, grouping_factor, base_log,
+              level_count, lwe_offset, num_many_lut, lut_stride);
+        };
   }
+  ctx.finalize();
 }
 
 // Verify if the grid size satisfies the cooperative group constraints
```
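The same wrapping is applied to the TBC variant in the hunks below. For reference, the task/token pattern introduced here can be reduced to the following sketch; it is a simplified illustration rather than the author's code: `run_pipeline`, `keybundle_step`, and `accumulate_step` are hypothetical stand-ins for the real `host_*` and `execute_*` helpers, and the chunk loop is collapsed to a bare counter. Per iteration the keybundle task writes a fresh token that the accumulate task reads, and both touch a shared buffer token, so CUDASTF orders the two phases while `USE_CUDA_GRAPH` selects the CUDA-graph backend exactly as in the hunk above.

```cpp
// Simplified sketch of the CUDASTF pattern used above. The kernels and
// run_pipeline() are hypothetical; only the context/token/task API
// mirrors the actual change set.
#include <cuda/experimental/stf.cuh>
#include <cstdint>
#include <cstdlib>

namespace cudastf = cuda::experimental::stf;

__global__ void keybundle_step() { /* placeholder for execute_compute_keybundle */ }
__global__ void accumulate_step() { /* placeholder for execute_*_external_product_loop */ }

void run_pipeline(cudaStream_t stream, uint32_t num_chunks) {
  // Record tasks into a CUDA graph when USE_CUDA_GRAPH is set to a non-zero value.
  const char *use_graph_env = getenv("USE_CUDA_GRAPH");
  cudastf::context ctx(stream);
  if (use_graph_env && atoi(use_graph_env) != 0) {
    ctx = cudastf::graph_ctx(stream);
  }

  auto buffer_token = ctx.logical_token(); // guards the shared scratch buffer

  for (uint32_t chunk = 0; chunk < num_chunks; ++chunk) {
    auto key_token = ctx.logical_token(); // one keybundle per chunk

    // Phase 1: produce the keybundle for this chunk.
    ctx.task(key_token.write(), buffer_token.write())
        .set_symbol("compute_keybundle")
        ->*[&](cudaStream_t s) { keybundle_step<<<1, 1, 0, s>>>(); };

    // Phase 2: consume the keybundle; the read/rw deps order it after phase 1.
    ctx.task(key_token.read(), buffer_token.rw())
        .set_symbol("accumulate")
        ->*[&](cudaStream_t s) { accumulate_step<<<1, 1, 0, s>>>(); };
  }

  ctx.finalize(); // submit (and, in graph mode, instantiate and launch) everything
}
```

The dependencies (`write`, `read`, `rw`) are declared on logical tokens rather than on the buffers themselves, which is what lets the identical task structure run on either plain streams or a captured CUDA graph.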
```diff
@@ -16,8 +16,11 @@
 #include "polynomial/polynomial_math.cuh"
 #include "programmable_bootstrap.cuh"
 #include "types/complex/operations.cuh"
+#include <cuda/experimental/stf.cuh>
 #include <vector>
 
+namespace cudastf = cuda::experimental::stf;
+
 template <typename Torus, class params, sharedMemDegree SMD>
 __global__ void __launch_bounds__(params::degree / params::opt)
     device_multi_bit_programmable_bootstrap_tbc_accumulate(
```
```diff
@@ -404,23 +407,44 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap(
     uint32_t num_many_lut, uint32_t lut_stride) {
   cuda_set_device(gpu_index);
 
+  // Generate a CUDA graph if the USE_CUDA_GRAPH is set to a non-null value
+  const char *use_graph_env = getenv("USE_CUDA_GRAPH");
+
+  cudastf::context ctx(stream);
+  if (use_graph_env && atoi(use_graph_env) != 0) {
+    ctx = cudastf::graph_ctx(stream);
+  }
+
   auto lwe_chunk_size = buffer->lwe_chunk_size;
+  auto buffer_token = ctx.logical_token();
   for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
        lwe_offset += lwe_chunk_size) {
 
+    auto key_token = ctx.logical_token();
+    auto result_token = ctx.logical_token();
     // Compute a keybundle
-    execute_compute_keybundle<Torus, params>(
-        stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
-        buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-        grouping_factor, level_count, lwe_offset);
+    ctx.task(key_token.write(), buffer_token.write())
+            .set_symbol("compute_keybundle")
+            ->*[&](cudaStream_t stf_stream) {
+              execute_compute_keybundle<Torus, params>(
+                  stf_stream, gpu_index, lwe_array_in, lwe_input_indexes,
+                  bootstrapping_key, buffer, num_samples, lwe_dimension,
+                  glwe_dimension, polynomial_size, grouping_factor,
+                  level_count, lwe_offset);
+            };
 
     // Accumulate
-    execute_tbc_external_product_loop<Torus, params>(
-        stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
-        lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
-        num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-        grouping_factor, base_log, level_count, lwe_offset, num_many_lut,
-        lut_stride);
+    ctx.task(key_token.read(), buffer_token.rw(), result_token.write())
+            .set_symbol("accumulate")
+            ->*
+        [&](cudaStream_t stf_stream) {
+          execute_tbc_external_product_loop<Torus, params>(
+              stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
+              lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
+              num_samples, lwe_dimension, glwe_dimension, polynomial_size,
+              grouping_factor, base_log, level_count, lwe_offset, num_many_lut,
+              lut_stride);
+        };
   }
 }
```