Compare commits


3 Commits

Author       SHA1        Message                     Date
Agnes Leroy  6a42ecb970  Bench runs                  2025-04-11 09:44:10 +02:00
Agnes Leroy  d51fba12d4  Fix build rust              2025-04-11 09:40:28 +02:00
Agnes Leroy  27f44e897d  chore(gpu): stf experiment  2025-04-10 15:55:51 +02:00
5 changed files with 99 additions and 22 deletions

View File

@@ -61,6 +61,7 @@ fn main() {
     }
     println!("cargo:rustc-link-lib=gomp");
     println!("cargo:rustc-link-lib=cudart");
+    println!("cargo:rustc-link-lib=cuda");
     println!("cargo:rustc-link-search=native=/usr/lib/x86_64-linux-gnu/");
     println!("cargo:rustc-link-lib=stdc++");

View File

@@ -88,11 +88,34 @@ else()
   set(OPTIMIZATION_FLAGS "${OPTIMIZATION_FLAGS} -O3")
 endif()

+# Fetch CPM.cmake directly from GitHub if not already present
+include(FetchContent)
+FetchContent_Declare(
+  CPM
+  GIT_REPOSITORY https://github.com/cpm-cmake/CPM.cmake
+  GIT_TAG v0.38.5 # replace with the desired version or main for latest
+)
+FetchContent_MakeAvailable(CPM)
+include(${cpm_SOURCE_DIR}/cmake/CPM.cmake)
+
+# This will automatically clone CCCL from GitHub and make the exported cmake targets available
+cpmaddpackage(
+  NAME
+  CCCL
+  GITHUB_REPOSITORY
+  "nvidia/cccl"
+  GIT_TAG
+  "main"
+  # The following is required to make the `CCCL::cudax` target available:
+  OPTIONS
+  "CCCL_ENABLE_UNSTABLE ON")
+
 # in production, should use -arch=sm_70 --ptxas-options=-v to see register spills -lineinfo for better debugging
 set(CMAKE_CUDA_FLAGS
     "${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} ${OPTIMIZATION_FLAGS}\
     -std=c++17 --no-exceptions --expt-relaxed-constexpr -rdc=true \
-    --use_fast_math -Xcompiler -fPIC")
+    --use_fast_math -Xcompiler -fPIC -DCCCL_DISABLE_EXCEPTIONS -DCUDASTF_DISABLE_CODE_GENERATION")

 set(INCLUDE_DIR include)
@@ -101,6 +101,8 @@ enable_testing()
 add_subdirectory(tests_and_benchmarks)
 target_include_directories(tfhe_cuda_backend PRIVATE ${INCLUDE_DIR})
+target_link_libraries(tfhe_cuda_backend PRIVATE CCCL::CCCL CCCL::cudax cuda)
+
 # This is required for rust cargo build
 install(TARGETS tfhe_cuda_backend DESTINATION .)
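With CCCL fetched through CPM and the `CCCL::cudax` target linked, a minimal CUDASTF program can be built against this backend. A hello-world sketch, assuming the flags above and using the same task/token API that appears in the .cuh changes below:

#include <cuda/experimental/stf.cuh>
#include <cstdio>

namespace cudastf = cuda::experimental::stf;

int main() {
  cudastf::context ctx;             // default stream-backed context
  auto token = ctx.logical_token(); // pure synchronization object, no data
  ctx.task(token.write()).set_symbol("hello")->*[](cudaStream_t s) {
    // work enqueued on `s` here is ordered by STF's dependency tracking
    std::printf("task body runs once its dependencies are met\n");
  };
  ctx.finalize(); // wait for all submitted tasks
  return 0;
}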

View File

@@ -1,5 +1,5 @@
 file(GLOB_RECURSE SOURCES "*.cu")
 add_library(tfhe_cuda_backend STATIC ${SOURCES})
 set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
-target_link_libraries(tfhe_cuda_backend PUBLIC cudart OpenMP::OpenMP_CXX)
+target_link_libraries(tfhe_cuda_backend PUBLIC cudart OpenMP::OpenMP_CXX cuda)
 target_include_directories(tfhe_cuda_backend PRIVATE .)
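The `CUDA_SEPARABLE_COMPILATION` and `CUDA_RESOLVE_DEVICE_SYMBOLS` properties kept here (together with `-rdc=true` in the flags above) let device code reference symbols defined in other translation units. A two-file sketch, with hypothetical names, of what separable compilation permits:

// a.cu -- defines a device function used from another translation unit
__device__ int scale(int x) { return 2 * x; }

// b.cu -- declares and calls it; this only links with -rdc=true /
// CUDA_SEPARABLE_COMPILATION ON
extern __device__ int scale(int x);

__global__ void kernel(int *out) { out[threadIdx.x] = scale(threadIdx.x); }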

View File

@@ -16,8 +16,11 @@
 #include "programmable_bootstrap.cuh"
 #include "programmable_bootstrap_multibit.cuh"
 #include "types/complex/operations.cuh"
+#include <cuda/experimental/stf.cuh>
 #include <vector>

+namespace cudastf = cuda::experimental::stf;
+
 template <typename Torus, class params, sharedMemDegree SMD>
 __global__ void __launch_bounds__(params::degree / params::opt)
     device_multi_bit_programmable_bootstrap_cg_accumulate(
@@ -384,25 +387,49 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
     uint32_t base_log, uint32_t level_count, uint32_t num_samples,
     uint32_t num_many_lut, uint32_t lut_stride) {

+  // Record tasks into a CUDA graph if the USE_CUDA_GRAPH environment
+  // variable is set to a non-zero value
+  const char *use_graph_env = getenv("USE_CUDA_GRAPH");
+  cudastf::context ctx(stream);
+  if (use_graph_env && atoi(use_graph_env) != 0) {
+    ctx = cudastf::graph_ctx(stream);
+  }
+
   auto lwe_chunk_size = buffer->lwe_chunk_size;
+  auto buffer_token = ctx.logical_token();

   for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
        lwe_offset += lwe_chunk_size) {
+    auto key_token = ctx.logical_token();
+    auto result_token = ctx.logical_token();

     // Compute a keybundle
-    execute_compute_keybundle<Torus, params>(
-        stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
-        buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-        grouping_factor, level_count, lwe_offset);
+    ctx.task(key_token.write(), buffer_token.write())
+            .set_symbol("compute_keybundle")
+            ->*[&](cudaStream_t stf_stream) {
+      execute_compute_keybundle<Torus, params>(
+          stf_stream, gpu_index, lwe_array_in, lwe_input_indexes,
+          bootstrapping_key, buffer, num_samples, lwe_dimension,
+          glwe_dimension, polynomial_size, grouping_factor, level_count,
+          lwe_offset);
+    };

     // Accumulate
-    execute_cg_external_product_loop<Torus, params>(
-        stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
-        lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
-        num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-        grouping_factor, base_log, level_count, lwe_offset, num_many_lut,
-        lut_stride);
+    ctx.task(key_token.read(), buffer_token.rw(), result_token.write())
+            .set_symbol("accumulate")
+            ->*[&](cudaStream_t stf_stream) {
+      execute_cg_external_product_loop<Torus, params>(
+          stf_stream, gpu_index, lut_vector, lut_vector_indexes,
+          lwe_array_in, lwe_input_indexes, lwe_array_out, lwe_output_indexes,
+          buffer, num_samples, lwe_dimension, glwe_dimension,
+          polynomial_size, grouping_factor, base_log, level_count,
+          lwe_offset, num_many_lut, lut_stride);
+    };
   }
+  ctx.finalize();
 }

 // Verify if the grid size satisfies the cooperative group constraints
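The token pattern above encodes the pipeline's dependencies instead of serializing everything on one stream: each iteration's accumulate task reads the key token written by its keybundle task, while the shared buffer token orders successive uses of the scratch buffer across iterations. A condensed sketch of the same shape, with hypothetical task bodies:

#include <cuda/experimental/stf.cuh>

namespace cudastf = cuda::experimental::stf;

void pipeline(cudaStream_t stream, int iterations) {
  cudastf::context ctx(stream);
  auto buffer = ctx.logical_token(); // guards the shared scratch buffer
  for (int i = 0; i < iterations; ++i) {
    auto key = ctx.logical_token();
    // Producer: writes the key bundle and the shared buffer
    ctx.task(key.write(), buffer.write()).set_symbol("keybundle")
        ->*[](cudaStream_t s) { /* enqueue keybundle kernels on s */ };
    // Consumer: ordered after the producer via key.read(); buffer.rw()
    // chains it before the next iteration's keybundle
    ctx.task(key.read(), buffer.rw()).set_symbol("accumulate")
        ->*[](cudaStream_t s) { /* enqueue accumulate kernels on s */ };
  }
  ctx.finalize(); // synchronizes the whole task graph
}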

View File

@@ -16,8 +16,11 @@
 #include "polynomial/polynomial_math.cuh"
 #include "programmable_bootstrap.cuh"
 #include "types/complex/operations.cuh"
+#include <cuda/experimental/stf.cuh>
 #include <vector>

+namespace cudastf = cuda::experimental::stf;
+
 template <typename Torus, class params, sharedMemDegree SMD>
 __global__ void __launch_bounds__(params::degree / params::opt)
     device_multi_bit_programmable_bootstrap_tbc_accumulate(
@@ -404,23 +407,44 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap(
     uint32_t num_many_lut, uint32_t lut_stride) {
   cuda_set_device(gpu_index);

+  // Record tasks into a CUDA graph if the USE_CUDA_GRAPH environment
+  // variable is set to a non-zero value
+  const char *use_graph_env = getenv("USE_CUDA_GRAPH");
+  cudastf::context ctx(stream);
+  if (use_graph_env && atoi(use_graph_env) != 0) {
+    ctx = cudastf::graph_ctx(stream);
+  }
+
   auto lwe_chunk_size = buffer->lwe_chunk_size;
+  auto buffer_token = ctx.logical_token();

   for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
        lwe_offset += lwe_chunk_size) {
+    auto key_token = ctx.logical_token();
+    auto result_token = ctx.logical_token();

     // Compute a keybundle
-    execute_compute_keybundle<Torus, params>(
-        stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
-        buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-        grouping_factor, level_count, lwe_offset);
+    ctx.task(key_token.write(), buffer_token.write())
+            .set_symbol("compute_keybundle")
+            ->*[&](cudaStream_t stf_stream) {
+      execute_compute_keybundle<Torus, params>(
+          stf_stream, gpu_index, lwe_array_in, lwe_input_indexes,
+          bootstrapping_key, buffer, num_samples, lwe_dimension,
+          glwe_dimension, polynomial_size, grouping_factor, level_count,
+          lwe_offset);
+    };

     // Accumulate
-    execute_tbc_external_product_loop<Torus, params>(
-        stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
-        lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
-        num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-        grouping_factor, base_log, level_count, lwe_offset, num_many_lut,
-        lut_stride);
+    ctx.task(key_token.read(), buffer_token.rw(), result_token.write())
+            .set_symbol("accumulate")
+            ->*[&](cudaStream_t stf_stream) {
+      // Launch on the task's stf_stream so the call is ordered by STF's
+      // dependency tracking
+      execute_tbc_external_product_loop<Torus, params>(
+          stf_stream, gpu_index, lut_vector, lut_vector_indexes,
+          lwe_array_in, lwe_input_indexes, lwe_array_out, lwe_output_indexes,
+          buffer, num_samples, lwe_dimension, glwe_dimension,
+          polynomial_size, grouping_factor, base_log, level_count,
+          lwe_offset, num_many_lut, lut_stride);
+    };
   }
+  ctx.finalize();
 }
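The USE_CUDA_GRAPH toggle in both files swaps the context's backend: a `graph_ctx` records the same tasks into a CUDA graph that is instantiated and launched when the context is finalized, instead of eagerly submitting work on streams. A sketch of that selection in isolation, assuming the same environment variable:

#include <cuda/experimental/stf.cuh>
#include <cstdlib>

namespace cudastf = cuda::experimental::stf;

cudastf::context make_context(cudaStream_t stream) {
  // A non-zero USE_CUDA_GRAPH selects the CUDA-graph backend; tasks are
  // captured into a graph and replayed at finalize() rather than
  // launched eagerly on streams.
  const char *env = std::getenv("USE_CUDA_GRAPH");
  cudastf::context ctx(stream);
  if (env && std::atoi(env) != 0) {
    ctx = cudastf::graph_ctx(stream);
  }
  return ctx;
}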