Mirror of https://github.com/zama-ai/tfhe-rs.git, synced 2026-01-11 15:48:20 -05:00

Compare commits: al/vectori...as/cuda_st (5 commits)
| Author | SHA1 | Date |
|---|---|---|
| | 70fa68bf52 | |
| | 7faafd6602 | |
| | 8c55f6b8d7 | |
| | f6b1929a8d | |
| | 87c0d646a4 | |
.github/workflows/coprocessor-benchmark-gpu.yml (vendored, 77 changes)
```diff
@@ -3,6 +3,22 @@ name: coprocessor-benchmark-gpu
 on:
   workflow_dispatch:
+    inputs:
+      profile:
+        description: "Instance type"
+        required: true
+        type: choice
+        options:
+          - "l40 (n3-L40x1)"
+          - "4-l40 (n3-L40x4)"
+          - "single-h100 (n3-H100x1)"
+          - "2-h100 (n3-H100x2)"
+          - "4-h100 (n3-H100x4)"
+          - "multi-h100 (n3-H100x8)"
+          - "multi-h100-nvlink (n3-H100x8-NVLink)"
+          - "multi-h100-sxm5 (n3-H100x8-SXM5)"
+          - "multi-h100-sxm5_fallback (n3-H100x8-SXM5)"
 
   schedule:
     # Weekly tests @ 1AM
     - cron: "0 1 * * 6"
```
```diff
@@ -17,7 +33,9 @@ env:
   RUST_BACKTRACE: "full"
   RUST_MIN_STACK: "8388608"
   CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
-  PROFILE: "multi-h100-sxm5 (n3-H100x8-SXM5)"
+  PROFILE_SCHEDULED_RUN: "multi-h100-sxm5 (n3-H100x8-SXM5)"
+  PROFILE_MANUAL_RUN: ${{ inputs.profile }}
+  IS_MANUAL_RUN: ${{ github.event_name == 'workflow_dispatch' }}
   BENCHMARK_TYPE: "ALL"
   OPTIMIZATION_TARGET: "throughput"
   BATCH_SIZE: "5000"
```
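The hard-coded `PROFILE` variable is split in two: `PROFILE_SCHEDULED_RUN` keeps the weekly default, `PROFILE_MANUAL_RUN` picks up the new `workflow_dispatch` input, and `IS_MANUAL_RUN` lets the parse steps below choose between them.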
```diff
@@ -40,15 +58,25 @@ jobs:
     - name: Parse profile
       id: parse_profile
       run: |
+        if [[ ${IS_MANUAL_RUN} == true ]]; then
+          PROFILE_RAW="${PROFILE_MANUAL_RUN}"
+        else
+          PROFILE_RAW="${PROFILE_SCHEDULED_RUN}"
+        fi
         # shellcheck disable=SC2001
-        PROFILE_VAL=$(echo "${PROFILE}" | sed 's|\(.*\)[[:space:]](.*)|\1|')
+        PROFILE_VAL=$(echo "${PROFILE_RAW}" | sed 's|\(.*\)[[:space:]](.*)|\1|')
         echo "profile=$PROFILE_VAL" >> "${GITHUB_OUTPUT}"
 
     - name: Parse hardware name
       id: parse_hardware_name
       run: |
+        if [[ ${IS_MANUAL_RUN} == true ]]; then
+          PROFILE_RAW="${PROFILE_MANUAL_RUN}"
+        else
+          PROFILE_RAW="${PROFILE_SCHEDULED_RUN}"
+        fi
         # shellcheck disable=SC2001
-        PROFILE_VAL=$(echo "${PROFILE}" | sed 's|.*[[:space:]](\(.*\))|\1|')
+        PROFILE_VAL=$(echo "${PROFILE_RAW}" | sed 's|.*[[:space:]](\(.*\))|\1|')
         echo "name=$PROFILE_VAL" >> "${GITHUB_OUTPUT}"
 
   setup-instance:
```
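Both steps split a profile string of the form `name (flavor)`: the first `sed` keeps everything before the parenthesized suffix, the second keeps the suffix itself. For example, `multi-h100-sxm5 (n3-H100x8-SXM5)` yields profile `multi-h100-sxm5` and hardware name `n3-H100x8-SXM5`.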
```diff
@@ -130,6 +158,13 @@ jobs:
         } >> "${GITHUB_ENV}"
       working-directory: tfhe-rs/
 
+    - name: Setup Hyperstack dependencies
+      uses: ./tfhe-rs/.github/actions/gpu_setup
+      with:
+        cuda-version: ${{ matrix.cuda }}
+        gcc-version: ${{ matrix.gcc }}
+        github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
+
     - name: Check fhEVM and TFHE-rs repos
       run: |
         pwd
```
```diff
@@ -140,13 +175,6 @@ jobs:
       run: git lfs checkout
       working-directory: fhevm/
 
-    - name: Setup Hyperstack dependencies
-      uses: ./fhevm/.github/actions/gpu_setup
-      with:
-        cuda-version: ${{ matrix.cuda }}
-        gcc-version: ${{ matrix.gcc }}
-        github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
-
     - name: Install rust
       uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
       with:
```
```diff
@@ -154,7 +182,7 @@ jobs:
 
     - name: Install cargo dependencies
       run: |
-        sudo apt-get install -y protobuf-compiler cmake pkg-config libssl-dev \
+        sudo apt-get install -y protobuf-compiler pkg-config libssl-dev \
           libclang-dev docker-compose-v2 docker.io acl
         sudo usermod -aG docker "$USER"
         newgrp docker
```
```diff
@@ -181,9 +209,16 @@ jobs:
         username: ${{ github.actor }}
         password: ${{ secrets.GITHUB_TOKEN }}
 
+    - name: Login to Chainguard Registry
+      uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567 # v3.3.0
+      with:
+        registry: cgr.dev
+        username: ${{ secrets.CGR_USERNAME }}
+        password: ${{ secrets.CGR_PASSWORD }}
+
     - name: Init database
       run: make init_db
-      working-directory: fhevm/coprocessor/fhevm-engine/coprocessor
+      working-directory: fhevm/coprocessor/fhevm-engine/tfhe-worker
 
     - name: Use Node.js
       uses: actions/setup-node@a0853c24544627f65ddf259abe73b1d18a591444 # v5.0.0
```
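From here on, every `working-directory` and artifact path in this workflow is repointed from `fhevm/coprocessor/fhevm-engine/coprocessor` to `fhevm/coprocessor/fhevm-engine/tfhe-worker`, tracking what appears to be a rename of the engine crate in the fhevm repository.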
```diff
@@ -203,8 +238,12 @@ jobs:
 
     - name: Profile erc20 no-cmux benchmark on GPU
       run: |
-        BENCHMARK_BATCH_SIZE="${BATCH_SIZE}" FHEVM_DF_SCHEDULE="${SCHEDULING_POLICY}" BENCHMARK_TYPE="LATENCY" OPTIMIZATION_TARGET="${OPTIMIZATION_TARGET}" make -e "profile_erc20_gpu"
-      working-directory: fhevm/coprocessor/fhevm-engine/coprocessor
+        BENCHMARK_BATCH_SIZE="${BATCH_SIZE}" \
+        FHEVM_DF_SCHEDULE="${SCHEDULING_POLICY}" \
+        BENCHMARK_TYPE="THROUGHPUT_200" \
+        OPTIMIZATION_TARGET="${OPTIMIZATION_TARGET}" \
+        make -e "profile_erc20_gpu"
+      working-directory: fhevm/coprocessor/fhevm-engine/tfhe-worker
 
     - name: Get nsys profile name
       id: nsys_profile_name
```
```diff
@@ -215,7 +254,7 @@ jobs:
         REPORT_NAME: ${{ steps.nsys_profile_name.outputs.profile }}
       run: |
         mv report1.nsys-rep ${{ env.REPORT_NAME }}
-      working-directory: fhevm/coprocessor/fhevm-engine/coprocessor
+      working-directory: fhevm/coprocessor/fhevm-engine/tfhe-worker
 
     - name: Upload profile artifact
       env:
```
```diff
@@ -223,17 +262,17 @@ jobs:
       uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
       with:
         name: ${{ env.REPORT_NAME }}
-        path: fhevm/coprocessor/fhevm-engine/coprocessor/${{ env.REPORT_NAME }}
+        path: fhevm/coprocessor/fhevm-engine/tfhe-worker/${{ env.REPORT_NAME }}
 
     - name: Run latency benchmark on GPU
       run: |
         BENCHMARK_BATCH_SIZE="${BATCH_SIZE}" FHEVM_DF_SCHEDULE="${SCHEDULING_POLICY}" BENCHMARK_TYPE="LATENCY" OPTIMIZATION_TARGET="${OPTIMIZATION_TARGET}" make -e "benchmark_${BENCHMARKS}_gpu"
-      working-directory: fhevm/coprocessor/fhevm-engine/coprocessor
+      working-directory: fhevm/coprocessor/fhevm-engine/tfhe-worker
 
     - name: Run throughput benchmarks on GPU
       run: |
         BENCHMARK_BATCH_SIZE="${BATCH_SIZE}" FHEVM_DF_SCHEDULE="${SCHEDULING_POLICY}" BENCHMARK_TYPE="THROUGHPUT_200" OPTIMIZATION_TARGET="${OPTIMIZATION_TARGET}" make -e "benchmark_${BENCHMARKS}_gpu"
-      working-directory: fhevm/coprocessor/fhevm-engine/coprocessor
+      working-directory: fhevm/coprocessor/fhevm-engine/tfhe-worker
 
     - name: Parse results
       run: |
```
```diff
@@ -246,7 +285,7 @@ jobs:
           --commit-date "${COMMIT_DATE}" \
           --bench-date "${BENCH_DATE}" \
           --walk-subdirs \
-          --crate "coprocessor/fhevm-engine/coprocessor" \
+          --crate "coprocessor/fhevm-engine/tfhe-worker" \
           --name-suffix "operation_batch_size_${BATCH_SIZE}-schedule_${SCHEDULING_POLICY}-optimization_target_${OPTIMIZATION_TARGET}"
       working-directory: fhevm/
```
The next two hunks belong to other workflow files whose names are missing from this capture; both switch the Hyperstack test profile from `multi-gpu-test` to `4-l40`:

```diff
@@ -86,7 +86,7 @@ jobs:
       slab-url: ${{ secrets.SLAB_BASE_URL }}
       job-secret: ${{ secrets.JOB_SECRET }}
       backend: hyperstack
-      profile: multi-gpu-test
+      profile: 4-l40
 
     # This instance will be spawned especially for pull-request from forked repository
     - name: Start GitHub instance
```
```diff
@@ -43,7 +43,7 @@ jobs:
       slab-url: ${{ secrets.SLAB_BASE_URL }}
       job-secret: ${{ secrets.JOB_SECRET }}
       backend: hyperstack
-      profile: multi-gpu-test
+      profile: 4-l40
 
   cuda-tests:
     name: gpu_integer_long_run_tests/cuda-tests
```
Makefile (5 changes)
```diff
@@ -1004,6 +1004,11 @@ test_list_gpu: install_rs_build_toolchain install_cargo_nextest
 		--features=integer,internal-keycache,gpu,zk-pok -p tfhe \
 		-E "test(/.*gpu.*/)"
 
+.PHONY: build_one_hl_api_test_gpu
+build_one_hl_api_test_gpu: install_rs_build_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --no-run \
+		--features=integer,gpu-debug -vv -p tfhe -- "$${TEST}" --test-threads=1 --nocapture
+
 test_high_level_api_hpu: install_rs_build_toolchain install_cargo_nextest
 ifeq ($(HPU_CONFIG), v80)
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \
```
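The new `build_one_hl_api_test_gpu` target only compiles the selected test (`--no-run`) with the `gpu-debug` feature. A hypothetical invocation, assuming a test named `test_name`: `TEST=test_name make build_one_hl_api_test_gpu`.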
From a CMake build script (file path not shown in this capture):

```diff
@@ -86,6 +86,7 @@ if(CMAKE_BUILD_TYPE_LOWERCASE STREQUAL "debug")
   message("Compiling in Debug mode")
   add_definitions(-DDEBUG)
   set(OPTIMIZATION_FLAGS "${OPTIMIZATION_FLAGS} -O0 -G -g")
+  set(USE_NVTOOLS 1)
 else()
   # Release mode
   message("Compiling in Release mode")
```
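Debug builds now set `USE_NVTOOLS`; assuming it is propagated as a compile definition elsewhere in the build, it gates the `cuda_profiler_api.h` include and the `cudaProfilerStart()` call added in `device.cu` below.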
From a CUDA backend header (likely `device.h`, which `device.cu` includes below):

```diff
@@ -6,6 +6,15 @@
 #include <cstdlib>
 #include <cuda_runtime.h>
 
+#define CUDA_STREAM_POOL
+
+enum CudaStreamType
+{
+  KEY = 0,
+  ALLOC = 1,
+  TEMP_HELPER = 2,
+};
+
 extern "C" {
 
 #define check_cuda_error(ans)                                                  \
```
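`CUDA_STREAM_POOL` is defined unconditionally, so the pooled stream code paths in `device.cu` below are always compiled; the non-pooled branches are retained under `#else` as a fallback. The new `CudaStreamType` enum presumably tags streams by role (key material, allocations, temporary helpers); it is not referenced in the hunks shown here.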
```diff
@@ -1,15 +1,27 @@
 #include "device.h"
 
+#include <atomic>
 #include <cstdint>
 #include <cuda_runtime.h>
 #include <mutex>
+#ifdef USE_NVTOOLS
+#include <cuda_profiler_api.h>
+#endif
+
+#ifdef CUDA_STREAM_POOL
+#include <deque>
+#include <mutex>
+#include <vector>
+#include <unordered_map>
+#endif
 
 uint32_t cuda_get_device() {
   int device;
   check_cuda_error(cudaGetDevice(&device));
   return static_cast<uint32_t>(device);
 }
-std::mutex pool_mutex;
-bool mem_pools_enabled = false;
+
+std::atomic<bool> mem_pools_enabled = false;
 
 // We use memory pools to reduce some overhead of memory allocations due
 // to our scratch/release pattern. This function is the simplest way of using
```
```diff
@@ -26,13 +38,13 @@ bool mem_pools_enabled = false;
 // We tested more complex configurations of mempools, but they did not yield
 // better results.
 void cuda_setup_mempool(uint32_t caller_gpu_index) {
-  if (!mem_pools_enabled) {
-    pool_mutex.lock();
-    if (mem_pools_enabled)
-      return; // If mem pools are already enabled, we don't need to do anything
 
-    // We do it only once for all GPUs
-    mem_pools_enabled = true;
+  bool pools_not_initialized = false;
+  bool pools_initialized = true;
+
+  // if pools_not_initialized is found, mem_pools_enabled is set to
+  // pools_initialized and the if body runs
+  if (mem_pools_enabled.compare_exchange_strong(pools_not_initialized, pools_initialized)) {
     uint32_t num_gpus = cuda_get_number_of_gpus();
     for (uint32_t gpu_index = 0; gpu_index < num_gpus; gpu_index++) {
       cuda_set_device(gpu_index);
@@ -75,7 +87,6 @@ void cuda_setup_mempool(uint32_t caller_gpu_index) {
     }
     // We return to the original gpu_index
     cuda_set_device(caller_gpu_index);
-    pool_mutex.unlock();
   }
 }
```
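The old double-checked pattern locked `pool_mutex` and could return early while still holding it; the rewrite replaces mutex and flag with a single atomic compare-and-swap, so exactly one thread wins the exchange and runs the initialization body. A minimal standalone sketch of this one-time-init idiom (names here are illustrative, not from the codebase):

```cpp
#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

std::atomic<bool> initialized{false};

void init_once() {
  bool expected = false;
  // compare_exchange_strong atomically does:
  //   if (initialized == expected) { initialized = true; return true; }
  //   else                         { expected = initialized; return false; }
  // so only the first caller enters the branch.
  if (initialized.compare_exchange_strong(expected, true)) {
    std::puts("initializing (runs exactly once)");
  }
}

int main() {
  std::vector<std::thread> threads;
  for (int i = 0; i < 8; i++)
    threads.emplace_back(init_once);
  for (auto &t : threads)
    t.join();
}
```

Note that with this idiom, threads that lose the exchange return immediately; they do not wait for the winner to finish the initialization body.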
```diff
@@ -83,6 +94,9 @@ void cuda_set_device(uint32_t gpu_index) {
   check_cuda_error(cudaSetDevice(gpu_index));
   // Mempools are initialized only once in all the GPUS available
   cuda_setup_mempool(gpu_index);
+#ifdef USE_NVTOOLS
+  check_cuda_error(cudaProfilerStart());
+#endif
 }
 
 cudaEvent_t cuda_create_event(uint32_t gpu_index) {
```
```diff
@@ -109,18 +123,90 @@ void cuda_event_destroy(cudaEvent_t event, uint32_t gpu_index) {
   check_cuda_error(cudaEventDestroy(event));
 }
 
+#ifdef CUDA_STREAM_POOL
+struct CudaBoundStream
+{
+  cudaStream_t stream;
+  uint32_t gpu_index;
+};
+
+class CudaStreamPool
+{
+  std::vector<CudaBoundStream> poolCompute;
+  std::vector<CudaBoundStream> poolTransfer;
+
+  std::mutex mutex_pools;
+
+  size_t nextStream = 0;
+
+  const size_t MAX_STREAMS = 8;
+
+public:
+  cudaStream_t create_stream(uint32_t gpu_index)
+  {
+    std::lock_guard<std::mutex> lock(mutex_pools);
+    if (poolCompute.empty())
+    {
+      poolCompute.reserve(MAX_STREAMS);
+
+      cuda_set_device(gpu_index);
+      for (size_t i = 0; i < MAX_STREAMS; i++)
+      {
+        cudaStream_t stream;
+        check_cuda_error(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
+        poolCompute.push_back(CudaBoundStream{stream, gpu_index});
+      }
+    }
+
+    PANIC_IF_FALSE(gpu_index == poolCompute[nextStream].gpu_index, "Bad gpu in stream pool");
+    cudaStream_t res = poolCompute[nextStream].stream;
+    nextStream = (nextStream + 1) % poolCompute.size();
+    return res;
+  }
+
+  void destroy_stream(cudaStream_t stream, uint32_t gpu_index)
+  {
+    // do nothing
+  }
+};
+
+class CudaMultiStreamPool {
+  std::unordered_map<uint32_t, CudaStreamPool> per_gpu_pools;
+  std::mutex pools_mutex; // for creation of the mem managers
+
+public:
+  CudaStreamPool &get(uint32_t gpu_index) {
+    std::lock_guard<std::mutex> guard(pools_mutex);
+    return per_gpu_pools[gpu_index]; // creates it if it does not exist
+  }
+};
+
+CudaMultiStreamPool gCudaStreamPool;
+#endif
+
 /// Unsafe function to create a CUDA stream, must check first that GPU exists
 cudaStream_t cuda_create_stream(uint32_t gpu_index) {
+#ifdef CUDA_STREAM_POOL
+  cuda_set_device(gpu_index); // this will initialize the mempool
+  return gCudaStreamPool.get(gpu_index).create_stream(gpu_index);
+#else
   cuda_set_device(gpu_index);
   cudaStream_t stream;
   check_cuda_error(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
   return stream;
+#endif
 }
 
 /// Unsafe function to destroy CUDA stream, must check first the GPU exists
 void cuda_destroy_stream(cudaStream_t stream, uint32_t gpu_index) {
+#ifdef CUDA_STREAM_POOL
+  gCudaStreamPool.get(gpu_index).destroy_stream(stream, gpu_index);
+#else
   cuda_set_device(gpu_index);
   check_cuda_error(cudaStreamDestroy(stream));
+#endif
 }
 
 void cuda_synchronize_stream(cudaStream_t stream, uint32_t gpu_index) {
```
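With `CUDA_STREAM_POOL` defined, `cuda_create_stream` no longer allocates per call: the first call per GPU lazily creates `MAX_STREAMS` (8) non-blocking streams, and subsequent calls hand them out round-robin; `cuda_destroy_stream` becomes a no-op, so the streams live for the process lifetime. A minimal sketch of the round-robin hand-out logic, with plain integers standing in for `cudaStream_t` so it runs without a GPU (all names illustrative):

```cpp
#include <cstdio>
#include <mutex>
#include <vector>

class RoundRobinPool {
  std::vector<int> pool; // stands in for the cached cudaStream_t handles
  std::mutex mtx;
  size_t next = 0;
  static constexpr size_t MAX_STREAMS = 8;

public:
  int acquire() {
    std::lock_guard<std::mutex> lock(mtx);
    if (pool.empty()) // lazy one-time fill, like create_stream above
      for (size_t i = 0; i < MAX_STREAMS; i++)
        pool.push_back(static_cast<int>(i));
    int handle = pool[next];
    next = (next + 1) % pool.size(); // cycle through the cached handles
    return handle;
  }
  void release(int) { /* no-op, mirrors destroy_stream */ }
};

int main() {
  RoundRobinPool p;
  for (int i = 0; i < 10; i++)
    std::printf("%d ", p.acquire()); // prints 0..7, then wraps to 0 1
  std::puts("");
}
```

Because destruction is a no-op, callers that previously created and destroyed short-lived streams now share eight long-lived streams per GPU, trading stream isolation for lower creation overhead.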
ci/slab.toml (18 changes)
```diff
@@ -83,18 +83,6 @@ image_name = "Ubuntu Server 22.04 LTS R570 CUDA 12.8"
 flavor_name = "n3-A100x8-NVLink"
 user = "ubuntu"
 
-[backend.hyperstack.multi-gpu-test]
-environment_name = "canada"
-image_name = "Ubuntu Server 22.04 LTS R570 CUDA 12.8"
-flavor_name = "n3-L40x4"
-user = "ubuntu"
-
-[backend.hyperstack.multi-gpu-test_fallback]
-environment_name = "canada"
-image_name = "Ubuntu Server 22.04 LTS R570 CUDA 12.8"
-flavor_name = "n3-RTX-A6000x2"
-user = "ubuntu"
-
 [backend.hyperstack.l40]
 environment_name = "canada"
 image_name = "Ubuntu Server 22.04 LTS R570 CUDA 12.8"
@@ -106,3 +94,9 @@ environment_name = "canada"
 image_name = "Ubuntu Server 22.04 LTS R570 CUDA 12.8"
 flavor_name = "n3-RTX-A6000x1"
 user = "ubuntu"
+
+[backend.hyperstack.4-l40]
+environment_name = "canada"
+image_name = "Ubuntu Server 22.04 LTS R570 CUDA 12.8"
+flavor_name = "n3-L40x4"
+user = "ubuntu"
```
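These slab.toml edits mirror the workflow changes above: the `multi-gpu-test` and `multi-gpu-test_fallback` profiles are removed, and the new `4-l40` profile (the same `n3-L40x4` flavor the old `multi-gpu-test` used) is what the updated workflows now request.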