docs(gpu): regroup gpu docs
docs(gpu): a small fix
@@ -28,7 +28,7 @@ Learn the basics of TFHE-rs, set it up, and make it run with ease.

 Start building with TFHE-rs by exploring its core features, discovering essential guides, and learning more with user-friendly tutorials.

-<table data-view="cards"><thead><tr><th></th><th></th><th></th><th data-hidden data-card-cover data-type="files"></th></tr></thead><tbody><tr><td><strong>FHE Computations</strong></td><td>Run FHE computation on encrypted data.</td><td><ul><li><a href="fhe-computation/types/">Types </a></li><li><a href="fhe-computation/operations/">Operations</a></li></ul></td><td><a href=".gitbook/assets/build1.png">build1.png</a></td></tr><tr><td><strong>Configuration</strong></td><td>Advanced configuration for better performance.</td><td><ul><li><a href="configuration/rust_configuration.md">Advanced Rust </a></li><li><a href="configuration/run_on_gpu.md">GPU acceleration</a></li></ul></td><td><a href=".gitbook/assets/build2.png">build2.png</a></td></tr><tr><td><strong>Integration</strong></td><td>Use TFHE-rs in different contexts or platforms..</td><td><ul><li><a href="integration/c_api.md">C API</a></li><li><a href="integration/js_on_wasm_api.md">JS on WASM API</a></li></ul></td><td><a href=".gitbook/assets/build3.png">build3.png</a></td></tr></tbody></table>
+<table data-view="cards"><thead><tr><th></th><th></th><th></th><th data-hidden data-card-cover data-type="files"></th></tr></thead><tbody><tr><td><strong>FHE Computations</strong></td><td>Run FHE computation on encrypted data.</td><td><ul><li><a href="fhe-computation/types/">Types </a></li><li><a href="fhe-computation/operations/">Operations</a></li></ul></td><td><a href=".gitbook/assets/build1.png">build1.png</a></td></tr><tr><td><strong>Configuration</strong></td><td>Advanced configuration for better performance.</td><td><ul><li><a href="configuration/rust_configuration.md">Advanced Rust </a></li><li><a href="configuration/gpu_acceleration/run_on_gpu.md">GPU acceleration</a></li></ul></td><td><a href=".gitbook/assets/build2.png">build2.png</a></td></tr><tr><td><strong>Integration</strong></td><td>Use TFHE-rs in different contexts or platforms..</td><td><ul><li><a href="integration/c_api.md">C API</a></li><li><a href="integration/js_on_wasm_api.md">JS on WASM API</a></li></ul></td><td><a href=".gitbook/assets/build3.png">build3.png</a></td></tr></tbody></table>

 ## Explore more

@@ -58,7 +58,12 @@
 ## Configuration

 * [Advanced Rust setup](configuration/rust_configuration.md)
-* [GPU acceleration](configuration/run_on_gpu.md)
+* [GPU acceleration](configuration/gpu_acceleration/run_on_gpu.md)
+  * [Operations](configuration/gpu_acceleration/gpu_operations.md)
+  * [Benchmark](configuration/gpu_acceleration/benchmark.md)
+  * [Compressing ciphertexts](configuration/gpu_acceleration/compressing_ciphertexts.md)
+  * [Array types](configuration/gpu_acceleration/array_type.md)
+  * [Multi-GPU support](configuration/gpu_acceleration/multi_gpu.md)
 * [Parallelized PBS](configuration/parallelized_pbs.md)

 ## Integration

76 tfhe/docs/configuration/gpu_acceleration/array_type.md Normal file
@@ -0,0 +1,76 @@

# Array types

This document explains how to use array types on GPU, just as [on CPU](../../fhe-computation/types/array.md).

Here is an example:

```rust
use tfhe::{ConfigBuilder, set_server_key, ClearArray, ClientKey, CompressedServerKey};
use tfhe::array::GpuFheUint32Array;
use tfhe::prelude::*;

fn main() {
    let config = ConfigBuilder::default().build();

    let cks = ClientKey::generate(config);
    let compressed_server_key = CompressedServerKey::new(&cks);

    let gpu_key = compressed_server_key.decompress_to_gpu();
    set_server_key(gpu_key);

    let num_elems = 4 * 4;
    let clear_xs = (0..num_elems as u32).collect::<Vec<_>>();
    let clear_ys = vec![1u32; num_elems];

    // Encrypted 2D array with values
    // [[ 0, 1, 2, 3]
    // [ 4, 5, 6, 7]
    // [ 8, 9, 10, 11]
    // [ 12, 13, 14, 15]]
    let xs = GpuFheUint32Array::try_encrypt((clear_xs.as_slice(), vec![4, 4]), &cks).unwrap();
    // Encrypted 2D array with values
    // [[ 1, 1, 1, 1]
    // [ 1, 1, 1, 1]
    // [ 1, 1, 1, 1]
    // [ 1, 1, 1, 1]]
    let ys = GpuFheUint32Array::try_encrypt((clear_ys.as_slice(), vec![4, 4]), &cks).unwrap();

    assert_eq!(xs.num_dim(), 2);
    assert_eq!(xs.shape(), &[4, 4]);
    assert_eq!(ys.num_dim(), 2);
    assert_eq!(ys.shape(), &[4, 4]);

    // Take a sub slice
    // [[ 10, 11]
    // [ 14, 15]]
    let xss = xs.slice(&[2..4, 2..4]);
    // Take a sub slice
    // [[ 1, 1]
    // [ 1, 1]]
    let yss = ys.slice(&[2..4, 2..4]);

    assert_eq!(xss.num_dim(), 2);
    assert_eq!(xss.shape(), &[2, 2]);
    assert_eq!(yss.num_dim(), 2);
    assert_eq!(yss.shape(), &[2, 2]);

    let r = &xss + &yss;

    // Result is
    // [[ 11, 12]
    // [ 15, 16]]
    let result: Vec<u32> = r.decrypt(&cks);
    assert_eq!(result, vec![11, 12, 15, 16]);

    // Clear 2D array with values
    // [[ 10, 20]
    // [ 30, 40]]
    let clear_array = ClearArray::new(vec![10u32, 20u32, 30u32, 40u32], vec![2, 2]);
    let r = &xss + &clear_array;

    // Result is
    // [[ 20, 31]
    // [ 44, 55]]
    let r: Vec<u32> = r.decrypt(&cks);
    assert_eq!(r, vec![20, 31, 44, 55]);
}
```

7 tfhe/docs/configuration/gpu_acceleration/benchmark.md Normal file
@@ -0,0 +1,7 @@

# Benchmarks

Please refer to the [GPU benchmarks](../../getting_started/benchmarks/gpu/README.md) for detailed performance benchmark results.

{% hint style="warning" %}
When measuring GPU execution times yourself on Linux, set the environment variable `CUDA_MODULE_LOADING=EAGER` to avoid CUDA API overheads during the first kernel execution.
{% endhint %}
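To make the hint concrete, here is a minimal timing sketch (not part of the original page; it assumes a Rust 2021 toolchain, where `std::env::set_var` is safe to call, and that the variable is set before the first CUDA call of the process - exporting `CUDA_MODULE_LOADING=EAGER` in the shell before launching the benchmark achieves the same thing):

```rust
use tfhe::{set_server_key, ClientKey, CompressedServerKey, ConfigBuilder, FheUint64};
use tfhe::prelude::*;

fn main() {
    // Ask the CUDA runtime to load modules eagerly; this has to happen before
    // the first CUDA call (setting it in the shell also works).
    std::env::set_var("CUDA_MODULE_LOADING", "EAGER");

    let config = ConfigBuilder::default().build();
    let client_key = ClientKey::generate(config);
    let gpu_key = CompressedServerKey::new(&client_key).decompress_to_gpu();
    set_server_key(gpu_key);

    let a = FheUint64::encrypt(42u64, &client_key);
    let b = FheUint64::encrypt(69u64, &client_key);

    // Warm up once, then time the operation you care about.
    let _warm_up = &a + &b;
    let start = std::time::Instant::now();
    let sum = &a + &b;
    println!("64-bit addition took {:?}", start.elapsed());

    let _decrypted: u64 = sum.decrypt(&client_key);
}
```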
71 tfhe/docs/configuration/gpu_acceleration/compressing_ciphertexts.md Normal file
@@ -0,0 +1,71 @@

# Compressing ciphertexts

This document explains how to compress ciphertexts using the GPU - even after homomorphic computations - just like on the [CPU](../../fhe-computation/data-handling/compress.md#compression-ciphertexts-after-some-homomorphic-computation).

Compressing ciphertexts after computation on the GPU is very similar to how it is done on the CPU. The following example shows how to compress and decompress a list containing 4 messages:

* One 32-bit integer
* One 64-bit integer
* One Boolean
* One 2-bit integer

```rust
use tfhe::prelude::*;
use tfhe::shortint::parameters::{
    COMP_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS, PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS,
};
use tfhe::{
    set_server_key, CompressedCiphertextList, CompressedCiphertextListBuilder, FheBool,
    FheInt64, FheUint16, FheUint2, FheUint32,
};

fn main() {
    let config =
        tfhe::ConfigBuilder::with_custom_parameters(PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS)
            .enable_compression(COMP_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS)
            .build();

    let ck = tfhe::ClientKey::generate(config);
    let compressed_server_key = tfhe::CompressedServerKey::new(&ck);
    let gpu_key = compressed_server_key.decompress_to_gpu();

    set_server_key(gpu_key);

    let ct1 = FheUint32::encrypt(17_u32, &ck);

    let ct2 = FheInt64::encrypt(-1i64, &ck);

    let ct3 = FheBool::encrypt(false, &ck);

    let ct4 = FheUint2::encrypt(3u8, &ck);

    let compressed_list = CompressedCiphertextListBuilder::new()
        .push(ct1)
        .push(ct2)
        .push(ct3)
        .push(ct4)
        .build()
        .unwrap();

    let serialized = bincode::serialize(&compressed_list).unwrap();

    println!("Serialized size: {} bytes", serialized.len());

    let compressed_list: CompressedCiphertextList = bincode::deserialize(&serialized).unwrap();

    let a: FheUint32 = compressed_list.get(0).unwrap().unwrap();
    let b: FheInt64 = compressed_list.get(1).unwrap().unwrap();
    let c: FheBool = compressed_list.get(2).unwrap().unwrap();
    let d: FheUint2 = compressed_list.get(3).unwrap().unwrap();

    let a: u32 = a.decrypt(&ck);
    assert_eq!(a, 17);
    let b: i64 = b.decrypt(&ck);
    assert_eq!(b, -1);
    let c = c.decrypt(&ck);
    assert!(!c);
    let d: u8 = d.decrypt(&ck);
    assert_eq!(d, 3);
}
```

39 tfhe/docs/configuration/gpu_acceleration/gpu_operations.md Normal file
@@ -0,0 +1,39 @@

# GPU operations
This document outlines the GPU operations supported in TFHE-rs.

The GPU backend includes the following operations for both signed and unsigned encrypted integers:

| name                               | symbol                | `Enc`/`Enc`          | `Enc`/ `Int`               |
|------------------------------------|-----------------------|----------------------|----------------------------|
| Neg                                | `-`                   | :heavy\_check\_mark: | N/A                        |
| Add                                | `+`                   | :heavy\_check\_mark: | :heavy\_check\_mark:       |
| Sub                                | `-`                   | :heavy\_check\_mark: | :heavy\_check\_mark:       |
| Mul                                | `*`                   | :heavy\_check\_mark: | :heavy\_check\_mark:       |
| Div                                | `/`                   | :heavy\_check\_mark: | :heavy\_check\_mark:       |
| Rem                                | `%`                   | :heavy\_check\_mark: | :heavy\_check\_mark:       |
| Not                                | `!`                   | :heavy\_check\_mark: | N/A                        |
| BitAnd                             | `&`                   | :heavy\_check\_mark: | :heavy\_check\_mark:       |
| BitOr                              | `\|`                  | :heavy\_check\_mark: | :heavy\_check\_mark:       |
| BitXor                             | `^`                   | :heavy\_check\_mark: | :heavy\_check\_mark:       |
| Shr                                | `>>`                  | :heavy\_check\_mark: | :heavy\_check\_mark:       |
| Shl                                | `<<`                  | :heavy\_check\_mark: | :heavy\_check\_mark:       |
| Rotate right                       | `rotate_right`        | :heavy\_check\_mark: | :heavy\_check\_mark:       |
| Rotate left                        | `rotate_left`         | :heavy\_check\_mark: | :heavy\_check\_mark:       |
| Min                                | `min`                 | :heavy\_check\_mark: | :heavy\_check\_mark:       |
| Max                                | `max`                 | :heavy\_check\_mark: | :heavy\_check\_mark:       |
| Greater than                       | `gt`                  | :heavy\_check\_mark: | :heavy\_check\_mark:       |
| Greater or equal                   | `ge`                  | :heavy\_check\_mark: | :heavy\_check\_mark:       |
| Lower than                         | `lt`                  | :heavy\_check\_mark: | :heavy\_check\_mark:       |
| Lower or equal                     | `le`                  | :heavy\_check\_mark: | :heavy\_check\_mark:       |
| Equal                              | `eq`                  | :heavy\_check\_mark: | :heavy\_check\_mark:       |
| Not Equal                          | `ne`                  | :heavy\_check\_mark: | :heavy\_check\_mark:       |
| Cast (into dest type)              | `cast_into`           | :heavy\_check\_mark: | N/A                        |
| Cast (from src type)               | `cast_from`           | :heavy\_check\_mark: | N/A                        |
| Ternary operator                   | `select`              | :heavy\_check\_mark: | :heavy\_multiplication\_x: |
| Integer logarithm                  | `ilog2`               | :heavy\_check\_mark: | N/A                        |
| Count trailing/leading zeros/ones  | `count_leading_zeros` | :heavy\_check\_mark: | N/A                        |
| Oblivious Pseudo Random Generation | `oprf`                | :heavy\_check\_mark: | N/A                        |

{% hint style="info" %}
All operations follow the same syntax as the one described [here](../../fhe-computation/operations/README.md).
{% endhint %}
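As an illustration (a sketch that is not part of the original page; the key setup simply mirrors the GPU quick-start example, and `ConfigBuilder::default()` picks whatever GPU parameters the library ships), a few of the operators listed above can be exercised through the same high-level API:

```rust
use tfhe::{set_server_key, ClientKey, CompressedServerKey, ConfigBuilder, FheUint32};
use tfhe::prelude::*;

fn main() {
    // Same key setup as in the GPU quick-start: generate on the client,
    // decompress the server key onto the GPU, then register it.
    let config = ConfigBuilder::default().build();
    let client_key = ClientKey::generate(config);
    let gpu_key = CompressedServerKey::new(&client_key).decompress_to_gpu();
    set_server_key(gpu_key);

    let a = FheUint32::encrypt(13u32, &client_key);
    let b = FheUint32::encrypt(37u32, &client_key);

    // A few of the operations listed above, all executed on the GPU.
    let sum = &a + &b;         // Add (Enc/Enc)
    let shifted = &a << 2u32;  // Shl (Enc/Int)
    let minimum = a.min(&b);   // Min
    let is_greater = a.gt(&b); // Greater than, returns an encrypted boolean

    let sum: u32 = sum.decrypt(&client_key);
    let shifted: u32 = shifted.decrypt(&client_key);
    let minimum: u32 = minimum.decrypt(&client_key);
    let is_greater = is_greater.decrypt(&client_key);

    assert_eq!(sum, 13 + 37);
    assert_eq!(shifted, 13 << 2);
    assert_eq!(minimum, 13);
    assert!(!is_greater);
}
```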
@@ -1,7 +1,16 @@
-This guide walks through a practical example of performing a large batch of encrypted 64-bit additions using manual GPU
+# Multi-GPU support
+This guide explains the multi-GPU support of TFHE-rs, and walks through a practical example of performing a large batch of encrypted 64-bit additions using manual GPU
 dispatching to improve the performance.

-# Improving throughput on multiple-GPUs
+## Multi-GPU support overview
+
+TFHE-rs supports platforms with multiple GPUs. There is **nothing to change in the code to execute on such platforms**. To keep the API as user-friendly as possible, the configuration is automatically set, i.e., the user has no fine-grained control over the number of GPUs to be used.
+However, you can decide to have operations be executed on a single GPU of your choice.
+In many cases this provides better throughput than using all the available GPUs to perform the operation.
+Indeed, except for integer precisions above 64 bits and for the multiplication, which involves many bootstrap computations in parallel, most operations on up to 64 bits do not necessitate the full power of a GPU.
+You will then be able to maximize throughput on multiple GPUs with TFHE-rs.
+
+## Improving throughput on multiple GPUs

 By default, when multiple GPUs are available on the machine, TFHE-rs automatically uses them all
 to perform encrypted operations. Under the hood, it includes a hard-coded logic to dispatch work across all the GPUs and to copy essential data—like the server key—to each GPU.
@@ -9,13 +18,13 @@ This approach is efficient for operations that load the GPU extensively (e.g. th
 but not so much for smaller operations like the encrypted addition or comparison on 64-bits.
 To address this, TFHE-rs also provides a mechanism to manually select which GPU to operate on.

-## Dispatch operations on the GPUs of your choice
+### Dispatch operations on the GPUs of your choice

 When selecting a specific GPU to execute on, there are two essential requirements that are different from a default GPU execution:
 - You must create a GPU server key on each GPU individually.
 - The batch of operations must be distributed on all the GPUs manually.

-### Step 1: Decompress the server key to each GPU
+#### Step 1: Decompress the server key to each GPU
 Instead of a single server key being used across all GPUs automatically, you'll need to specifically decompress the server key to each GPU, so that the key is available in memory.
 For example, by default, the GPU server key is decompressed and loaded onto all available GPUs automatically as follows:
 ```rust
@@ -51,7 +60,7 @@ fn main() {
         .collect::<Vec<_>>();
 }
 ```
-### Step 2: Define the inputs to operate on
+#### Step 2: Define the inputs to operate on
 We will be doing 100 additions in parallel on each GPU:
 ```rust
 use tfhe::{ConfigBuilder, set_server_key, ClientKey, CompressedServerKey, FheUint64, GpuIndex};
@@ -83,7 +92,7 @@ fn main() {
 ```
 At this stage, the left and right inputs reside on the CPU. They have not yet been copied to the GPU.

-### Step3: Dispatch the workloads
+#### Step 3: Dispatch the workloads
 Now you need to split the calculation into as many chunks as there are GPUs.
 TFHE-rs allows you to execute additions in parallel across multiple GPUs by leveraging [CUDA streams](https://developer.nvidia.com/blog/gpu-pro-tip-cuda-7-streams-simplify-concurrency/).
 CUDA stream management is not explicit in the High-Level (HL) API of TFHE-rs: streams are implicitly
@@ -142,7 +151,7 @@ In this example, `par_chunks` divides the input vectors into `num_gpus` chunks
 It's important to note that, in this example, when using the `+` operator on encrypted inputs, data is first transferred from the CPU to the GPU before computation; the result then resides on GPU `i`.
 You can learn more about how to inspect on which GPU a piece of data resides from the examples in this file: `tfhe/src/high_level_api/tests/gpu_selection.rs`.

-## Going beyond: Restrict the number of CUDA streams
+### Going beyond: Restrict the number of CUDA streams

 While the behavior of `.par_iter()` in TFHE-rs' HL API aligns with expectations and provides parallelism over encrypted data, it can become a performance bottleneck in some cases. This is due to the way CUDA streams are managed.
 CUDA streams allow for parallel execution on the GPU, but when too many are created, scheduling becomes inefficient. Instead of running in parallel, operations may fall back to sequential execution. In practice, having more than 10 streams already starts to negatively impact throughput.
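For orientation, here is a condensed sketch of the three steps above. It is not the tutorial's code: the number of GPUs is hard-coded, and the per-GPU decompression call and the `GpuIndex` constructor are written with hypothetical names and signatures (`decompress_to_specific_gpu`, `GpuIndex::new`), so check the actual API of your TFHE-rs version (for example in `tfhe/src/high_level_api/tests/gpu_selection.rs`) before relying on it.

```rust
use rayon::prelude::*;
use tfhe::prelude::*;
use tfhe::{set_server_key, ClientKey, CompressedServerKey, ConfigBuilder, FheUint64, GpuIndex};

fn main() {
    let config = ConfigBuilder::default().build();
    let client_key = ClientKey::generate(config);
    let compressed_server_key = CompressedServerKey::new(&client_key);

    // Step 1: one decompressed server key per GPU. `decompress_to_specific_gpu`
    // and `GpuIndex::new` are hypothetical stand-ins for the real per-GPU calls.
    let num_gpus = 2; // replace with the actual number of GPUs on the machine
    let gpu_keys = (0..num_gpus)
        .map(|i| compressed_server_key.decompress_to_specific_gpu(GpuIndex::new(i as u32)))
        .collect::<Vec<_>>();

    // Step 2: the inputs to operate on, still on the CPU at this point.
    let lhs = (0..100 * num_gpus)
        .map(|i| FheUint64::encrypt(i as u64, &client_key))
        .collect::<Vec<_>>();
    let rhs = (0..100 * num_gpus)
        .map(|i| FheUint64::encrypt((i + 1) as u64, &client_key))
        .collect::<Vec<_>>();

    // Step 3: one chunk per GPU; each rayon task registers the key of "its" GPU
    // before operating, so the additions of that chunk run on that GPU.
    let results: Vec<Vec<FheUint64>> = lhs
        .par_chunks(100)
        .zip(rhs.par_chunks(100))
        .enumerate()
        .map(|(gpu_id, (left, right))| {
            set_server_key(gpu_keys[gpu_id].clone());
            left.iter().zip(right.iter()).map(|(l, r)| l + r).collect()
        })
        .collect();

    let first: u64 = results[0][0].decrypt(&client_key);
    assert_eq!(first, 1);
}
```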
124 tfhe/docs/configuration/gpu_acceleration/run_on_gpu.md Normal file
@@ -0,0 +1,124 @@

# GPU acceleration

This guide explains how to update your existing program to leverage GPU acceleration, or to start a new program using GPU.

**TFHE-rs** now supports a GPU backend with CUDA implementation, enabling integer arithmetic operations on encrypted data.

## Prerequisites

* CUDA version >= 10
* Compute Capability >= 3.0
* [gcc](https://gcc.gnu.org/) >= 8.0 - check this [page](https://gist.github.com/ax3l/9489132) for more details about nvcc/gcc compatible versions
* [cmake](https://cmake.org/) >= 3.24
* libclang >= 9.0, to match Rust bindgen [requirements](https://rust-lang.github.io/rust-bindgen/requirements.html)
* Rust version - check this [page](../rust_configuration.md)

## Importing to your project

To use the **TFHE-rs** GPU backend in your project, add the following dependency in your `Cargo.toml`.

```toml
tfhe = { version = "~1.1.0", features = ["boolean", "shortint", "integer", "gpu"] }
```

{% hint style="success" %}
For optimal performance when using **TFHE-rs**, run your code in release mode with the `--release` flag.
{% endhint %}

### Supported platforms

**TFHE-rs** GPU backend is supported on Linux (x86, aarch64).

| OS      | x86         | aarch64       |
| ------- | ----------- | ------------- |
| Linux   | Supported   | Supported\*   |
| macOS   | Unsupported | Unsupported\* |
| Windows | Unsupported | Unsupported   |

## A first example

### Configuring and creating keys

Compared to the [CPU example](../../getting_started/quick_start.md), the GPU setup differs in the key creation, as detailed [here](run\_on\_gpu.md#setting-the-keys).

Here is a full example (combining the client and server parts):

```rust
use tfhe::{ConfigBuilder, set_server_key, FheUint8, ClientKey, CompressedServerKey};
use tfhe::prelude::*;

fn main() {

    let config = ConfigBuilder::default().build();

    let client_key = ClientKey::generate(config);
    let compressed_server_key = CompressedServerKey::new(&client_key);

    let gpu_key = compressed_server_key.decompress_to_gpu();

    let clear_a = 27u8;
    let clear_b = 128u8;

    let a = FheUint8::encrypt(clear_a, &client_key);
    let b = FheUint8::encrypt(clear_b, &client_key);

    //Server-side

    set_server_key(gpu_key);
    let result = a + b;

    //Client-side
    let decrypted_result: u8 = result.decrypt(&client_key);

    let clear_result = clear_a + clear_b;

    assert_eq!(decrypted_result, clear_result);
}
```

Beware that when the GPU feature is activated, calling `let config = ConfigBuilder::default().build();` selects cryptographic parameters that differ from the CPU ones used when the GPU feature is not activated. Indeed, TFHE-rs uses dedicated parameters for the GPU in order to achieve better performance.

### Setting the keys

The key configuration differs from the CPU one. More precisely, while both client and server keys are still generated by the client (which is assumed to run on a CPU), the server key then has to be decompressed by the server to be converted into the right format. To do so, the server should call `decompress_to_gpu()`.

Once decompressed, the operations between CPU and GPU are identical.

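As a sketch of this client/server split (not part of the original page; it assumes `bincode` is used for serialization, as in the ciphertext-compression example), the client can ship the compressed server key and the server can decompress it onto the GPU:

```rust
use tfhe::{set_server_key, ClientKey, CompressedServerKey, ConfigBuilder};

fn main() {
    // Client side: generate the keys and serialize the compressed server key.
    let config = ConfigBuilder::default().build();
    let client_key = ClientKey::generate(config);
    let compressed_server_key = CompressedServerKey::new(&client_key);
    let payload = bincode::serialize(&compressed_server_key).unwrap();

    // Server side: deserialize, decompress onto the GPU, then register the key.
    let received: CompressedServerKey = bincode::deserialize(&payload).unwrap();
    let gpu_key = received.decompress_to_gpu();
    set_server_key(gpu_key);
}
```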
### Encryption

On the client-side, the method to encrypt the data is exactly the same as the CPU one, as shown in the following example:

```Rust
let clear_a = 27u8;
let clear_b = 128u8;

let a = FheUint8::encrypt(clear_a, &client_key);
let b = FheUint8::encrypt(clear_b, &client_key);
```

### Computation

The server first needs to set up its key with `set_server_key(gpu_key)`.

Then, homomorphic computations are performed using the same approach as the [CPU operations](../../fhe-computation/operations/README.md).

```Rust
//Server-side
set_server_key(gpu_key);
let result = a + b;

//Client-side
let decrypted_result: u8 = result.decrypt(&client_key);

let clear_result = clear_a + clear_b;

assert_eq!(decrypted_result, clear_result);
```

### Decryption

Finally, the client decrypts the results using:

```Rust
let decrypted_result: u8 = result.decrypt(&client_key);
```

tfhe/docs/configuration/run_on_gpu.md
@@ -1,328 +0,0 @@

# GPU acceleration

This guide explains how to update your existing program to leverage GPU acceleration, or to start a new program using GPU.

**TFHE-rs** now supports a GPU backend with CUDA implementation, enabling integer arithmetic operations on encrypted data.

## Prerequisites

* Cuda version >= 10
* Compute Capability >= 3.0
* [gcc](https://gcc.gnu.org/) >= 8.0 - check this [page](https://gist.github.com/ax3l/9489132) for more details about nvcc/gcc compatible versions
* [cmake](https://cmake.org/) >= 3.24
* libclang, to match Rust bingen [requirements](https://rust-lang.github.io/rust-bindgen/requirements.html) >= 9.0
* Rust version - check this [page](rust_configuration.md)

## Importing to your project

To use the **TFHE-rs** GPU backend in your project, add the following dependency in your `Cargo.toml`.

```toml
tfhe = { version = "~1.1.0", features = ["boolean", "shortint", "integer", "gpu"] }
```

{% hint style="success" %}
For optimal performance when using **TFHE-rs**, run your code in release mode with the `--release` flag.
{% endhint %}

### Supported platforms

**TFHE-rs** GPU backend is supported on Linux (x86, aarch64).

| OS      | x86         | aarch64       |
| ------- | ----------- | ------------- |
| Linux   | Supported   | Supported\*   |
| macOS   | Unsupported | Unsupported\* |
| Windows | Unsupported | Unsupported   |

## A first example

### Configuring and creating keys.

Comparing to the [CPU example](../getting_started/quick_start.md), GPU set up differs in the key creation, as detailed [here](run\_on\_gpu.md#setting-the-keys)

Here is a full example (combining the client and server parts):

```rust
use tfhe::{ConfigBuilder, set_server_key, FheUint8, ClientKey, CompressedServerKey};
use tfhe::prelude::*;

fn main() {

    let config = ConfigBuilder::default().build();

    let client_key= ClientKey::generate(config);
    let compressed_server_key = CompressedServerKey::new(&client_key);

    let gpu_key = compressed_server_key.decompress_to_gpu();

    let clear_a = 27u8;
    let clear_b = 128u8;

    let a = FheUint8::encrypt(clear_a, &client_key);
    let b = FheUint8::encrypt(clear_b, &client_key);

    //Server-side

    set_server_key(gpu_key);
    let result = a + b;

    //Client-side
    let decrypted_result: u8 = result.decrypt(&client_key);

    let clear_result = clear_a + clear_b;

    assert_eq!(decrypted_result, clear_result);
}
```

Beware that when the GPU feature is activated, when calling: `let config = ConfigBuilder::default().build();`, the cryptographic parameters differ from the CPU ones, used when the GPU feature is not activated. Indeed, TFHE-rs uses dedicated parameters for the GPU in order to achieve better performance.

### Setting the keys

The configuration of the key is different from the CPU. More precisely, if both client and server keys are still generated by the client (which is assumed to run on a CPU), the server key has then to be decompressed by the server to be converted into the right format. To do so, the server should run this function: `decompressed_to_gpu()`.

Once decompressed, the operations between CPU and GPU are identical.

### Encryption

On the client-side, the method to encrypt the data is exactly the same than the CPU one, as shown in the following example:

```Rust
let clear_a = 27u8;
let clear_b = 128u8;

let a = FheUint8::encrypt(clear_a, &client_key);
let b = FheUint8::encrypt(clear_b, &client_key);
```

### Computation

The server first need to set up its keys with `set_server_key(gpu_key)`.

Then, homomorphic computations are performed using the same approach as the [CPU operations](../fhe-computation/operations/README.md).

```Rust
//Server-side
set_server_key(gpu_key);
let result = a + b;

//Client-side
let decrypted_result: u8 = result.decrypt(&client_key);

let clear_result = clear_a + clear_b;

assert_eq!(decrypted_result, clear_result);
```

### Decryption

Finally, the client decrypts the results using:

```Rust
let decrypted_result: u8 = result.decrypt(&client_key);
```

## List of available operations

The GPU backend includes the following operations for both signed and unsigned encrypted integers:

| name                               | symbol                | `Enc`/`Enc`          | `Enc`/ `Int`               |
|------------------------------------|-----------------------|----------------------|----------------------------|
| Neg                                | `-`                   | :heavy\_check\_mark: | N/A                        |
| Add                                | `+`                   | :heavy\_check\_mark: | :heavy\_check\_mark:       |
| Sub                                | `-`                   | :heavy\_check\_mark: | :heavy\_check\_mark:       |
| Mul                                | `*`                   | :heavy\_check\_mark: | :heavy\_check\_mark:       |
| Div                                | `/`                   | :heavy\_check\_mark: | :heavy\_check\_mark:       |
| Rem                                | `%`                   | :heavy\_check\_mark: | :heavy\_check\_mark:       |
| Not                                | `!`                   | :heavy\_check\_mark: | N/A                        |
| BitAnd                             | `&`                   | :heavy\_check\_mark: | :heavy\_check\_mark:       |
| BitOr                              | `\|`                  | :heavy\_check\_mark: | :heavy\_check\_mark:       |
| BitXor                             | `^`                   | :heavy\_check\_mark: | :heavy\_check\_mark:       |
| Shr                                | `>>`                  | :heavy\_check\_mark: | :heavy\_check\_mark:       |
| Shl                                | `<<`                  | :heavy\_check\_mark: | :heavy\_check\_mark:       |
| Rotate right                       | `rotate_right`        | :heavy\_check\_mark: | :heavy\_check\_mark:       |
| Rotate left                        | `rotate_left`         | :heavy\_check\_mark: | :heavy\_check\_mark:       |
| Min                                | `min`                 | :heavy\_check\_mark: | :heavy\_check\_mark:       |
| Max                                | `max`                 | :heavy\_check\_mark: | :heavy\_check\_mark:       |
| Greater than                       | `gt`                  | :heavy\_check\_mark: | :heavy\_check\_mark:       |
| Greater or equal than              | `ge`                  | :heavy\_check\_mark: | :heavy\_check\_mark:       |
| Lower than                         | `lt`                  | :heavy\_check\_mark: | :heavy\_check\_mark:       |
| Lower or equal than                | `le`                  | :heavy\_check\_mark: | :heavy\_check\_mark:       |
| Equal                              | `eq`                  | :heavy\_check\_mark: | :heavy\_check\_mark:       |
| Not Equal                          | `ne`                  | :heavy\_check\_mark: | :heavy\_check\_mark:       |
| Cast (into dest type)              | `cast_into`           | :heavy\_check\_mark: | N/A                        |
| Cast (from src type)               | `cast_from`           | :heavy\_check\_mark: | N/A                        |
| Ternary operator                   | `select`              | :heavy\_check\_mark: | :heavy\_multiplication\_x: |
| Integer logarithm                  | `ilog2`               | :heavy\_check\_mark: | N/A                        |
| Count trailing/leading zeros/ones  | `count_leading_zeros` | :heavy\_check\_mark: | N/A                        |
| Oblivious Pseudo Random Generation | `oprf`                | :heavy\_check\_mark: | N/A                        |

{% hint style="info" %}
All operations follow the same syntax than the one described in [here](../fhe-computation/operations/README.md).
{% endhint %}

## Multi-GPU support

TFHE-rs supports platforms with multiple GPUs. There is **nothing to change in the code to execute on such platforms**. To keep the API as user-friendly as possible, the configuration is automatically set, i.e., the user has no fine-grained control over the number of GPUs to be used.
However, you can decide to have operations be executed on a single GPU of your choice.
In many cases this provides better throughput than using all the available GPUs to perform the operation.
Indeed, except for integer precisions above 64 bits and for the multiplication, which involves many bootstrap computations in parallel, most operations on up to 64 bits do not necessitate the full power of a GPU.
To go further, you can learn how to select specific GPUs to perform batches of operations in this [tutorial](../tutorials/multi_gpu_device_selection.md).
You will then be able to maximize throughput on multiple GPUs with TFHE-rs.

## Benchmark

Please refer to the [GPU benchmarks](../getting_started/benchmarks/gpu/README.md) for detailed performance benchmark results.

## Warning

When measuring GPU times on your own on Linux, set the environment variable `CUDA_MODULE_LOADING=EAGER` to avoid CUDA API overheads during the first kernel execution.

## Compressing ciphertexts after some homomorphic computation on the GPU

You can compress ciphertexts using the GPU, even after computations, just like on the [CPU](../fhe-computation/data-handling/compress.md#compression-ciphertexts-after-some-homomorphic-computation).

The way to do it is very similar to how it's done on the CPU. The following example shows how to compress and decompress a list containing 4 messages:

* One 32-bits integer
* One 64-bit integer
* One Boolean
* One 2-bit integer

```rust
use tfhe::prelude::*;
use tfhe::shortint::parameters::{
    COMP_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS, PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS,
};
use tfhe::{
    set_server_key, CompressedCiphertextList, CompressedCiphertextListBuilder, FheBool,
    FheInt64, FheUint16, FheUint2, FheUint32,
};

fn main() {
    let config =
        tfhe::ConfigBuilder::with_custom_parameters(PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS)
            .enable_compression(COMP_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS)
            .build();

    let ck = tfhe::ClientKey::generate(config);
    let compressed_server_key = tfhe::CompressedServerKey::new(&ck);
    let gpu_key = compressed_server_key.decompress_to_gpu();

    set_server_key(gpu_key);

    let ct1 = FheUint32::encrypt(17_u32, &ck);

    let ct2 = FheInt64::encrypt(-1i64, &ck);

    let ct3 = FheBool::encrypt(false, &ck);

    let ct4 = FheUint2::encrypt(3u8, &ck);

    let compressed_list = CompressedCiphertextListBuilder::new()
        .push(ct1)
        .push(ct2)
        .push(ct3)
        .push(ct4)
        .build()
        .unwrap();

    let serialized = bincode::serialize(&compressed_list).unwrap();

    println!("Serialized size: {} bytes", serialized.len());

    let compressed_list: CompressedCiphertextList = bincode::deserialize(&serialized).unwrap();

    let a: FheUint32 = compressed_list.get(0).unwrap().unwrap();
    let b: FheInt64 = compressed_list.get(1).unwrap().unwrap();
    let c: FheBool = compressed_list.get(2).unwrap().unwrap();
    let d: FheUint2 = compressed_list.get(3).unwrap().unwrap();

    let a: u32 = a.decrypt(&ck);
    assert_eq!(a, 17);
    let b: i64 = b.decrypt(&ck);
    assert_eq!(b, -1);
    let c = c.decrypt(&ck);
    assert!(!c);
    let d: u8 = d.decrypt(&ck);
    assert_eq!(d, 3);
}
```

## Array types

It is possible to use array types on GPU, just as [on CPU](../fhe-computation/types/array.md). Here is an example showing how to do it:

```rust
use tfhe::{ConfigBuilder, set_server_key, ClearArray, ClientKey, CompressedServerKey};
use tfhe::array::GpuFheUint32Array;
use tfhe::prelude::*;

fn main() {
    let config = ConfigBuilder::default().build();

    let cks = ClientKey::generate(config);
    let compressed_server_key = CompressedServerKey::new(&cks);

    let gpu_key = compressed_server_key.decompress_to_gpu();
    set_server_key(gpu_key);

    let num_elems = 4 * 4;
    let clear_xs = (0..num_elems as u32).collect::<Vec<_>>();
    let clear_ys = vec![1u32; num_elems];

    // Encrypted 2D array with values
    // [[ 0, 1, 2, 3]
    // [ 4, 5, 6, 7]
    // [ 8, 9, 10, 11]
    // [ 12, 13, 14, 15]]
    let xs = GpuFheUint32Array::try_encrypt((clear_xs.as_slice(), vec![4, 4]), &cks).unwrap();
    // Encrypted 2D array with values
    // [[ 1, 1, 1, 1]
    // [ 1, 1, 1, 1]
    // [ 1, 1, 1, 1]
    // [ 1, 1, 1, 1]]
    let ys = GpuFheUint32Array::try_encrypt((clear_ys.as_slice(), vec![4, 4]), &cks).unwrap();

    assert_eq!(xs.num_dim(), 2);
    assert_eq!(xs.shape(), &[4, 4]);
    assert_eq!(ys.num_dim(), 2);
    assert_eq!(ys.shape(), &[4, 4]);

    // Take a sub slice
    // [[ 10, 11]
    // [ 14, 15]]
    let xss = xs.slice(&[2..4, 2..4]);
    // Take a sub slice
    // [[ 1, 1]
    // [ 1, 1]]
    let yss = ys.slice(&[2..4, 2..4]);

    assert_eq!(xss.num_dim(), 2);
    assert_eq!(xss.shape(), &[2, 2]);
    assert_eq!(yss.num_dim(), 2);
    assert_eq!(yss.shape(), &[2, 2]);

    let r = &xss + &yss;

    // Result is
    // [[ 11, 12]
    // [ 15, 16]]
    let result: Vec<u32> = r.decrypt(&cks);
    assert_eq!(result, vec![11, 12, 15, 16]);

    // Clear 2D array with values
    // [[ 10, 20]
    // [ 30, 40]]
    let clear_array = ClearArray::new(vec![10u32, 20u32, 30u32, 40u32], vec![2, 2]);
    let r = &xss + &clear_array;

    // Result is
    // [[ 20, 31]
    // [ 44, 55]]
    let r: Vec<u32> = r.decrypt(&cks);
    assert_eq!(r, vec![20, 31, 44, 55]);
}
```

@@ -32,7 +32,7 @@ The next table shows the operation timings on CPU when the left input is encrypt

 All timings are based on parallelized Radix-based integer operations where each block is encrypted using the default parameters `PARAM_MESSAGE_2_CARRY_2_KS_PBS`. To ensure predictable timings, we perform operations in the `default` mode, which ensures that the input and output encoding are similar (i.e., the carries are always emptied).

-You can minimize operational costs by selecting from 'unchecked', 'checked', or 'smart' modes from [the fine-grained APIs](../../../references/fine-grained-apis/quick_start.md), each balancing performance and correctness differently. For more details about parameters, see [here](../../../references/fine-grained-apis/shortint/parameters.md). You can find the benchmark results on GPU for all these operations [here](../../../configuration/run_on_gpu.md#benchmark).
+You can minimize operational costs by selecting from 'unchecked', 'checked', or 'smart' modes from [the fine-grained APIs](../../../references/fine-grained-apis/quick_start.md), each balancing performance and correctness differently. For more details about parameters, see [here](../../../references/fine-grained-apis/shortint/parameters.md). You can find the benchmark results on GPU for all these operations [here](../../../configuration/gpu_acceleration/benchmark.md).

 ## Reproducing TFHE-rs benchmarks

@@ -216,11 +216,27 @@ mod test_gpu_doc {
     use doc_comment::doctest;

     doctest!(
-        "../docs/configuration/run_on_gpu.md",
-        configuration_run_on_gpu
+        "../docs/configuration/gpu_acceleration/run_on_gpu.md",
+        configuration_gpu_acceleration_run_on_gpu
     );
     doctest!(
-        "../docs/tutorials/multi_gpu_device_selection.md",
-        tutorials_multi_gpu_device_selection
+        "../docs/configuration/gpu_acceleration/gpu_operations.md",
+        configuration_gpu_acceleration_gpu_operations
     );
+    doctest!(
+        "../docs/configuration/gpu_acceleration/compressing_ciphertexts.md",
+        configuration_gpu_acceleration_compressing_ciphertexts
+    );
+    doctest!(
+        "../docs/configuration/gpu_acceleration/array_type.md",
+        configuration_gpu_acceleration_array_type
+    );
+    doctest!(
+        "../docs/configuration/gpu_acceleration/benchmark.md",
+        configuration_gpu_acceleration_benchmark
+    );
+    doctest!(
+        "../docs/configuration/gpu_acceleration/multi_gpu.md",
+        configuration_gpu_acceleration_multi_gpu_device_selection
+    );
 }