Compare commits

..

13 Commits

Author SHA1 Message Date
Guillermo Oyarzun
f3f05f9068 refactor(gpu): random test fft 2024-06-27 10:10:13 +02:00
Agnes Leroy
3a2bb4470f fix(gpu): fix gpu index in casts, scalar comparison, scalar mul, etc. 2024-06-27 10:08:11 +02:00
Beka Barbakadze
6120fab886 feat(gpu): Implement propagate_single_carry_get_input_carries 2024-06-26 17:34:28 +02:00
Agnes Leroy
53b68619b0 chore(gpu): call nvidia-smi before launching tests on hyperstack 2024-06-26 16:47:29 +02:00
Guillermo Oyarzun
e854823233 refactor(gpu): speedup twiddles reads 2024-06-26 11:30:05 +02:00
sarah el kazdadi
19e00c484b feat(zk): zk perf improvements 2024-06-26 11:24:11 +02:00
David Testé
818e480dac chore(ci): publish only one tag for npm packages
NPM doesn't accept tags that are similar to a semantic-version
compatible string (e.g 0.7.0 or v0.7). We only publish "latest"
tag on release manager discretion.
2024-06-26 09:06:26 +02:00
David Testé
a7fc8a90e1 chore(ci): run build workflow on large windows instance 2024-06-25 18:17:26 +02:00
David Testé
3fad6d194c chore(ci): avoid cancel ongoing benchmarks on main branch 2024-06-25 17:46:24 +02:00
David Testé
23efcb8dd4 chore(bench): fix benchmark naming format for shortint 2024-06-25 17:46:07 +02:00
David Testé
33c69d9d1f chore(ci): update slab-github-runner action 2024-06-25 12:00:12 +02:00
David Testé
960d287e92 chore(bench): fix display name for gpu unsigned integer operations 2024-06-25 11:59:08 +02:00
Nicolas Sarlin
662e5402a3 chore(doc): add missing doc for a data breaking change 2024-06-24 16:09:26 +02:00
44 changed files with 1287 additions and 605 deletions

View File

@@ -3,6 +3,7 @@ self-hosted-runner:
labels:
- m1mac
- 4090-desktop
- large_windows_16_latest
# Configuration variables in array of strings defined in your repository or
# organization. `null` means disabling configuration variables check.
# Empty array means no configuration variable is allowed.

View File

@@ -26,7 +26,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -140,7 +140,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -26,7 +26,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -187,7 +187,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -30,7 +30,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -93,7 +93,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -26,7 +26,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -117,7 +117,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -30,7 +30,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -97,7 +97,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -129,7 +129,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -235,7 +235,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -27,7 +27,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -90,7 +90,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -19,7 +19,7 @@ jobs:
strategy:
matrix:
os: [ubuntu-latest, macos-latest-large, windows-latest]
os: [ubuntu-latest, macos-latest-large, large_windows_16_latest]
fail-fast: false
steps:

View File

@@ -25,7 +25,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -125,7 +125,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -27,7 +27,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -175,7 +175,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -27,7 +27,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -78,7 +78,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -27,7 +27,7 @@ jobs:
if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' || contains(github.event.label.name, '4090_bench') }}
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}_cuda_integer_bench
cancel-in-progress: true
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
runs-on: ["self-hosted", "4090-desktop"]
timeout-minutes: 1440 # 24 hours
strategy:
@@ -114,7 +114,7 @@ jobs:
needs: cuda-integer-benchmarks
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}_cuda_core_crypto_bench
cancel-in-progress: true
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
runs-on: ["self-hosted", "4090-desktop"]
timeout-minutes: 1440 # 24 hours

View File

@@ -26,7 +26,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -106,6 +106,10 @@ jobs:
echo "HOME=/home/ubuntu";
} >> "${GITHUB_ENV}"
- name:
if: ${{ !cancelled() }}
run: nvidia-smi
- name: Run core crypto, integer and internal CUDA backend tests
run: |
make test_gpu
@@ -143,7 +147,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -30,7 +30,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -187,7 +187,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -29,7 +29,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -180,7 +180,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -30,7 +30,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -189,7 +189,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -28,7 +28,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -164,7 +164,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -29,7 +29,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -168,7 +168,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -40,8 +40,9 @@ jobs:
fetch-depth: 0
- name: Create NPM version tag
if: ${{ inputs.npm_latest_tag }}
run: |
echo "NPM_TAG=$(sed -n -e '1,/^version/p' tfhe/Cargo.toml | grep '^version[[:space:]]*=' | cut -d '=' -f 2 | tr -d ' ')" >> "${GITHUB_ENV}"
echo "NPM_TAG=latest" >> "${GITHUB_ENV}"
- name: Publish crate.io package
if: ${{ inputs.push_to_crates }}
@@ -65,14 +66,6 @@ jobs:
dry-run: ${{ inputs.dry_run }}
tag: ${{ env.NPM_TAG }}
- name: Publish web package as latest
if: ${{ inputs.push_web_package && inputs.npm_latest_tag }}
uses: JS-DevTools/npm-publish@19c28f1ef146469e409470805ea4279d47c3d35c
with:
token: ${{ secrets.NPM_TOKEN }}
package: tfhe/pkg/package.json
dry-run: ${{ inputs.dry_run }}
- name: Build Node package
if: ${{ inputs.push_node_package }}
run: |
@@ -90,14 +83,6 @@ jobs:
dry-run: ${{ inputs.dry_run }}
tag: ${{ env.NPM_TAG }}
- name: Publish Node package as latest
if: ${{ inputs.push_node_package && inputs.npm_latest_tag }}
uses: JS-DevTools/npm-publish@19c28f1ef146469e409470805ea4279d47c3d35c
with:
token: ${{ secrets.NPM_TOKEN }}
package: tfhe/pkg/package.json
dry-run: ${{ inputs.dry_run }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true

View File

@@ -29,7 +29,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -112,7 +112,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -62,7 +62,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -171,7 +171,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -65,7 +65,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -82,7 +82,7 @@ jobs:
needs: [ should-run, setup-instance ]
concurrency:
group: ${{ github.workflow }}_${{github.event_name}}_${{ github.ref }}
cancel-in-progress: true
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs repo with tags
@@ -182,7 +182,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -272,6 +272,11 @@ void cuda_propagate_single_carry_kb_64_inplace(
void *carry_out, int8_t *mem_ptr, void **bsks, void **ksks,
uint32_t num_blocks);
void cuda_propagate_single_carry_get_input_carries_kb_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
void *carry_out, void *input_carries, int8_t *mem_ptr, void **bsks,
void **ksks, uint32_t num_blocks);
void cleanup_cuda_propagate_single_carry(void **streams, uint32_t *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);

View File

@@ -294,9 +294,6 @@ template <class params> __device__ void NSMFFT_direct(double2 *A) {
__syncthreads();
}
// compressed size = 8192 is actual polynomial size = 16384.
// from this size, twiddles can't fit in constant memory,
// so from here, butterfly operation access device memory.
if constexpr (params::degree >= 8192) {
// level 13
tid = threadIdx.x;
@@ -307,7 +304,7 @@ template <class params> __device__ void NSMFFT_direct(double2 *A) {
(tid & (params::degree / 8192 - 1));
i2 = i1 + params::degree / 8192;
w = negtwiddles13[twid_id];
w = negtwiddles[twid_id + 4096];
u = A[i1];
v = A[i2] * w;
@@ -351,10 +348,6 @@ template <class params> __device__ void NSMFFT_inverse(double2 *A) {
// mapping in backward fft is reversed
// butterfly operation is started from last level
// compressed size = 8192 is actual polynomial size = 16384.
// twiddles for this size can't fit in constant memory so
// butterfly operation for this level access device memory to fetch
// twiddles
if constexpr (params::degree >= 8192) {
// level 13
tid = threadIdx.x;
@@ -365,7 +358,7 @@ template <class params> __device__ void NSMFFT_inverse(double2 *A) {
(tid & (params::degree / 8192 - 1));
i2 = i1 + params::degree / 8192;
w = negtwiddles13[twid_id];
w = negtwiddles[twid_id + 4096];
u = A[i1] - A[i2];
A[i1] += A[i2];
@@ -722,4 +715,312 @@ __global__ void batch_polynomial_mul(double2 *d_input1, double2 *d_input2,
}
}
template <class params> __device__ void NSMFFT_direct_bundle(double2 *A, const double2 regs[4]) {
/* We don't make bit reverse here, since twiddles are already reversed
* Each thread is always in charge of "opt/2" pairs of coefficients,
* which is why we always loop through N/2 by N/opt strides
* The pragma unroll instruction tells the compiler to unroll the
* full loop, which should increase performance
*/
size_t tid = threadIdx.x;
size_t twid_id;
size_t i1, i2;
double2 u, v, w;
// level 1
// we don't make actual complex multiplication on level1 since we have only
// one twiddle, it's real and image parts are equal, so we can multiply
// it with simpler operations
// degree = 1024, opt = 2 ->
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
i1 = tid;
i2 = tid + params::degree / 2;
//u = A[i1];
//v = A[i2] * (double2){0.707106781186547461715008466854,
// 0.707106781186547461715008466854};
u = regs[i];
v = regs[i + params::opt / 2] * (double2){0.707106781186547461715008466854,
0.707106781186547461715008466854};
//A[i1] += v;
A[i1] = u + v;
A[i2] = u - v;
tid += params::degree / params::opt; //256
}
__syncthreads();
// level 2
// from this level there are more than one twiddles and none of them has equal
// real and imag parts, so complete complex multiplication is needed
// for each level params::degree / 2^level represents number of coefficients
// inside divided chunk of specific level
//
#pragma unroll
for (int i = params::opt / 2 - 1; i >= 0 ; --i) {
tid = threadIdx.x + i * params::degree / params::opt;
twid_id = tid / (params::degree / 4);
i1 = 2 * (params::degree / 4) * twid_id + (tid & (params::degree / 4 - 1));
i2 = i1 + params::degree / 4;
w = negtwiddles[twid_id + 2];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
}
__syncthreads();
// level 3
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 8);
i1 = 2 * (params::degree / 8) * twid_id + (tid & (params::degree / 8 - 1));
i2 = i1 + params::degree / 8;
w = negtwiddles[twid_id + 4];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
// level 4
//tid = threadIdx.x;
//for (size_t i = 0; i < params::opt / 2; ++i) {
#pragma unroll
for (int i = params::opt / 2 - 1; i >= 0 ; --i) {
tid = threadIdx.x + i * params::degree / params::opt;
twid_id = tid / (params::degree / 16);
i1 =
2 * (params::degree / 16) * twid_id + (tid & (params::degree / 16 - 1));
i2 = i1 + params::degree / 16;
w = negtwiddles[twid_id + 8];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
//tid += params::degree / params::opt;
}
__syncthreads();
// level 5
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 32);
i1 =
2 * (params::degree / 32) * twid_id + (tid & (params::degree / 32 - 1));
i2 = i1 + params::degree / 32;
w = negtwiddles[twid_id + 16];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
// level 6
//tid = threadIdx.x;
//for (size_t i = 0; i < params::opt / 2; ++i) {
#pragma unroll
for (int i = params::opt / 2 - 1; i >= 0 ; --i) {
tid = threadIdx.x + i * params::degree / params::opt;
twid_id = tid / (params::degree / 64);
i1 =
2 * (params::degree / 64) * twid_id + (tid & (params::degree / 64 - 1));
i2 = i1 + params::degree / 64;
w = negtwiddles[twid_id + 32];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
//tid += params::degree / params::opt;
}
__syncthreads();
// level 7
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 128);
i1 = 2 * (params::degree / 128) * twid_id +
(tid & (params::degree / 128 - 1));
i2 = i1 + params::degree / 128;
w = negtwiddles[twid_id + 64];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
// from level 8, we need to check size of params degree, because we support
// minimum actual polynomial size = 256, when compressed size is halfed and
// minimum supported compressed size is 128, so we always need first 7
// levels of butterfly operation, since butterfly levels are hardcoded
// we need to check if polynomial size is big enough to require specific level
// of butterfly.
if constexpr (params::degree >= 256) {
// level 8
//tid = threadIdx.x;
//for (size_t i = 0; i < params::opt / 2; ++i) {
#pragma unroll
for (int i = params::opt / 2 - 1; i >= 0 ; --i) {
tid = threadIdx.x + i * params::degree / params::opt;
twid_id = tid / (params::degree / 256);
i1 = 2 * (params::degree / 256) * twid_id +
(tid & (params::degree / 256 - 1));
i2 = i1 + params::degree / 256;
w = negtwiddles[twid_id + 128];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
//tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 512) {
// level 9
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 512);
i1 = 2 * (params::degree / 512) * twid_id +
(tid & (params::degree / 512 - 1));
i2 = i1 + params::degree / 512;
w = negtwiddles[twid_id + 256];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 1024) {
// level 10
//tid = threadIdx.x;
//for (size_t i = 0; i < params::opt / 2; ++i) {
#pragma unroll
for (int i = params::opt / 2 - 1; i >= 0 ; --i) {
tid = threadIdx.x + i * params::degree / params::opt;
twid_id = tid / (params::degree / 1024);
i1 = 2 * (params::degree / 1024) * twid_id +
(tid & (params::degree / 1024 - 1));
i2 = i1 + params::degree / 1024;
w = negtwiddles[twid_id + 512];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
//tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 2048) {
// level 11
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 2048);
i1 = 2 * (params::degree / 2048) * twid_id +
(tid & (params::degree / 2048 - 1));
i2 = i1 + params::degree / 2048;
w = negtwiddles[twid_id + 1024];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 4096) {
// level 12
//tid = threadIdx.x;
//for (size_t i = 0; i < params::opt / 2; ++i) {
#pragma unroll
for (int i = params::opt / 2 - 1; i >= 0 ; --i) {
tid = threadIdx.x + i * params::degree / params::opt;
twid_id = tid / (params::degree / 4096);
i1 = 2 * (params::degree / 4096) * twid_id +
(tid & (params::degree / 4096 - 1));
i2 = i1 + params::degree / 4096;
w = negtwiddles[twid_id + 2048];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
//tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 8192) {
// level 13
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 8192);
i1 = 2 * (params::degree / 8192) * twid_id +
(tid & (params::degree / 8192 - 1));
i2 = i1 + params::degree / 8192;
w = negtwiddles[twid_id + 4096];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
}
}
#endif // GPU_BOOTSTRAP_FFT_CUH

View File

@@ -1,6 +1,6 @@
#include "cuComplex.h"
__constant__ double2 negtwiddles[4096] = {
__device__ double2 negtwiddles[8192] = {
{0, 0},
{0.707106781186547461715008466854, 0.707106781186547572737310929369},
{0.92387953251128673848313610506, 0.382683432365089781779232680492},
@@ -4096,9 +4096,7 @@ __constant__ double2 negtwiddles[4096] = {
{0.70791982920081630847874976098, 0.706292797233758484765075991163},
{-0.706292797233758484765075991163, 0.70791982920081630847874976098},
{0.00115048533711384847431913325266, 0.99999933819152553304832053982},
{-0.99999933819152553304832053982, 0.00115048533711384847431913325266}};
__device__ double2 negtwiddles13[4096] = {
{-0.99999933819152553304832053982, 0.00115048533711384847431913325266},
{0.999999981616429334252416083473, 0.000191747597310703291528452552051},
{-0.000191747597310703291528452552051, 0.999999981616429334252416083473},
{0.706971182161065359039753275283, 0.707242354213734603085583785287},

View File

@@ -2,12 +2,7 @@
#define GPU_BOOTSTRAP_TWIDDLES_CUH
/*
* 'negtwiddles' are stored in constant memory for faster access times
* because of it's limited size, only twiddles for up to 2^12 polynomial size
* can be stored there, twiddles for 2^13 are stored in device memory
* 'negtwiddles13'
* 'negtwiddles' are stored in device memory to profit caching
*/
extern __constant__ double2 negtwiddles[4096];
extern __device__ double2 negtwiddles13[4096];
extern __device__ double2 negtwiddles[8192];
#endif

View File

@@ -68,6 +68,18 @@ void cuda_propagate_single_carry_kb_64_inplace(
host_propagate_single_carry<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(carry_out),
nullptr, (int_sc_prop_memory<uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks), num_blocks);
}
void cuda_propagate_single_carry_get_input_carries_kb_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
void *carry_out, void *input_carries, int8_t *mem_ptr, void **bsks,
void **ksks, uint32_t num_blocks) {
host_propagate_single_carry<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(carry_out),
static_cast<uint64_t *>(input_carries),
(int_sc_prop_memory<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
num_blocks);
}

View File

@@ -427,7 +427,7 @@ void scratch_cuda_propagate_single_carry_kb_inplace(
template <typename Torus>
void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array,
Torus *carry_out,
Torus *carry_out, Torus *input_carries,
int_sc_prop_memory<Torus> *mem, void **bsks,
Torus **ksks, uint32_t num_blocks) {
auto params = mem->params;
@@ -482,6 +482,12 @@ void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,
cuda_memset_async(step_output, 0, big_lwe_size_bytes, streams[0],
gpu_indexes[0]);
if (input_carries != nullptr) {
cuda_memcpy_async_gpu_to_gpu(input_carries, step_output,
big_lwe_size_bytes * num_blocks, streams[0],
gpu_indexes[0]);
}
host_addition(streams[0], gpu_indexes[0], lwe_array, lwe_array, step_output,
glwe_dimension * polynomial_size, num_blocks);

View File

@@ -368,8 +368,8 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
num_blocks);
host_propagate_single_carry<Torus>(streams, gpu_indexes, gpu_count,
radix_lwe_out, nullptr, mem_ptr->scp_mem,
bsks, ksks, num_blocks);
radix_lwe_out, nullptr, nullptr,
mem_ptr->scp_mem, bsks, ksks, num_blocks);
}
template <typename Torus, typename STorus, class params>

View File

@@ -84,6 +84,7 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
grouping_factor, 2 * polynomial_size, glwe_dimension, level_count);
Torus *bsk_poly = bsk_slice + poly_id * params::degree;
// opt = 8 degree/opt = 256
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
bsk_poly, accumulator);
@@ -114,6 +115,7 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
// Move accumulator to local memory
double2 temp[params::opt / 2];
int tid = threadIdx.x;
//opt = 8 degree=2048 degree/opt =256
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
temp[i].x = __ll2double_rn((int64_t)accumulator[tid]);
@@ -123,17 +125,22 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
temp[i].y /= (double)std::numeric_limits<Torus>::max();
tid += params::degree / params::opt;
}
/*
synchronize_threads_in_block();
// Move from local memory back to shared memory but as complex
tid = threadIdx.x;
//Loop for 4 times ... temp[4]
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
fft[tid] = temp[i];
tid += params::degree / params::opt;
tid += params::degree / params::opt; // degree 2048 opt 8 degree/opt = 256
}
synchronize_threads_in_block();
NSMFFT_direct<HalfDegree<params>>(fft);
*/
NSMFFT_direct_bundle<HalfDegree<params>>(fft, temp);
// lwe iteration
auto keybundle_out = get_ith_mask_kth_block(

View File

@@ -1068,6 +1068,19 @@ extern "C" {
num_blocks: u32,
);
pub fn cuda_propagate_single_carry_get_input_carries_kb_64_inplace(
streams: *const *mut c_void,
gpu_indexes: *const u32,
gpu_count: u32,
radix_lwe: *mut c_void,
carry_out: *mut c_void,
input_carries: *mut c_void,
mem_ptr: *mut i8,
bsks: *const *mut c_void,
ksks: *const *mut c_void,
num_blocks: u32,
);
pub fn cleanup_cuda_propagate_single_carry(
streams: *const *mut c_void,
gpu_indexes: *const u32,

View File

@@ -13,9 +13,9 @@ description = "tfhe-zk-pok: An implementation of zero-knowledge proofs of encryp
[dependencies]
ark-bls12-381 = { package = "tfhe-ark-bls12-381", version = "0.4.0" }
ark-ec = { package = "tfhe-ark-ec", version = "0.4.2" }
ark-ff = { package = "tfhe-ark-ff", version = "0.4.3" }
ark-poly = { package = "tfhe-ark-poly", version = "0.4.2" }
ark-ec = { package = "tfhe-ark-ec", version = "0.4.2", features = ["parallel"] }
ark-ff = { package = "tfhe-ark-ff", version = "0.4.3", features = ["parallel"] }
ark-poly = { package = "tfhe-ark-poly", version = "0.4.2", features = ["parallel"] }
ark-serialize = { version = "0.4.2" }
rand = "0.8.5"
rayon = "1.8.0"

View File

@@ -242,6 +242,96 @@ mod g2 {
.unwrap(),
}
}
// m is an intermediate variable that's used in both the curve point addition and pairing
// functions. we cache it since it requires a Zp division
// https://hackmd.io/@tazAymRSQCGXTUKkbh1BAg/Sk27liTW9#Math-Formula-for-Point-Addition
pub(crate) fn compute_m(self, other: G2Affine) -> Option<crate::curve_446::Fq2> {
let zero = crate::curve_446::Fq2::ZERO;
// in the context of elliptic curves, the point at infinity is the zero element of the
// group
if self.inner.infinity || other.inner.infinity {
return None;
}
if self == other {
let x = self.inner.x;
let y = self.inner.y;
if y == zero {
None
} else {
let xx = x.square();
Some((xx.double() + xx) / y.double())
}
} else {
let x1 = self.inner.x;
let y1 = self.inner.y;
let x2 = other.inner.x;
let y2 = other.inner.y;
let x_delta = x2 - x1;
let y_delta = y2 - y1;
if x_delta == zero {
None
} else {
Some(y_delta / x_delta)
}
}
}
pub(crate) fn double(self, m: Option<crate::curve_446::Fq2>) -> Self {
// in the context of elliptic curves, the point at infinity is the zero element of the
// group
if self.inner.infinity {
return self;
}
let mut result = self;
let x = self.inner.x;
let y = self.inner.y;
if let Some(m) = m {
let x3 = m.square() - x.double();
let y3 = m * (x - x3) - y;
(result.inner.x, result.inner.y) = (x3, y3);
} else {
result.inner.infinity = true;
}
result
}
pub(crate) fn add_unequal(self, other: G2Affine, m: Option<crate::curve_446::Fq2>) -> Self {
// in the context of elliptic curves, the point at infinity is the zero element of the
// group
if self.inner.infinity {
return other;
}
if other.inner.infinity {
return self;
}
let mut result = self;
let x1 = self.inner.x;
let y1 = self.inner.y;
let x2 = other.inner.x;
if let Some(m) = m {
let x3 = m.square() - x1 - x2;
let y3 = m * (x1 - x3) - y1;
(result.inner.x, result.inner.y) = (x3, y3);
} else {
result.inner.infinity = true;
}
result
}
}
#[derive(
@@ -373,9 +463,9 @@ mod g2 {
}
pub fn double(self) -> Self {
Self {
inner: self.inner.double(),
}
let mut this = self;
this.inner.double_in_place();
this
}
}
@@ -431,51 +521,79 @@ mod g2 {
}
mod gt {
use crate::curve_446::{Fq, Fq12, Fq2};
use super::*;
use ark_ec::bls12::Bls12Config;
use ark_ec::pairing::{MillerLoopOutput, Pairing};
use ark_ff::{CubicExtField, Fp12, Fp2, QuadExtField};
use ark_ff::{CubicExtField, QuadExtField};
type Bls = crate::curve_446::Bls12_446;
type Config = crate::curve_446::Config;
const ONE: Fp2<<Config as Bls12Config>::Fp2Config> = QuadExtField {
c0: MontFp!("1"),
c1: MontFp!("0"),
};
const ZERO: Fp2<<Config as Bls12Config>::Fp2Config> = QuadExtField {
const ZERO: Fq2 = QuadExtField {
c0: MontFp!("0"),
c1: MontFp!("0"),
};
const U1: Fp12<<Config as Bls12Config>::Fp12Config> = QuadExtField {
c0: CubicExtField {
c0: ZERO,
c1: ZERO,
c2: ZERO,
},
c1: CubicExtField {
c0: ONE,
c1: ZERO,
c2: ZERO,
},
// computed by copying the result from
// let two: Fq = MontFp!("2"); println!("{}", two.inverse().unwrap()), which we can't compute in
// a const context;
const TWO_INV: Fq = {
MontFp!("86412351771428577990035638289747981121746346761394949218917418178192828331138736448451251370148591845087981000773214233672031082665302")
};
const U3: Fp12<<Config as Bls12Config>::Fp12Config> = QuadExtField {
c0: CubicExtField {
c0: ZERO,
c1: ZERO,
c2: ZERO,
},
c1: CubicExtField {
c0: ZERO,
c1: ONE,
c2: ZERO,
},
const TWO_INV_MINUS_1: Fq = {
MontFp!("86412351771428577990035638289747981121746346761394949218917418178192828331138736448451251370148591845087981000773214233672031082665301")
};
const fn fp2_to_fp12(
x: Fp2<<Config as Bls12Config>::Fp2Config>,
) -> Fp12<<Config as Bls12Config>::Fp12Config> {
// the only non zero value in inv(U1) and inv(U3), which come from Olivier's equations.
const C: Fq2 = QuadExtField {
c0: TWO_INV,
c1: TWO_INV_MINUS_1,
};
fn fp2_mul_c(x: Fq2) -> Fq2 {
let x0_c0 = x.c0 * C.c0;
let x1_c0 = x.c1 * C.c0;
let x0_c1 = x0_c0 - x.c0;
let x1_c1 = x1_c0 - x.c1;
QuadExtField {
c0: x0_c0 - x1_c1,
c1: x0_c1 + x1_c0,
}
}
fn fp2_mul_u1_inv(x: Fq2) -> Fq12 {
QuadExtField {
c0: CubicExtField {
c0: ZERO,
c1: ZERO,
c2: ZERO,
},
c1: CubicExtField {
c0: ZERO,
c1: ZERO,
c2: fp2_mul_c(x),
},
}
}
fn fp2_mul_u3_inv(x: Fq2) -> Fq12 {
QuadExtField {
c0: CubicExtField {
c0: ZERO,
c1: ZERO,
c2: ZERO,
},
c1: CubicExtField {
c0: ZERO,
c1: fp2_mul_c(x),
c2: ZERO,
},
}
}
const fn fp2_to_fp12(x: Fq2) -> Fq12 {
QuadExtField {
c0: CubicExtField {
c0: x,
@@ -490,52 +608,59 @@ mod gt {
}
}
const fn fp_to_fp12(
x: <Config as Bls12Config>::Fp,
) -> Fp12<<Config as Bls12Config>::Fp12Config> {
fp2_to_fp12(QuadExtField {
const fn fp_to_fp2(x: Fq) -> Fq2 {
QuadExtField {
c0: x,
c1: MontFp!("0"),
})
}
}
fn ate_tangent_ev(qt: G2, evpt: G1) -> Fp12<<Config as Bls12Config>::Fp12Config> {
let qt = qt.inner.into_affine();
let evpt = evpt.inner.into_affine();
const fn fp_to_fp12(x: Fq) -> Fq12 {
fp2_to_fp12(fp_to_fp2(x))
}
fn ate_tangent_ev(qt: G2Affine, evpt: G1Affine, m: Fq2) -> Fq12 {
let qt = qt.inner;
let evpt = evpt.inner;
let (xt, yt) = (qt.x, qt.y);
let (xe, ye) = (evpt.x, evpt.y);
let xt = fp2_to_fp12(xt);
let yt = fp2_to_fp12(yt);
let xe = fp_to_fp12(xe);
let ye = fp_to_fp12(ye);
let l = m;
let mut l_xe = l;
l_xe.c0 *= xe;
l_xe.c1 *= xe;
let three = fp_to_fp12(MontFp!("3"));
let two = fp_to_fp12(MontFp!("2"));
let mut r0 = fp_to_fp12(ye);
let r1 = fp2_mul_u1_inv(l_xe);
let r2 = fp2_mul_u3_inv(l * xt - yt);
let l = three * xt.square() / (two * yt);
ye - (l * xe) / U1 + (l * xt - yt) / U3
r0.c1.c1 = r2.c1.c1;
r0.c1.c2 = -r1.c1.c2;
r0
}
fn ate_line_ev(q1: G2, q2: G2, evpt: G1) -> Fp12<<Config as Bls12Config>::Fp12Config> {
let q1 = q1.inner.into_affine();
let q2 = q2.inner.into_affine();
let evpt = evpt.inner.into_affine();
fn ate_line_ev(q1: G2Affine, evpt: G1Affine, m: Fq2) -> Fq12 {
let q1 = q1.inner;
let evpt = evpt.inner;
let (x1, y1) = (q1.x, q1.y);
let (x2, y2) = (q2.x, q2.y);
let (xe, ye) = (evpt.x, evpt.y);
let x1 = fp2_to_fp12(x1);
let y1 = fp2_to_fp12(y1);
let x2 = fp2_to_fp12(x2);
let y2 = fp2_to_fp12(y2);
let xe = fp_to_fp12(xe);
let ye = fp_to_fp12(ye);
let l = m;
let mut l_xe = l;
l_xe.c0 *= xe;
l_xe.c1 *= xe;
let l = (y2 - y1) / (x2 - x1);
ye - (l * xe) / U1 + (l * x1 - y1) / U3
let mut r0 = fp_to_fp12(ye);
let r1 = fp2_mul_u1_inv(l * fp_to_fp2(xe));
let r2 = fp2_mul_u3_inv(l * x1 - y1);
r0.c1.c1 = r2.c1.c1;
r0.c1.c2 = -r1.c1.c2;
r0
}
#[allow(clippy::needless_range_loop)]
@@ -544,22 +669,24 @@ mod gt {
let t_bits = b"110000000001000001000000100000000000000000000000000000000100000000000000001";
let mut fk = fp_to_fp12(MontFp!("1"));
let p = p.normalize();
let q = q.normalize();
let mut qk = q;
for k in 1..t_log2 {
let lkk = ate_tangent_ev(qk, p);
qk = qk + qk;
let m = qk.compute_m(qk).unwrap();
let lkk = ate_tangent_ev(qk, p, m);
qk = qk.double(Some(m));
fk = fk.square() * lkk;
if t_bits[k] == b'1' {
assert_ne!(q, qk);
let lkp1 = if q != -qk {
ate_line_ev(q, qk, p)
} else {
fp_to_fp12(MontFp!("1"))
};
qk += q;
fk *= lkp1;
let m = q.compute_m(qk);
let new_qk = q.add_unequal(qk, m);
if !new_qk.inner.infinity {
fk *= ate_line_ev(q, p, m.unwrap());
}
qk = new_qk;
}
}
let mlo = MillerLoopOutput(fk);

View File

@@ -1720,7 +1720,7 @@ mod cuda {
define_cuda_server_key_bench_clean_input_scalar_fn!(
method_name: unchecked_unsigned_overflowing_scalar_add,
display_name: unsigned_overflowing_scalar_add,
display_name: unsigned_overflowing_add,
rng_func: default_scalar
);
@@ -1981,7 +1981,7 @@ mod cuda {
define_cuda_server_key_bench_clean_input_scalar_fn!(
method_name: unsigned_overflowing_scalar_add,
display_name: overflowing_scalar_add,
display_name: overflowing_add,
rng_func: default_scalar
);

View File

@@ -553,7 +553,7 @@ define_server_key_bench_fn!(
);
define_server_key_bench_fn!(
method_name: greater,
display_name: greater,
display_name: greater_than,
BenchParamsSet::Standard
);
define_server_key_bench_fn!(
@@ -563,7 +563,7 @@ define_server_key_bench_fn!(
);
define_server_key_bench_fn!(
method_name: less,
display_name: less,
display_name: less_than,
BenchParamsSet::Standard
);
define_server_key_bench_fn!(
@@ -676,7 +676,7 @@ define_server_key_scalar_div_bench_fn!(
);
define_server_key_scalar_bench_fn!(
method_name: scalar_greater,
display_name: greater,
display_name: greater_than,
BenchParamsSet::Standard
);
define_server_key_scalar_bench_fn!(
@@ -686,7 +686,7 @@ define_server_key_scalar_bench_fn!(
);
define_server_key_scalar_bench_fn!(
method_name: scalar_less,
display_name: less,
display_name: less_than,
BenchParamsSet::Standard
);
define_server_key_scalar_bench_fn!(

View File

@@ -4,7 +4,7 @@ use rayon::prelude::*;
use tfhe::shortint::prelude::*;
pub fn pack_cast_64(c: &mut Criterion) {
let bench_name = "pack_cast_64";
let bench_name = "shortint::pack_cast_64";
let mut bench_group = c.benchmark_group(bench_name);
let (client_key_1, server_key_1): (ClientKey, ServerKey) =
@@ -55,7 +55,7 @@ pub fn pack_cast_64(c: &mut Criterion) {
}
pub fn pack_cast(c: &mut Criterion) {
let bench_name = "pack_cast";
let bench_name = "shortint::pack_cast";
let mut bench_group = c.benchmark_group(bench_name);
let (client_key_1, server_key_1): (ClientKey, ServerKey) =
@@ -96,7 +96,7 @@ pub fn pack_cast(c: &mut Criterion) {
}
pub fn cast(c: &mut Criterion) {
let bench_name = "cast";
let bench_name = "shortint::cast";
let mut bench_group = c.benchmark_group(bench_name);
let (client_key_1, server_key_1): (ClientKey, ServerKey) =

View File

@@ -2,7 +2,7 @@
This document explains how to save and load versioned data using the data versioning feature.
Starting from v0.7.0, **TFHE-rs** supports versioned data types. This allows you to store data and load it in the future without compatibility concerns. This feature is done by the `tfhe-versionable` crate.
Starting from v0.6.4, **TFHE-rs** supports versioned data types. This allows you to store data and load it in the future without compatibility concerns. This feature is done by the `tfhe-versionable` crate.
This versioning scheme is compatible with all the [data formats](https://serde.rs/#data-formats) supported by serde.
@@ -85,7 +85,7 @@ When possible, data will be upgraded automatically without any kind of interract
You will find below a list of breaking changes and how to upgrade them.
# 0.6 -> 0.7
- `crate::integer::ciphertext::CompactCiphertextList`
- `tfhe::integer::ciphertext::CompactCiphertextList`:
in 0.6, these lists of ciphertext were statically typed and homogenous. Since 0.7, they are heterogeneous. The new version stores for each element an information about its type (Signed, Unsigned or Boolean). Since this information were not stored before, the list is set to be made of `Unsigned` integers by default. If that is not the case, you can set its type using the following snippet:
```rust
@@ -142,3 +142,12 @@ pub fn main() {
assert_eq!(-1i8, decrypted);
}
```
- `tfhe::{CompactFheInt, CompactFheUint, CompactFheIntList, CompactFheUintList}`:
The types have been deprecated, they are only kept in **TFHE-rs** for backward compatibility. They can now be accessed using the `tfhe::high_level_api::backward_compatibility::integers` module. The only functionality that is still supported is to unversionize them and expand them into regular `FheInt`, `FheUint`, `Vec<FehInt>` and `Vec<FheUint>`:
```Rust
let loaded_ct = CompactFheUint8::unversionize(versioned_ct).unwrap();
let ct = loaded_ct.expand();
```
Starting with v0.7, this compact list functionality is provided by the `tfhe::CompactCiphertextList` type.

View File

@@ -933,6 +933,91 @@ pub unsafe fn propagate_single_carry_assign_async<T: UnsignedInteger, B: Numeric
);
}
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function
/// as soon as synchronization is required
pub unsafe fn propagate_single_carry_get_input_carries_assign_async<
T: UnsignedInteger,
B: Numeric,
>(
streams: &CudaStreams,
radix_lwe_input: &mut CudaVec<T>,
carry_out: &mut CudaVec<T>,
input_carries: &mut CudaVec<T>,
bootstrapping_key: &CudaVec<B>,
keyswitch_key: &CudaVec<T>,
lwe_dimension: LweDimension,
glwe_dimension: GlweDimension,
polynomial_size: PolynomialSize,
ks_level: DecompositionLevelCount,
ks_base_log: DecompositionBaseLog,
pbs_level: DecompositionLevelCount,
pbs_base_log: DecompositionBaseLog,
num_blocks: u32,
message_modulus: MessageModulus,
carry_modulus: CarryModulus,
pbs_type: PBSType,
grouping_factor: LweBskGroupingFactor,
) {
assert_eq!(
streams.gpu_indexes[0],
radix_lwe_input.gpu_index(0),
"GPU error: all data should reside on the same GPU."
);
assert_eq!(
streams.gpu_indexes[0],
bootstrapping_key.gpu_index(0),
"GPU error: all data should reside on the same GPU."
);
assert_eq!(
streams.gpu_indexes[0],
keyswitch_key.gpu_index(0),
"GPU error: all data should reside on the same GPU."
);
let mut mem_ptr: *mut i8 = std::ptr::null_mut();
let big_lwe_dimension: u32 = glwe_dimension.0 as u32 * polynomial_size.0 as u32;
scratch_cuda_propagate_single_carry_kb_64_inplace(
streams.ptr.as_ptr(),
streams.gpu_indexes.as_ptr(),
streams.len() as u32,
std::ptr::addr_of_mut!(mem_ptr),
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
big_lwe_dimension,
lwe_dimension.0 as u32,
ks_level.0 as u32,
ks_base_log.0 as u32,
pbs_level.0 as u32,
pbs_base_log.0 as u32,
grouping_factor.0 as u32,
num_blocks,
message_modulus.0 as u32,
carry_modulus.0 as u32,
pbs_type as u32,
true,
);
cuda_propagate_single_carry_get_input_carries_kb_64_inplace(
streams.ptr.as_ptr(),
streams.gpu_indexes.as_ptr(),
streams.len() as u32,
radix_lwe_input.as_mut_c_ptr(0),
carry_out.as_mut_c_ptr(0),
input_carries.as_mut_c_ptr(0),
mem_ptr,
bootstrapping_key.ptr.as_ptr(),
keyswitch_key.ptr.as_ptr(),
num_blocks,
);
cleanup_cuda_propagate_single_carry(
streams.ptr.as_ptr(),
streams.gpu_indexes.as_ptr(),
streams.len() as u32,
std::ptr::addr_of_mut!(mem_ptr),
);
}
#[allow(clippy::too_many_arguments)]
/// # Safety
///

View File

@@ -29,24 +29,24 @@ impl CudaServerKey {
/// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS;
///
/// let gpu_index = 0;
/// let mut stream = CudaStreams::new_single_gpu(gpu_index);
/// let mut streams = CudaStreams::new_single_gpu(gpu_index);
///
/// // We have 4 * 2 = 8 bits of message
/// let size = 4;
/// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut stream);
/// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut streams);
///
/// let msg = 1u64;
///
/// let ct = cks.encrypt(msg);
///
/// // Copy to GPU
/// let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &mut stream);
/// let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &mut streams);
///
/// // Compute homomorphically a bitwise and:
/// let d_ct_res = sks.unchecked_bitnot(&d_ct, &mut stream);
/// let d_ct_res = sks.unchecked_bitnot(&d_ct, &mut streams);
///
/// // Copy back to CPU
/// let ct_res = d_ct_res.to_radix_ciphertext(&mut stream);
/// let ct_res = d_ct_res.to_radix_ciphertext(&mut streams);
///
/// // Decrypt:
/// let dec: u64 = cks.decrypt(&ct_res);
@@ -55,24 +55,24 @@ impl CudaServerKey {
pub fn unchecked_bitnot<T: CudaIntegerRadixCiphertext>(
&self,
ct: &T,
stream: &CudaStreams,
streams: &CudaStreams,
) -> T {
let mut result = unsafe { ct.duplicate_async(stream) };
self.unchecked_bitnot_assign(&mut result, stream);
let mut result = unsafe { ct.duplicate_async(streams) };
self.unchecked_bitnot_assign(&mut result, streams);
result
}
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until stream is synchronised
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronised
pub unsafe fn unchecked_bitnot_assign_async<T: CudaIntegerRadixCiphertext>(
&self,
ct: &mut T,
stream: &CudaStreams,
streams: &CudaStreams,
) {
// We do (-ciphertext) + (msg_mod -1) as it allows to avoid an allocation
cuda_lwe_ciphertext_negate_assign(&mut ct.as_mut().d_blocks, stream);
cuda_lwe_ciphertext_negate_assign(&mut ct.as_mut().d_blocks, streams);
let ct_blocks = ct.as_ref().d_blocks.lwe_ciphertext_count().0;
@@ -81,14 +81,21 @@ impl CudaServerKey {
let shift_plaintext = u64::from(scalar) * delta;
let scalar_vector = vec![shift_plaintext; ct_blocks];
let mut d_decomposed_scalar =
CudaVec::<u64>::new_async(ct.as_ref().d_blocks.lwe_ciphertext_count().0, stream, 0);
d_decomposed_scalar.copy_from_cpu_async(scalar_vector.as_slice(), stream, 0);
let mut d_decomposed_scalar = CudaVec::<u64>::new_async(
ct.as_ref().d_blocks.lwe_ciphertext_count().0,
streams,
streams.gpu_indexes[0],
);
d_decomposed_scalar.copy_from_cpu_async(
scalar_vector.as_slice(),
streams,
streams.gpu_indexes[0],
);
cuda_lwe_ciphertext_plaintext_add_assign(
&mut ct.as_mut().d_blocks,
&d_decomposed_scalar,
stream,
streams,
);
ct.as_mut().info = ct.as_ref().info.after_bitnot();
}
@@ -96,12 +103,12 @@ impl CudaServerKey {
pub fn unchecked_bitnot_assign<T: CudaIntegerRadixCiphertext>(
&self,
ct: &mut T,
stream: &CudaStreams,
streams: &CudaStreams,
) {
unsafe {
self.unchecked_bitnot_assign_async(ct, stream);
self.unchecked_bitnot_assign_async(ct, streams);
}
stream.synchronize();
streams.synchronize();
}
/// Computes homomorphically bitand between two ciphertexts encrypting integer values.
@@ -121,11 +128,11 @@ impl CudaServerKey {
/// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS;
///
/// let gpu_index = 0;
/// let mut stream = CudaStreams::new_single_gpu(gpu_index);
/// let mut streams = CudaStreams::new_single_gpu(gpu_index);
///
/// // We have 4 * 2 = 8 bits of message
/// let size = 4;
/// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut stream);
/// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut streams);
///
/// let msg1 = 201u64;
/// let msg2 = 1u64;
@@ -134,14 +141,14 @@ impl CudaServerKey {
/// let ct2 = cks.encrypt(msg2);
///
/// // Copy to GPU
/// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &mut stream);
/// let d_ct2 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct2, &mut stream);
/// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &mut streams);
/// let d_ct2 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct2, &mut streams);
///
/// // Compute homomorphically a bitwise and:
/// let d_ct_res = sks.unchecked_bitand(&d_ct1, &d_ct2, &mut stream);
/// let d_ct_res = sks.unchecked_bitand(&d_ct1, &d_ct2, &mut streams);
///
/// // Copy back to CPU
/// let ct_res = d_ct_res.to_radix_ciphertext(&mut stream);
/// let ct_res = d_ct_res.to_radix_ciphertext(&mut streams);
///
/// // Decrypt:
/// let dec: u64 = cks.decrypt(&ct_res);
@@ -151,23 +158,23 @@ impl CudaServerKey {
&self,
ct_left: &T,
ct_right: &T,
stream: &CudaStreams,
streams: &CudaStreams,
) -> T {
let mut result = unsafe { ct_left.duplicate_async(stream) };
self.unchecked_bitand_assign(&mut result, ct_right, stream);
let mut result = unsafe { ct_left.duplicate_async(streams) };
self.unchecked_bitand_assign(&mut result, ct_right, streams);
result
}
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until stream is synchronised
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronised
pub unsafe fn unchecked_bitop_assign_async<T: CudaIntegerRadixCiphertext>(
&self,
ct_left: &mut T,
ct_right: &T,
op: BitOpType,
stream: &CudaStreams,
streams: &CudaStreams,
) {
assert_eq!(
ct_left.as_ref().d_blocks.lwe_dimension(),
@@ -183,7 +190,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_bitop_integer_radix_kb_assign_async(
stream,
streams,
&mut ct_left.as_mut().d_blocks.0.d_vec,
&ct_right.as_ref().d_blocks.0.d_vec,
&d_bsk.d_vec,
@@ -210,7 +217,7 @@ impl CudaServerKey {
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_bitop_integer_radix_kb_assign_async(
stream,
streams,
&mut ct_left.as_mut().d_blocks.0.d_vec,
&ct_right.as_ref().d_blocks.0.d_vec,
&d_multibit_bsk.d_vec,
@@ -242,13 +249,13 @@ impl CudaServerKey {
&self,
ct_left: &mut T,
ct_right: &T,
stream: &CudaStreams,
streams: &CudaStreams,
) {
unsafe {
self.unchecked_bitop_assign_async(ct_left, ct_right, BitOpType::And, stream);
self.unchecked_bitop_assign_async(ct_left, ct_right, BitOpType::And, streams);
ct_left.as_mut().info = ct_left.as_ref().info.after_bitand(&ct_right.as_ref().info);
}
stream.synchronize();
streams.synchronize();
}
/// Computes homomorphically bitor between two ciphertexts encrypting integer values.
@@ -268,11 +275,11 @@ impl CudaServerKey {
/// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS;
///
/// let gpu_index = 0;
/// let mut stream = CudaStreams::new_single_gpu(gpu_index);
/// let mut streams = CudaStreams::new_single_gpu(gpu_index);
///
/// // We have 4 * 2 = 8 bits of message
/// let size = 4;
/// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut stream);
/// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut streams);
///
/// let msg1 = 200u64;
/// let msg2 = 1u64;
@@ -281,14 +288,14 @@ impl CudaServerKey {
/// let ct2 = cks.encrypt(msg2);
///
/// // Copy to GPU
/// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &mut stream);
/// let d_ct2 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct2, &mut stream);
/// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &mut streams);
/// let d_ct2 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct2, &mut streams);
///
/// // Compute homomorphically a bitwise and:
/// let d_ct_res = sks.unchecked_bitor(&d_ct1, &d_ct2, &mut stream);
/// let d_ct_res = sks.unchecked_bitor(&d_ct1, &d_ct2, &mut streams);
///
/// // Copy back to CPU
/// let ct_res = d_ct_res.to_radix_ciphertext(&mut stream);
/// let ct_res = d_ct_res.to_radix_ciphertext(&mut streams);
///
/// // Decrypt:
/// let dec: u64 = cks.decrypt(&ct_res);
@@ -298,10 +305,10 @@ impl CudaServerKey {
&self,
ct_left: &T,
ct_right: &T,
stream: &CudaStreams,
streams: &CudaStreams,
) -> T {
let mut result = unsafe { ct_left.duplicate_async(stream) };
self.unchecked_bitor_assign(&mut result, ct_right, stream);
let mut result = unsafe { ct_left.duplicate_async(streams) };
self.unchecked_bitor_assign(&mut result, ct_right, streams);
result
}
@@ -309,13 +316,13 @@ impl CudaServerKey {
&self,
ct_left: &mut T,
ct_right: &T,
stream: &CudaStreams,
streams: &CudaStreams,
) {
unsafe {
self.unchecked_bitop_assign_async(ct_left, ct_right, BitOpType::Or, stream);
self.unchecked_bitop_assign_async(ct_left, ct_right, BitOpType::Or, streams);
ct_left.as_mut().info = ct_left.as_ref().info.after_bitor(&ct_right.as_ref().info);
}
stream.synchronize();
streams.synchronize();
}
/// Computes homomorphically bitxor between two ciphertexts encrypting integer values.
@@ -335,11 +342,11 @@ impl CudaServerKey {
/// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS;
///
/// let gpu_index = 0;
/// let mut stream = CudaStreams::new_single_gpu(gpu_index);
/// let mut streams = CudaStreams::new_single_gpu(gpu_index);
///
/// // We have 4 * 2 = 8 bits of message
/// let size = 4;
/// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut stream);
/// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut streams);
///
/// let msg1 = 49;
/// let msg2 = 64;
@@ -348,14 +355,14 @@ impl CudaServerKey {
/// let ct2 = cks.encrypt(msg2);
///
/// // Copy to GPU
/// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &mut stream);
/// let d_ct2 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct2, &mut stream);
/// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &mut streams);
/// let d_ct2 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct2, &mut streams);
///
/// // Compute homomorphically a bitwise and:
/// let d_ct_res = sks.unchecked_bitxor(&d_ct1, &d_ct2, &mut stream);
/// let d_ct_res = sks.unchecked_bitxor(&d_ct1, &d_ct2, &mut streams);
///
/// // Copy back to CPU
/// let ct_res = d_ct_res.to_radix_ciphertext(&mut stream);
/// let ct_res = d_ct_res.to_radix_ciphertext(&mut streams);
///
/// // Decrypt:
/// let dec: u64 = cks.decrypt(&ct_res);
@@ -365,10 +372,10 @@ impl CudaServerKey {
&self,
ct_left: &T,
ct_right: &T,
stream: &CudaStreams,
streams: &CudaStreams,
) -> T {
let mut result = unsafe { ct_left.duplicate_async(stream) };
self.unchecked_bitxor_assign(&mut result, ct_right, stream);
let mut result = unsafe { ct_left.duplicate_async(streams) };
self.unchecked_bitxor_assign(&mut result, ct_right, streams);
result
}
@@ -376,13 +383,13 @@ impl CudaServerKey {
&self,
ct_left: &mut T,
ct_right: &T,
stream: &CudaStreams,
streams: &CudaStreams,
) {
unsafe {
self.unchecked_bitop_assign_async(ct_left, ct_right, BitOpType::Xor, stream);
self.unchecked_bitop_assign_async(ct_left, ct_right, BitOpType::Xor, streams);
ct_left.as_mut().info = ct_left.as_ref().info.after_bitxor(&ct_right.as_ref().info);
}
stream.synchronize();
streams.synchronize();
}
/// Computes homomorphically bitand between two ciphertexts encrypting integer values.
@@ -402,11 +409,11 @@ impl CudaServerKey {
/// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS;
///
/// let gpu_index = 0;
/// let mut stream = CudaStreams::new_single_gpu(gpu_index);
/// let mut streams = CudaStreams::new_single_gpu(gpu_index);
///
/// // We have 4 * 2 = 8 bits of message
/// let size = 4;
/// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut stream);
/// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut streams);
///
/// let msg1 = 201u64;
/// let msg2 = 1u64;
@@ -415,14 +422,14 @@ impl CudaServerKey {
/// let ct2 = cks.encrypt(msg2);
///
/// // Copy to GPU
/// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &mut stream);
/// let d_ct2 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct2, &mut stream);
/// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &mut streams);
/// let d_ct2 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct2, &mut streams);
///
/// // Compute homomorphically a bitwise and:
/// let d_ct_res = sks.bitand(&d_ct1, &d_ct2, &mut stream);
/// let d_ct_res = sks.bitand(&d_ct1, &d_ct2, &mut streams);
///
/// // Copy back to CPU
/// let ct_res = d_ct_res.to_radix_ciphertext(&mut stream);
/// let ct_res = d_ct_res.to_radix_ciphertext(&mut streams);
///
/// // Decrypt:
/// let dec: u64 = cks.decrypt(&ct_res);
@@ -432,22 +439,22 @@ impl CudaServerKey {
&self,
ct_left: &T,
ct_right: &T,
stream: &CudaStreams,
streams: &CudaStreams,
) -> T {
let mut result = unsafe { ct_left.duplicate_async(stream) };
self.bitand_assign(&mut result, ct_right, stream);
let mut result = unsafe { ct_left.duplicate_async(streams) };
self.bitand_assign(&mut result, ct_right, streams);
result
}
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until stream is synchronised
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronised
pub unsafe fn bitand_assign_async<T: CudaIntegerRadixCiphertext>(
&self,
ct_left: &mut T,
ct_right: &T,
stream: &CudaStreams,
streams: &CudaStreams,
) {
let mut tmp_rhs;
@@ -458,36 +465,36 @@ impl CudaServerKey {
) {
(true, true) => (ct_left, ct_right),
(true, false) => {
tmp_rhs = ct_right.duplicate_async(stream);
self.full_propagate_assign_async(&mut tmp_rhs, stream);
tmp_rhs = ct_right.duplicate_async(streams);
self.full_propagate_assign_async(&mut tmp_rhs, streams);
(ct_left, &tmp_rhs)
}
(false, true) => {
self.full_propagate_assign_async(ct_left, stream);
self.full_propagate_assign_async(ct_left, streams);
(ct_left, ct_right)
}
(false, false) => {
tmp_rhs = ct_right.duplicate_async(stream);
tmp_rhs = ct_right.duplicate_async(streams);
self.full_propagate_assign_async(ct_left, stream);
self.full_propagate_assign_async(&mut tmp_rhs, stream);
self.full_propagate_assign_async(ct_left, streams);
self.full_propagate_assign_async(&mut tmp_rhs, streams);
(ct_left, &tmp_rhs)
}
}
};
self.unchecked_bitop_assign_async(lhs, rhs, BitOpType::And, stream);
self.unchecked_bitop_assign_async(lhs, rhs, BitOpType::And, streams);
}
pub fn bitand_assign<T: CudaIntegerRadixCiphertext>(
&self,
ct_left: &mut T,
ct_right: &T,
stream: &CudaStreams,
streams: &CudaStreams,
) {
unsafe {
self.bitand_assign_async(ct_left, ct_right, stream);
self.bitand_assign_async(ct_left, ct_right, streams);
}
stream.synchronize();
streams.synchronize();
}
/// Computes homomorphically bitor between two ciphertexts encrypting integer values.
@@ -507,11 +514,11 @@ impl CudaServerKey {
/// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS;
///
/// let gpu_index = 0;
/// let mut stream = CudaStreams::new_single_gpu(gpu_index);
/// let mut streams = CudaStreams::new_single_gpu(gpu_index);
///
/// // We have 4 * 2 = 8 bits of message
/// let size = 4;
/// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut stream);
/// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut streams);
///
/// let msg1 = 201u64;
/// let msg2 = 1u64;
@@ -520,14 +527,14 @@ impl CudaServerKey {
/// let ct2 = cks.encrypt(msg2);
///
/// // Copy to GPU
/// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &mut stream);
/// let d_ct2 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct2, &mut stream);
/// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &mut streams);
/// let d_ct2 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct2, &mut streams);
///
/// // Compute homomorphically a bitwise and:
/// let d_ct_res = sks.bitor(&d_ct1, &d_ct2, &mut stream);
/// let d_ct_res = sks.bitor(&d_ct1, &d_ct2, &mut streams);
///
/// // Copy back to CPU
/// let ct_res = d_ct_res.to_radix_ciphertext(&mut stream);
/// let ct_res = d_ct_res.to_radix_ciphertext(&mut streams);
///
/// // Decrypt:
/// let dec: u64 = cks.decrypt(&ct_res);
@@ -537,22 +544,22 @@ impl CudaServerKey {
&self,
ct_left: &T,
ct_right: &T,
stream: &CudaStreams,
streams: &CudaStreams,
) -> T {
let mut result = unsafe { ct_left.duplicate_async(stream) };
self.bitor_assign(&mut result, ct_right, stream);
let mut result = unsafe { ct_left.duplicate_async(streams) };
self.bitor_assign(&mut result, ct_right, streams);
result
}
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until stream is synchronised
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronised
pub unsafe fn bitor_assign_async<T: CudaIntegerRadixCiphertext>(
&self,
ct_left: &mut T,
ct_right: &T,
stream: &CudaStreams,
streams: &CudaStreams,
) {
let mut tmp_rhs;
@@ -562,36 +569,36 @@ impl CudaServerKey {
) {
(true, true) => (ct_left, ct_right),
(true, false) => {
tmp_rhs = ct_right.duplicate_async(stream);
self.full_propagate_assign_async(&mut tmp_rhs, stream);
tmp_rhs = ct_right.duplicate_async(streams);
self.full_propagate_assign_async(&mut tmp_rhs, streams);
(ct_left, &tmp_rhs)
}
(false, true) => {
self.full_propagate_assign_async(ct_left, stream);
self.full_propagate_assign_async(ct_left, streams);
(ct_left, ct_right)
}
(false, false) => {
tmp_rhs = ct_right.duplicate_async(stream);
tmp_rhs = ct_right.duplicate_async(streams);
self.full_propagate_assign_async(ct_left, stream);
self.full_propagate_assign_async(&mut tmp_rhs, stream);
self.full_propagate_assign_async(ct_left, streams);
self.full_propagate_assign_async(&mut tmp_rhs, streams);
(ct_left, &tmp_rhs)
}
};
self.unchecked_bitop_assign_async(lhs, rhs, BitOpType::Or, stream);
self.unchecked_bitop_assign_async(lhs, rhs, BitOpType::Or, streams);
}
pub fn bitor_assign<T: CudaIntegerRadixCiphertext>(
&self,
ct_left: &mut T,
ct_right: &T,
stream: &CudaStreams,
streams: &CudaStreams,
) {
unsafe {
self.bitor_assign_async(ct_left, ct_right, stream);
self.bitor_assign_async(ct_left, ct_right, streams);
}
stream.synchronize();
streams.synchronize();
}
/// Computes homomorphically bitxor between two ciphertexts encrypting integer values.
@@ -611,11 +618,11 @@ impl CudaServerKey {
/// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS;
///
/// let gpu_index = 0;
/// let mut stream = CudaStreams::new_single_gpu(gpu_index);
/// let mut streams = CudaStreams::new_single_gpu(gpu_index);
///
/// // We have 4 * 2 = 8 bits of message
/// let size = 4;
/// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut stream);
/// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut streams);
///
/// let msg1 = 201u64;
/// let msg2 = 1u64;
@@ -624,14 +631,14 @@ impl CudaServerKey {
/// let ct2 = cks.encrypt(msg2);
///
/// // Copy to GPU
/// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &mut stream);
/// let d_ct2 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct2, &mut stream);
/// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &mut streams);
/// let d_ct2 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct2, &mut streams);
///
/// // Compute homomorphically a bitwise and:
/// let d_ct_res = sks.bitxor(&d_ct1, &d_ct2, &mut stream);
/// let d_ct_res = sks.bitxor(&d_ct1, &d_ct2, &mut streams);
///
/// // Copy back to CPU
/// let ct_res = d_ct_res.to_radix_ciphertext(&mut stream);
/// let ct_res = d_ct_res.to_radix_ciphertext(&mut streams);
///
/// // Decrypt:
/// let dec: u64 = cks.decrypt(&ct_res);
@@ -641,22 +648,22 @@ impl CudaServerKey {
&self,
ct_left: &T,
ct_right: &T,
stream: &CudaStreams,
streams: &CudaStreams,
) -> T {
let mut result = unsafe { ct_left.duplicate_async(stream) };
self.bitxor_assign(&mut result, ct_right, stream);
let mut result = unsafe { ct_left.duplicate_async(streams) };
self.bitxor_assign(&mut result, ct_right, streams);
result
}
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until stream is synchronised
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronised
pub unsafe fn bitxor_assign_async<T: CudaIntegerRadixCiphertext>(
&self,
ct_left: &mut T,
ct_right: &T,
stream: &CudaStreams,
streams: &CudaStreams,
) {
let mut tmp_rhs;
@@ -666,36 +673,36 @@ impl CudaServerKey {
) {
(true, true) => (ct_left, ct_right),
(true, false) => {
tmp_rhs = ct_right.duplicate_async(stream);
self.full_propagate_assign_async(&mut tmp_rhs, stream);
tmp_rhs = ct_right.duplicate_async(streams);
self.full_propagate_assign_async(&mut tmp_rhs, streams);
(ct_left, &tmp_rhs)
}
(false, true) => {
self.full_propagate_assign_async(ct_left, stream);
self.full_propagate_assign_async(ct_left, streams);
(ct_left, ct_right)
}
(false, false) => {
tmp_rhs = ct_right.duplicate_async(stream);
tmp_rhs = ct_right.duplicate_async(streams);
self.full_propagate_assign_async(ct_left, stream);
self.full_propagate_assign_async(&mut tmp_rhs, stream);
self.full_propagate_assign_async(ct_left, streams);
self.full_propagate_assign_async(&mut tmp_rhs, streams);
(ct_left, &tmp_rhs)
}
};
self.unchecked_bitop_assign_async(lhs, rhs, BitOpType::Xor, stream);
self.unchecked_bitop_assign_async(lhs, rhs, BitOpType::Xor, streams);
}
pub fn bitxor_assign<T: CudaIntegerRadixCiphertext>(
&self,
ct_left: &mut T,
ct_right: &T,
stream: &CudaStreams,
streams: &CudaStreams,
) {
unsafe {
self.bitxor_assign_async(ct_left, ct_right, stream);
self.bitxor_assign_async(ct_left, ct_right, streams);
}
stream.synchronize();
streams.synchronize();
}
/// Computes homomorphically bitnot for an encrypted integer value.
@@ -716,55 +723,55 @@ impl CudaServerKey {
/// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS;
///
/// let gpu_index = 0;
/// let mut stream = CudaStreams::new_single_gpu(gpu_index);
/// let mut streams = CudaStreams::new_single_gpu(gpu_index);
///
/// // We have 4 * 2 = 8 bits of message
/// let size = 4;
/// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut stream);
/// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut streams);
///
/// let msg = 1u64;
///
/// let ct = cks.encrypt(msg);
///
/// // Copy to GPU
/// let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &mut stream);
/// let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &mut streams);
///
/// // Compute homomorphically a bitwise and:
/// let d_ct_res = sks.bitnot(&d_ct, &mut stream);
/// let d_ct_res = sks.bitnot(&d_ct, &mut streams);
///
/// // Copy back to CPU
/// let ct_res = d_ct_res.to_radix_ciphertext(&mut stream);
/// let ct_res = d_ct_res.to_radix_ciphertext(&mut streams);
///
/// // Decrypt:
/// let dec: u64 = cks.decrypt(&ct_res);
/// assert_eq!(dec, !msg % 256);
/// ```
pub fn bitnot<T: CudaIntegerRadixCiphertext>(&self, ct: &T, stream: &CudaStreams) -> T {
let mut result = unsafe { ct.duplicate_async(stream) };
self.bitnot_assign(&mut result, stream);
pub fn bitnot<T: CudaIntegerRadixCiphertext>(&self, ct: &T, streams: &CudaStreams) -> T {
let mut result = unsafe { ct.duplicate_async(streams) };
self.bitnot_assign(&mut result, streams);
result
}
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until stream is synchronised
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronised
pub unsafe fn bitnot_assign_async<T: CudaIntegerRadixCiphertext>(
&self,
ct: &mut T,
stream: &CudaStreams,
streams: &CudaStreams,
) {
if !ct.block_carries_are_empty() {
self.full_propagate_assign_async(ct, stream);
self.full_propagate_assign_async(ct, streams);
}
self.unchecked_bitnot_assign_async(ct, stream);
self.unchecked_bitnot_assign_async(ct, streams);
}
pub fn bitnot_assign<T: CudaIntegerRadixCiphertext>(&self, ct: &mut T, stream: &CudaStreams) {
pub fn bitnot_assign<T: CudaIntegerRadixCiphertext>(&self, ct: &mut T, streams: &CudaStreams) {
unsafe {
self.bitnot_assign_async(ct, stream);
self.bitnot_assign_async(ct, streams);
}
stream.synchronize();
streams.synchronize();
}
}

View File

@@ -14,7 +14,8 @@ use crate::integer::gpu::ciphertext::{
use crate::integer::gpu::server_key::CudaBootstrappingKey;
use crate::integer::gpu::{
apply_univariate_lut_kb_async, full_propagate_assign_async,
propagate_single_carry_assign_async, CudaServerKey, PBSType,
propagate_single_carry_assign_async, propagate_single_carry_get_input_carries_assign_async,
CudaServerKey, PBSType,
};
use crate::shortint::ciphertext::{Degree, NoiseLevel};
use crate::shortint::engine::fill_accumulator;
@@ -224,6 +225,80 @@ impl CudaServerKey {
carry_out
}
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until stream is synchronized
#[allow(dead_code)]
pub(crate) unsafe fn propagate_single_carry_get_input_carries_assign_async<T>(
&self,
ct: &mut T,
input_carries: &mut T,
streams: &CudaStreams,
) -> T
where
T: CudaIntegerRadixCiphertext,
{
let mut carry_out: T = self.create_trivial_zero_radix(1, streams);
let ciphertext = ct.as_mut();
let num_blocks = ciphertext.d_blocks.lwe_ciphertext_count().0 as u32;
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
propagate_single_carry_get_input_carries_assign_async(
streams,
&mut ciphertext.d_blocks.0.d_vec,
&mut carry_out.as_mut().d_blocks.0.d_vec,
&mut input_carries.as_mut().d_blocks.0.d_vec,
&d_bsk.d_vec,
&self.key_switching_key.d_vec,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
num_blocks,
ciphertext.info.blocks.first().unwrap().message_modulus,
ciphertext.info.blocks.first().unwrap().carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
propagate_single_carry_get_input_carries_assign_async(
streams,
&mut ciphertext.d_blocks.0.d_vec,
&mut carry_out.as_mut().d_blocks.0.d_vec,
&mut input_carries.as_mut().d_blocks.0.d_vec,
&d_multibit_bsk.d_vec,
&self.key_switching_key.d_vec,
d_multibit_bsk.input_lwe_dimension(),
d_multibit_bsk.glwe_dimension(),
d_multibit_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_multibit_bsk.decomp_level_count(),
d_multibit_bsk.decomp_base_log(),
num_blocks,
ciphertext.info.blocks.first().unwrap().message_modulus,
ciphertext.info.blocks.first().unwrap().carry_modulus,
PBSType::MultiBit,
d_multibit_bsk.grouping_factor,
);
}
};
ciphertext.info.blocks.iter_mut().for_each(|b| {
b.degree = Degree::new(b.message_modulus.0 - 1);
b.noise_level = NoiseLevel::NOMINAL;
});
carry_out.as_mut().info.blocks.iter_mut().for_each(|b| {
b.degree = Degree::new(1);
b.noise_level = NoiseLevel::NOMINAL;
});
carry_out
}
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
@@ -327,25 +402,26 @@ impl CudaServerKey {
&self,
ct: &T,
num_blocks: usize,
stream: &CudaStreams,
streams: &CudaStreams,
) -> T {
let new_num_blocks = ct.as_ref().d_blocks.lwe_ciphertext_count().0 + num_blocks;
let ciphertext_modulus = ct.as_ref().d_blocks.ciphertext_modulus();
let lwe_size = ct.as_ref().d_blocks.lwe_dimension().to_lwe_size();
let shift = num_blocks * lwe_size.0;
let mut extended_ct_vec =
unsafe { CudaVec::new_async(new_num_blocks * lwe_size.0, stream, 0) };
let mut extended_ct_vec = unsafe {
CudaVec::new_async(new_num_blocks * lwe_size.0, streams, streams.gpu_indexes[0])
};
unsafe {
extended_ct_vec.memset_async(0u64, stream, 0);
extended_ct_vec.memset_async(0u64, streams, streams.gpu_indexes[0]);
extended_ct_vec.copy_self_range_gpu_to_gpu_async(
shift..,
&ct.as_ref().d_blocks.0.d_vec,
stream,
streams,
0,
);
}
stream.synchronize();
streams.synchronize();
let extended_ct_list = CudaLweCiphertextList::from_cuda_vec(
extended_ct_vec,
LweCiphertextCount(new_num_blocks),
@@ -398,19 +474,24 @@ impl CudaServerKey {
&self,
ct: &T,
num_blocks: usize,
stream: &CudaStreams,
streams: &CudaStreams,
) -> T {
let new_num_blocks = ct.as_ref().d_blocks.lwe_ciphertext_count().0 + num_blocks;
let ciphertext_modulus = ct.as_ref().d_blocks.ciphertext_modulus();
let lwe_size = ct.as_ref().d_blocks.lwe_dimension().to_lwe_size();
let mut extended_ct_vec =
unsafe { CudaVec::new_async(new_num_blocks * lwe_size.0, stream, 0) };
let mut extended_ct_vec = unsafe {
CudaVec::new_async(new_num_blocks * lwe_size.0, streams, streams.gpu_indexes[0])
};
unsafe {
extended_ct_vec.memset_async(0u64, stream, 0);
extended_ct_vec.copy_from_gpu_async(&ct.as_ref().d_blocks.0.d_vec, stream, 0);
extended_ct_vec.memset_async(0u64, streams, streams.gpu_indexes[0]);
extended_ct_vec.copy_from_gpu_async(
&ct.as_ref().d_blocks.0.d_vec,
streams,
streams.gpu_indexes[0],
);
}
stream.synchronize();
streams.synchronize();
let extended_ct_list = CudaLweCiphertextList::from_cuda_vec(
extended_ct_vec,
LweCiphertextCount(new_num_blocks),
@@ -463,24 +544,25 @@ impl CudaServerKey {
&self,
ct: &T,
num_blocks: usize,
stream: &CudaStreams,
streams: &CudaStreams,
) -> T {
let new_num_blocks = ct.as_ref().d_blocks.lwe_ciphertext_count().0 - num_blocks;
let ciphertext_modulus = ct.as_ref().d_blocks.ciphertext_modulus();
let lwe_size = ct.as_ref().d_blocks.lwe_dimension().to_lwe_size();
let shift = num_blocks * lwe_size.0;
let mut trimmed_ct_vec =
unsafe { CudaVec::new_async(new_num_blocks * lwe_size.0, stream, 0) };
let mut trimmed_ct_vec = unsafe {
CudaVec::new_async(new_num_blocks * lwe_size.0, streams, streams.gpu_indexes[0])
};
unsafe {
trimmed_ct_vec.copy_src_range_gpu_to_gpu_async(
shift..,
&ct.as_ref().d_blocks.0.d_vec,
stream,
streams,
0,
);
}
stream.synchronize();
streams.synchronize();
let trimmed_ct_list = CudaLweCiphertextList::from_cuda_vec(
trimmed_ct_vec,
LweCiphertextCount(new_num_blocks),
@@ -530,24 +612,25 @@ impl CudaServerKey {
&self,
ct: &T,
num_blocks: usize,
stream: &CudaStreams,
streams: &CudaStreams,
) -> T {
let new_num_blocks = ct.as_ref().d_blocks.lwe_ciphertext_count().0 - num_blocks;
let ciphertext_modulus = ct.as_ref().d_blocks.ciphertext_modulus();
let lwe_size = ct.as_ref().d_blocks.lwe_dimension().to_lwe_size();
let shift = new_num_blocks * lwe_size.0;
let mut trimmed_ct_vec =
unsafe { CudaVec::new_async(new_num_blocks * lwe_size.0, stream, 0) };
let mut trimmed_ct_vec = unsafe {
CudaVec::new_async(new_num_blocks * lwe_size.0, streams, streams.gpu_indexes[0])
};
unsafe {
trimmed_ct_vec.copy_src_range_gpu_to_gpu_async(
0..shift,
&ct.as_ref().d_blocks.0.d_vec,
stream,
streams,
0,
);
}
stream.synchronize();
streams.synchronize();
let trimmed_ct_list = CudaLweCiphertextList::from_cuda_vec(
trimmed_ct_vec,
LweCiphertextCount(new_num_blocks),
@@ -594,7 +677,7 @@ impl CudaServerKey {
&self,
ct: &T,
num_blocks: usize,
stream: &CudaStreams,
streams: &CudaStreams,
) -> T {
let message_modulus = self.message_modulus.0 as u64;
let num_bits_in_block = message_modulus.ilog2();
@@ -612,28 +695,40 @@ impl CudaServerKey {
let lwe_size = ct.as_ref().d_blocks.0.lwe_dimension.to_lwe_size().0;
// Allocate the necessary amount of memory
let mut output_radix = CudaVec::new(new_num_ct_blocks * lwe_size, stream, 0);
let mut output_radix = CudaVec::new(
new_num_ct_blocks * lwe_size,
streams,
streams.gpu_indexes[0],
);
unsafe {
output_radix.copy_from_gpu_async(&ct.as_ref().d_blocks.0.d_vec, stream, 0);
output_radix.copy_from_gpu_async(
&ct.as_ref().d_blocks.0.d_vec,
streams,
streams.gpu_indexes[0],
);
// Get the last ct block
let last_block = ct
.as_ref()
.d_blocks
.0
.d_vec
.as_slice(lwe_size * (num_ct_blocks - 1).., 0)
.as_slice(lwe_size * (num_ct_blocks - 1).., streams.gpu_indexes[0])
.unwrap();
let mut output_slice = output_radix
.as_mut_slice(lwe_size * num_ct_blocks..lwe_size * new_num_ct_blocks, 0)
.as_mut_slice(
lwe_size * num_ct_blocks..lwe_size * new_num_ct_blocks,
streams.gpu_indexes[0],
)
.unwrap();
let (padding_block, new_blocks) = output_slice.split_at_mut(lwe_size, 0);
let (padding_block, new_blocks) =
output_slice.split_at_mut(lwe_size, streams.gpu_indexes[0]);
let mut padding_block = padding_block.unwrap();
let mut new_blocks = new_blocks.unwrap();
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
apply_univariate_lut_kb_async(
stream,
streams,
&mut padding_block,
&last_block,
padding_block_creator_lut.acc.as_ref(),
@@ -657,7 +752,7 @@ impl CudaServerKey {
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
apply_univariate_lut_kb_async(
stream,
streams,
&mut padding_block,
&last_block,
padding_block_creator_lut.acc.as_ref(),
@@ -682,12 +777,12 @@ impl CudaServerKey {
}
for i in 0..num_blocks - 1 {
let mut output_block = new_blocks
.get_mut(lwe_size * i..lwe_size * (i + 1), 0)
.get_mut(lwe_size * i..lwe_size * (i + 1), streams.gpu_indexes[0])
.unwrap();
output_block.copy_from_gpu_async(&padding_block, stream, 0);
output_block.copy_from_gpu_async(&padding_block, streams, streams.gpu_indexes[0]);
}
}
stream.synchronize();
streams.synchronize();
let output_lwe_list = CudaLweCiphertextList(CudaLweList {
d_vec: output_radix,
lwe_ciphertext_count: LweCiphertextCount(new_num_ct_blocks),

View File

@@ -11,14 +11,14 @@ use crate::integer::gpu::{
impl CudaServerKey {
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until stream is synchronised
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronised
pub unsafe fn unchecked_scalar_bitop_assign_async<Scalar, T>(
&self,
ct: &mut T,
rhs: Scalar,
op: BitOpType,
stream: &CudaStreams,
streams: &CudaStreams,
) where
Scalar: DecomposableInto<u8>,
T: CudaIntegerRadixCiphertext,
@@ -31,12 +31,13 @@ impl CudaServerKey {
.map(|x| x as u64)
.collect::<Vec<_>>();
let clear_blocks = CudaVec::from_cpu_async(&h_clear_blocks, stream, 0);
let clear_blocks =
CudaVec::from_cpu_async(&h_clear_blocks, streams, streams.gpu_indexes[0]);
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_scalar_bitop_integer_radix_kb_assign_async(
stream,
streams,
&mut ct.as_mut().d_blocks.0.d_vec,
&clear_blocks,
&d_bsk.d_vec,
@@ -63,7 +64,7 @@ impl CudaServerKey {
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_scalar_bitop_integer_radix_kb_assign_async(
stream,
streams,
&mut ct.as_mut().d_blocks.0.d_vec,
&clear_blocks,
&d_multibit_bsk.d_vec,
@@ -91,13 +92,18 @@ impl CudaServerKey {
}
}
pub fn unchecked_scalar_bitand<Scalar, T>(&self, ct: &T, rhs: Scalar, stream: &CudaStreams) -> T
pub fn unchecked_scalar_bitand<Scalar, T>(
&self,
ct: &T,
rhs: Scalar,
streams: &CudaStreams,
) -> T
where
Scalar: DecomposableInto<u8>,
T: CudaIntegerRadixCiphertext,
{
let mut result = unsafe { ct.duplicate_async(stream) };
self.unchecked_scalar_bitand_assign(&mut result, rhs, stream);
let mut result = unsafe { ct.duplicate_async(streams) };
self.unchecked_scalar_bitand_assign(&mut result, rhs, streams);
result
}
@@ -105,25 +111,25 @@ impl CudaServerKey {
&self,
ct: &mut T,
rhs: Scalar,
stream: &CudaStreams,
streams: &CudaStreams,
) where
Scalar: DecomposableInto<u8>,
T: CudaIntegerRadixCiphertext,
{
unsafe {
self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarAnd, stream);
self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarAnd, streams);
ct.as_mut().info = ct.as_ref().info.after_scalar_bitand(rhs);
}
stream.synchronize();
streams.synchronize();
}
pub fn unchecked_scalar_bitor<Scalar, T>(&self, ct: &T, rhs: Scalar, stream: &CudaStreams) -> T
pub fn unchecked_scalar_bitor<Scalar, T>(&self, ct: &T, rhs: Scalar, streams: &CudaStreams) -> T
where
Scalar: DecomposableInto<u8>,
T: CudaIntegerRadixCiphertext,
{
let mut result = unsafe { ct.duplicate_async(stream) };
self.unchecked_scalar_bitor_assign(&mut result, rhs, stream);
let mut result = unsafe { ct.duplicate_async(streams) };
self.unchecked_scalar_bitor_assign(&mut result, rhs, streams);
result
}
@@ -131,25 +137,30 @@ impl CudaServerKey {
&self,
ct: &mut T,
rhs: Scalar,
stream: &CudaStreams,
streams: &CudaStreams,
) where
Scalar: DecomposableInto<u8>,
T: CudaIntegerRadixCiphertext,
{
unsafe {
self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarOr, stream);
self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarOr, streams);
ct.as_mut().info = ct.as_ref().info.after_scalar_bitor(rhs);
}
stream.synchronize();
streams.synchronize();
}
pub fn unchecked_scalar_bitxor<Scalar, T>(&self, ct: &T, rhs: Scalar, stream: &CudaStreams) -> T
pub fn unchecked_scalar_bitxor<Scalar, T>(
&self,
ct: &T,
rhs: Scalar,
streams: &CudaStreams,
) -> T
where
Scalar: DecomposableInto<u8>,
T: CudaIntegerRadixCiphertext,
{
let mut result = unsafe { ct.duplicate_async(stream) };
self.unchecked_scalar_bitxor_assign(&mut result, rhs, stream);
let mut result = unsafe { ct.duplicate_async(streams) };
self.unchecked_scalar_bitxor_assign(&mut result, rhs, streams);
result
}
@@ -157,138 +168,138 @@ impl CudaServerKey {
&self,
ct: &mut T,
rhs: Scalar,
stream: &CudaStreams,
streams: &CudaStreams,
) where
Scalar: DecomposableInto<u8>,
T: CudaIntegerRadixCiphertext,
{
unsafe {
self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarXor, stream);
self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarXor, streams);
ct.as_mut().info = ct.as_ref().info.after_scalar_bitxor(rhs);
}
stream.synchronize();
streams.synchronize();
}
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until stream is synchronised
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronised
pub unsafe fn scalar_bitand_assign_async<Scalar, T>(
&self,
ct: &mut T,
rhs: Scalar,
stream: &CudaStreams,
streams: &CudaStreams,
) where
Scalar: DecomposableInto<u8>,
T: CudaIntegerRadixCiphertext,
{
if !ct.block_carries_are_empty() {
self.full_propagate_assign_async(ct, stream);
self.full_propagate_assign_async(ct, streams);
}
self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarAnd, stream);
self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarAnd, streams);
ct.as_mut().info = ct.as_ref().info.after_scalar_bitand(rhs);
}
pub fn scalar_bitand_assign<Scalar, T>(&self, ct: &mut T, rhs: Scalar, stream: &CudaStreams)
pub fn scalar_bitand_assign<Scalar, T>(&self, ct: &mut T, rhs: Scalar, streams: &CudaStreams)
where
Scalar: DecomposableInto<u8>,
T: CudaIntegerRadixCiphertext,
{
unsafe {
self.scalar_bitand_assign_async(ct, rhs, stream);
self.scalar_bitand_assign_async(ct, rhs, streams);
}
stream.synchronize();
streams.synchronize();
}
pub fn scalar_bitand<Scalar, T>(&self, ct: &T, rhs: Scalar, stream: &CudaStreams) -> T
pub fn scalar_bitand<Scalar, T>(&self, ct: &T, rhs: Scalar, streams: &CudaStreams) -> T
where
Scalar: DecomposableInto<u8>,
T: CudaIntegerRadixCiphertext,
{
let mut result = unsafe { ct.duplicate_async(stream) };
self.scalar_bitand_assign(&mut result, rhs, stream);
let mut result = unsafe { ct.duplicate_async(streams) };
self.scalar_bitand_assign(&mut result, rhs, streams);
result
}
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until stream is synchronised
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronised
pub unsafe fn scalar_bitor_assign_async<Scalar, T>(
&self,
ct: &mut T,
rhs: Scalar,
stream: &CudaStreams,
streams: &CudaStreams,
) where
Scalar: DecomposableInto<u8>,
T: CudaIntegerRadixCiphertext,
{
if !ct.block_carries_are_empty() {
self.full_propagate_assign_async(ct, stream);
self.full_propagate_assign_async(ct, streams);
}
self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarOr, stream);
self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarOr, streams);
ct.as_mut().info = ct.as_ref().info.after_scalar_bitor(rhs);
}
pub fn scalar_bitor_assign<Scalar, T>(&self, ct: &mut T, rhs: Scalar, stream: &CudaStreams)
pub fn scalar_bitor_assign<Scalar, T>(&self, ct: &mut T, rhs: Scalar, streams: &CudaStreams)
where
Scalar: DecomposableInto<u8>,
T: CudaIntegerRadixCiphertext,
{
unsafe {
self.scalar_bitor_assign_async(ct, rhs, stream);
self.scalar_bitor_assign_async(ct, rhs, streams);
}
stream.synchronize();
streams.synchronize();
}
pub fn scalar_bitor<Scalar, T>(&self, ct: &T, rhs: Scalar, stream: &CudaStreams) -> T
pub fn scalar_bitor<Scalar, T>(&self, ct: &T, rhs: Scalar, streams: &CudaStreams) -> T
where
Scalar: DecomposableInto<u8>,
T: CudaIntegerRadixCiphertext,
{
let mut result = unsafe { ct.duplicate_async(stream) };
self.scalar_bitor_assign(&mut result, rhs, stream);
let mut result = unsafe { ct.duplicate_async(streams) };
self.scalar_bitor_assign(&mut result, rhs, streams);
result
}
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until stream is synchronised
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronised
pub unsafe fn scalar_bitxor_assign_async<Scalar, T>(
&self,
ct: &mut T,
rhs: Scalar,
stream: &CudaStreams,
streams: &CudaStreams,
) where
Scalar: DecomposableInto<u8>,
T: CudaIntegerRadixCiphertext,
{
if !ct.block_carries_are_empty() {
self.full_propagate_assign_async(ct, stream);
self.full_propagate_assign_async(ct, streams);
}
self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarXor, stream);
self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarXor, streams);
ct.as_mut().info = ct.as_ref().info.after_scalar_bitxor(rhs);
}
pub fn scalar_bitxor_assign<Scalar, T>(&self, ct: &mut T, rhs: Scalar, stream: &CudaStreams)
pub fn scalar_bitxor_assign<Scalar, T>(&self, ct: &mut T, rhs: Scalar, streams: &CudaStreams)
where
Scalar: DecomposableInto<u8>,
T: CudaIntegerRadixCiphertext,
{
unsafe {
self.scalar_bitxor_assign_async(ct, rhs, stream);
self.scalar_bitxor_assign_async(ct, rhs, streams);
}
stream.synchronize();
streams.synchronize();
}
pub fn scalar_bitxor<Scalar, T>(&self, ct: &T, rhs: Scalar, stream: &CudaStreams) -> T
pub fn scalar_bitxor<Scalar, T>(&self, ct: &T, rhs: Scalar, streams: &CudaStreams) -> T
where
Scalar: DecomposableInto<u8>,
T: CudaIntegerRadixCiphertext,
{
let mut result = unsafe { ct.duplicate_async(stream) };
self.scalar_bitxor_assign(&mut result, rhs, stream);
let mut result = unsafe { ct.duplicate_async(streams) };
self.scalar_bitxor_assign(&mut result, rhs, streams);
result
}
}

View File

@@ -102,15 +102,15 @@ impl CudaServerKey {
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until stream is synchronised
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronised
pub unsafe fn unchecked_signed_and_unsigned_scalar_comparison_async<Scalar, T>(
&self,
ct: &T,
scalar: Scalar,
op: ComparisonType,
signed_with_positive_scalar: bool,
stream: &CudaStreams,
streams: &CudaStreams,
) -> CudaBooleanBlock
where
Scalar: DecomposableInto<u64>,
@@ -122,7 +122,7 @@ impl CudaServerKey {
ComparisonType::GT | ComparisonType::GE | ComparisonType::NE => 1,
_ => 0,
};
let ct_res: T = self.create_trivial_radix(value, 1, stream);
let ct_res: T = self.create_trivial_radix(value, 1, streams);
return CudaBooleanBlock::from_cuda_radix_ciphertext(ct_res.into_inner());
}
@@ -144,7 +144,7 @@ impl CudaServerKey {
ComparisonType::LT | ComparisonType::LE | ComparisonType::NE => 1,
_ => 0,
};
let ct_res: T = self.create_trivial_radix(value, 1, stream);
let ct_res: T = self.create_trivial_radix(value, 1, streams);
return CudaBooleanBlock::from_cuda_radix_ciphertext(ct_res.into_inner());
}
@@ -153,7 +153,8 @@ impl CudaServerKey {
// as we will handle them separately.
scalar_blocks.truncate(ct.as_ref().d_blocks.lwe_ciphertext_count().0);
let d_scalar_blocks: CudaVec<u64> = CudaVec::from_cpu_async(&scalar_blocks, stream, 0);
let d_scalar_blocks: CudaVec<u64> =
CudaVec::from_cpu_async(&scalar_blocks, streams, streams.gpu_indexes[0]);
let lwe_ciphertext_count = ct.as_ref().d_blocks.lwe_ciphertext_count();
@@ -161,7 +162,7 @@ impl CudaServerKey {
ct.as_ref().d_blocks.lwe_dimension(),
LweCiphertextCount(1),
CiphertextModulus::new_native(),
stream,
streams,
);
let mut block_info = ct.as_ref().info.blocks[0];
block_info.degree = Degree::new(0);
@@ -174,7 +175,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_scalar_comparison_integer_radix_kb_async(
stream,
streams,
&mut result.as_mut().ciphertext.d_blocks.0.d_vec,
&ct.as_ref().d_blocks.0.d_vec,
&d_scalar_blocks,
@@ -204,7 +205,7 @@ impl CudaServerKey {
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_scalar_comparison_integer_radix_kb_async(
stream,
streams,
&mut result.as_mut().ciphertext.d_blocks.0.d_vec,
&ct.as_ref().d_blocks.0.d_vec,
&d_scalar_blocks,
@@ -239,14 +240,14 @@ impl CudaServerKey {
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until stream is synchronised
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronised
pub unsafe fn unchecked_scalar_comparison_async<Scalar, T>(
&self,
ct: &T,
scalar: Scalar,
op: ComparisonType,
stream: &CudaStreams,
streams: &CudaStreams,
) -> CudaBooleanBlock
where
Scalar: DecomposableInto<u64>,
@@ -260,12 +261,12 @@ impl CudaServerKey {
// Scalar is greater than the bounds, so ciphertext is smaller
let result: T = match op {
ComparisonType::LT | ComparisonType::LE => {
self.create_trivial_radix(1, num_blocks, stream)
self.create_trivial_radix(1, num_blocks, streams)
}
_ => self.create_trivial_radix(
0,
ct.as_ref().d_blocks.lwe_ciphertext_count().0,
stream,
streams,
),
};
return CudaBooleanBlock::from_cuda_radix_ciphertext(result.into_inner());
@@ -274,12 +275,12 @@ impl CudaServerKey {
// Scalar is smaller than the bounds, so ciphertext is bigger
let result: T = match op {
ComparisonType::GT | ComparisonType::GE => {
self.create_trivial_radix(1, num_blocks, stream)
self.create_trivial_radix(1, num_blocks, streams)
}
_ => self.create_trivial_radix(
0,
ct.as_ref().d_blocks.lwe_ciphertext_count().0,
stream,
streams,
),
};
return CudaBooleanBlock::from_cuda_radix_ciphertext(result.into_inner());
@@ -292,29 +293,29 @@ impl CudaServerKey {
if scalar >= Scalar::ZERO {
self.unchecked_signed_and_unsigned_scalar_comparison_async(
ct, scalar, op, true, stream,
ct, scalar, op, true, streams,
)
} else {
let scalar_as_trivial = self.create_trivial_radix(scalar, num_blocks, stream);
self.unchecked_comparison_async(ct, &scalar_as_trivial, op, stream)
let scalar_as_trivial = self.create_trivial_radix(scalar, num_blocks, streams);
self.unchecked_comparison_async(ct, &scalar_as_trivial, op, streams)
}
} else {
// Unsigned
self.unchecked_signed_and_unsigned_scalar_comparison_async(
ct, scalar, op, false, stream,
ct, scalar, op, false, streams,
)
}
}
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until stream is synchronised
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronised
pub unsafe fn unchecked_scalar_minmax_async<Scalar, T>(
&self,
ct: &T,
scalar: Scalar,
op: ComparisonType,
stream: &CudaStreams,
streams: &CudaStreams,
) -> T
where
T: CudaIntegerRadixCiphertext,
@@ -327,16 +328,17 @@ impl CudaServerKey {
.iter_as::<u64>()
.collect::<Vec<_>>();
let d_scalar_blocks: CudaVec<u64> = CudaVec::from_cpu_async(&scalar_blocks, stream, 0);
let d_scalar_blocks: CudaVec<u64> =
CudaVec::from_cpu_async(&scalar_blocks, streams, streams.gpu_indexes[0]);
let lwe_ciphertext_count = ct.as_ref().d_blocks.lwe_ciphertext_count();
let mut result = ct.duplicate_async(stream);
let mut result = ct.duplicate_async(streams);
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_scalar_comparison_integer_radix_kb_async(
stream,
streams,
&mut result.as_mut().d_blocks.0.d_vec,
&ct.as_ref().d_blocks.0.d_vec,
&d_scalar_blocks,
@@ -366,7 +368,7 @@ impl CudaServerKey {
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_scalar_comparison_integer_radix_kb_async(
stream,
streams,
&mut result.as_mut().d_blocks.0.d_vec,
&ct.as_ref().d_blocks.0.d_vec,
&d_scalar_blocks,
@@ -401,45 +403,45 @@ impl CudaServerKey {
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until stream is synchronised
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronised
pub unsafe fn unchecked_scalar_eq_async<Scalar, T>(
&self,
ct: &T,
scalar: Scalar,
stream: &CudaStreams,
streams: &CudaStreams,
) -> CudaBooleanBlock
where
T: CudaIntegerRadixCiphertext,
Scalar: DecomposableInto<u64>,
{
self.unchecked_scalar_comparison_async(ct, scalar, ComparisonType::EQ, stream)
self.unchecked_scalar_comparison_async(ct, scalar, ComparisonType::EQ, streams)
}
pub fn unchecked_scalar_eq<Scalar, T>(
&self,
ct: &T,
scalar: Scalar,
stream: &CudaStreams,
streams: &CudaStreams,
) -> CudaBooleanBlock
where
T: CudaIntegerRadixCiphertext,
Scalar: DecomposableInto<u64>,
{
let result = unsafe { self.unchecked_scalar_eq_async(ct, scalar, stream) };
stream.synchronize();
let result = unsafe { self.unchecked_scalar_eq_async(ct, scalar, streams) };
streams.synchronize();
result
}
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until stream is synchronised
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronised
pub unsafe fn scalar_eq_async<Scalar, T>(
&self,
ct: &T,
scalar: Scalar,
stream: &CudaStreams,
streams: &CudaStreams,
) -> CudaBooleanBlock
where
T: CudaIntegerRadixCiphertext,
@@ -449,12 +451,12 @@ impl CudaServerKey {
let lhs = if ct.block_carries_are_empty() {
ct
} else {
tmp_lhs = ct.duplicate_async(stream);
self.full_propagate_assign_async(&mut tmp_lhs, stream);
tmp_lhs = ct.duplicate_async(streams);
self.full_propagate_assign_async(&mut tmp_lhs, streams);
&tmp_lhs
};
self.unchecked_scalar_eq_async(lhs, scalar, stream)
self.unchecked_scalar_eq_async(lhs, scalar, streams)
}
/// Compares for equality 2 ciphertexts
@@ -473,12 +475,12 @@ impl CudaServerKey {
/// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS;
///
/// let gpu_index = 0;
/// let mut stream = CudaStreams::new_single_gpu(gpu_index);
/// let mut streams = CudaStreams::new_single_gpu(gpu_index);
///
/// let size = 4;
///
/// // Generate the client key and the server key:
/// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &stream);
/// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &streams);
///
/// let msg1 = 14u64;
/// let msg2 = 97u64;
@@ -486,12 +488,12 @@ impl CudaServerKey {
/// let ct1 = cks.encrypt(msg1);
///
/// // Copy to GPU
/// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &stream);
/// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &streams);
///
/// let d_ct_res = sks.scalar_eq(&d_ct1, msg2, &stream);
/// let d_ct_res = sks.scalar_eq(&d_ct1, msg2, &streams);
///
/// // Copy the result back to CPU
/// let ct_res = d_ct_res.to_boolean_block(&stream);
/// let ct_res = d_ct_res.to_boolean_block(&streams);
///
/// // Decrypt:
/// let dec_result = cks.decrypt_bool(&ct_res);
@@ -501,26 +503,26 @@ impl CudaServerKey {
&self,
ct: &T,
scalar: Scalar,
stream: &CudaStreams,
streams: &CudaStreams,
) -> CudaBooleanBlock
where
T: CudaIntegerRadixCiphertext,
Scalar: DecomposableInto<u64>,
{
let result = unsafe { self.scalar_eq_async(ct, scalar, stream) };
stream.synchronize();
let result = unsafe { self.scalar_eq_async(ct, scalar, streams) };
streams.synchronize();
result
}
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until stream is synchronised
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronised
pub unsafe fn scalar_ne_async<Scalar, T>(
&self,
ct: &T,
scalar: Scalar,
stream: &CudaStreams,
streams: &CudaStreams,
) -> CudaBooleanBlock
where
T: CudaIntegerRadixCiphertext,
@@ -530,12 +532,12 @@ impl CudaServerKey {
let lhs = if ct.block_carries_are_empty() {
ct
} else {
tmp_lhs = ct.duplicate_async(stream);
self.full_propagate_assign_async(&mut tmp_lhs, stream);
tmp_lhs = ct.duplicate_async(streams);
self.full_propagate_assign_async(&mut tmp_lhs, streams);
&tmp_lhs
};
self.unchecked_scalar_ne_async(lhs, scalar, stream)
self.unchecked_scalar_ne_async(lhs, scalar, streams)
}
/// Compares for equality 2 ciphertexts
@@ -554,12 +556,12 @@ impl CudaServerKey {
/// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS;
///
/// let gpu_index = 0;
/// let mut stream = CudaStreams::new_single_gpu(gpu_index);
/// let mut streams = CudaStreams::new_single_gpu(gpu_index);
///
/// let size = 4;
///
/// // Generate the client key and the server key:
/// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &stream);
/// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &streams);
///
/// let msg1 = 14u64;
/// let msg2 = 97u64;
@@ -567,12 +569,12 @@ impl CudaServerKey {
/// let ct1 = cks.encrypt(msg1);
///
/// // Copy to GPU
/// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &stream);
/// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &streams);
///
/// let d_ct_res = sks.scalar_ne(&d_ct1, msg2, &stream);
/// let d_ct_res = sks.scalar_ne(&d_ct1, msg2, &streams);
///
/// // Copy the result back to CPU
/// let ct_res = d_ct_res.to_boolean_block(&stream);
/// let ct_res = d_ct_res.to_boolean_block(&streams);
///
/// // Decrypt:
/// let dec_result = cks.decrypt_bool(&ct_res);
@@ -582,185 +584,185 @@ impl CudaServerKey {
&self,
ct: &T,
scalar: Scalar,
stream: &CudaStreams,
streams: &CudaStreams,
) -> CudaBooleanBlock
where
Scalar: DecomposableInto<u64>,
T: CudaIntegerRadixCiphertext,
{
let result = unsafe { self.scalar_ne_async(ct, scalar, stream) };
stream.synchronize();
let result = unsafe { self.scalar_ne_async(ct, scalar, streams) };
streams.synchronize();
result
}
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until stream is synchronised
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronised
pub unsafe fn unchecked_scalar_ne_async<Scalar, T>(
&self,
ct: &T,
scalar: Scalar,
stream: &CudaStreams,
streams: &CudaStreams,
) -> CudaBooleanBlock
where
T: CudaIntegerRadixCiphertext,
Scalar: DecomposableInto<u64>,
{
self.unchecked_scalar_comparison_async(ct, scalar, ComparisonType::NE, stream)
self.unchecked_scalar_comparison_async(ct, scalar, ComparisonType::NE, streams)
}
pub fn unchecked_scalar_ne<Scalar, T>(
&self,
ct: &T,
scalar: Scalar,
stream: &CudaStreams,
streams: &CudaStreams,
) -> CudaBooleanBlock
where
T: CudaIntegerRadixCiphertext,
Scalar: DecomposableInto<u64>,
{
let result = unsafe { self.unchecked_scalar_ne_async(ct, scalar, stream) };
stream.synchronize();
let result = unsafe { self.unchecked_scalar_ne_async(ct, scalar, streams) };
streams.synchronize();
result
}
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until stream is synchronised
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronised
pub unsafe fn unchecked_scalar_gt_async<Scalar, T>(
&self,
ct: &T,
scalar: Scalar,
stream: &CudaStreams,
streams: &CudaStreams,
) -> CudaBooleanBlock
where
Scalar: DecomposableInto<u64>,
T: CudaIntegerRadixCiphertext,
{
self.unchecked_scalar_comparison_async(ct, scalar, ComparisonType::GT, stream)
self.unchecked_scalar_comparison_async(ct, scalar, ComparisonType::GT, streams)
}
pub fn unchecked_scalar_gt<Scalar, T>(
&self,
ct: &T,
scalar: Scalar,
stream: &CudaStreams,
streams: &CudaStreams,
) -> CudaBooleanBlock
where
Scalar: DecomposableInto<u64>,
T: CudaIntegerRadixCiphertext,
{
let result = unsafe { self.unchecked_scalar_gt_async(ct, scalar, stream) };
stream.synchronize();
let result = unsafe { self.unchecked_scalar_gt_async(ct, scalar, streams) };
streams.synchronize();
result
}
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until stream is synchronised
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronised
pub unsafe fn unchecked_scalar_ge_async<Scalar, T>(
&self,
ct: &T,
scalar: Scalar,
stream: &CudaStreams,
streams: &CudaStreams,
) -> CudaBooleanBlock
where
Scalar: DecomposableInto<u64>,
T: CudaIntegerRadixCiphertext,
{
self.unchecked_scalar_comparison_async(ct, scalar, ComparisonType::GE, stream)
self.unchecked_scalar_comparison_async(ct, scalar, ComparisonType::GE, streams)
}
pub fn unchecked_scalar_ge<Scalar, T>(
&self,
ct: &T,
scalar: Scalar,
stream: &CudaStreams,
streams: &CudaStreams,
) -> CudaBooleanBlock
where
Scalar: DecomposableInto<u64>,
T: CudaIntegerRadixCiphertext,
{
let result = unsafe { self.unchecked_scalar_ge_async(ct, scalar, stream) };
stream.synchronize();
let result = unsafe { self.unchecked_scalar_ge_async(ct, scalar, streams) };
streams.synchronize();
result
}
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until stream is synchronised
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronised
pub unsafe fn unchecked_scalar_lt_async<Scalar, T>(
&self,
ct: &T,
scalar: Scalar,
stream: &CudaStreams,
streams: &CudaStreams,
) -> CudaBooleanBlock
where
Scalar: DecomposableInto<u64>,
T: CudaIntegerRadixCiphertext,
{
self.unchecked_scalar_comparison_async(ct, scalar, ComparisonType::LT, stream)
self.unchecked_scalar_comparison_async(ct, scalar, ComparisonType::LT, streams)
}
pub fn unchecked_scalar_lt<Scalar, T>(
&self,
ct: &T,
scalar: Scalar,
stream: &CudaStreams,
streams: &CudaStreams,
) -> CudaBooleanBlock
where
Scalar: DecomposableInto<u64>,
T: CudaIntegerRadixCiphertext,
{
let result = unsafe { self.unchecked_scalar_lt_async(ct, scalar, stream) };
stream.synchronize();
let result = unsafe { self.unchecked_scalar_lt_async(ct, scalar, streams) };
streams.synchronize();
result
}
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until stream is synchronised
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronised
pub unsafe fn unchecked_scalar_le_async<Scalar, T>(
&self,
ct: &T,
scalar: Scalar,
stream: &CudaStreams,
streams: &CudaStreams,
) -> CudaBooleanBlock
where
Scalar: DecomposableInto<u64>,
T: CudaIntegerRadixCiphertext,
{
self.unchecked_scalar_comparison_async(ct, scalar, ComparisonType::LE, stream)
self.unchecked_scalar_comparison_async(ct, scalar, ComparisonType::LE, streams)
}
pub fn unchecked_scalar_le<Scalar, T>(
&self,
ct: &T,
scalar: Scalar,
stream: &CudaStreams,
streams: &CudaStreams,
) -> CudaBooleanBlock
where
Scalar: DecomposableInto<u64>,
T: CudaIntegerRadixCiphertext,
{
let result = unsafe { self.unchecked_scalar_le_async(ct, scalar, stream) };
stream.synchronize();
let result = unsafe { self.unchecked_scalar_le_async(ct, scalar, streams) };
streams.synchronize();
result
}
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until stream is synchronised
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronised
pub unsafe fn scalar_gt_async<Scalar, T>(
&self,
ct: &T,
scalar: Scalar,
stream: &CudaStreams,
streams: &CudaStreams,
) -> CudaBooleanBlock
where
Scalar: DecomposableInto<u64>,
@@ -770,38 +772,38 @@ impl CudaServerKey {
let lhs = if ct.block_carries_are_empty() {
ct
} else {
tmp_lhs = ct.duplicate_async(stream);
self.full_propagate_assign_async(&mut tmp_lhs, stream);
tmp_lhs = ct.duplicate_async(streams);
self.full_propagate_assign_async(&mut tmp_lhs, streams);
&tmp_lhs
};
self.unchecked_scalar_gt_async(lhs, scalar, stream)
self.unchecked_scalar_gt_async(lhs, scalar, streams)
}
pub fn scalar_gt<Scalar, T>(
&self,
ct: &T,
scalar: Scalar,
stream: &CudaStreams,
streams: &CudaStreams,
) -> CudaBooleanBlock
where
Scalar: DecomposableInto<u64>,
T: CudaIntegerRadixCiphertext,
{
let result = unsafe { self.scalar_gt_async(ct, scalar, stream) };
stream.synchronize();
let result = unsafe { self.scalar_gt_async(ct, scalar, streams) };
streams.synchronize();
result
}
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until stream is synchronised
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronised
pub unsafe fn scalar_ge_async<Scalar, T>(
&self,
ct: &T,
scalar: Scalar,
stream: &CudaStreams,
streams: &CudaStreams,
) -> CudaBooleanBlock
where
Scalar: DecomposableInto<u64>,
@@ -811,38 +813,38 @@ impl CudaServerKey {
let lhs = if ct.block_carries_are_empty() {
ct
} else {
tmp_lhs = ct.duplicate_async(stream);
self.full_propagate_assign_async(&mut tmp_lhs, stream);
tmp_lhs = ct.duplicate_async(streams);
self.full_propagate_assign_async(&mut tmp_lhs, streams);
&tmp_lhs
};
self.unchecked_scalar_ge_async(lhs, scalar, stream)
self.unchecked_scalar_ge_async(lhs, scalar, streams)
}
pub fn scalar_ge<Scalar, T>(
&self,
ct: &T,
scalar: Scalar,
stream: &CudaStreams,
streams: &CudaStreams,
) -> CudaBooleanBlock
where
Scalar: DecomposableInto<u64>,
T: CudaIntegerRadixCiphertext,
{
let result = unsafe { self.scalar_ge_async(ct, scalar, stream) };
stream.synchronize();
let result = unsafe { self.scalar_ge_async(ct, scalar, streams) };
streams.synchronize();
result
}
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until stream is synchronised
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronised
pub unsafe fn scalar_lt_async<Scalar, T>(
&self,
ct: &T,
scalar: Scalar,
stream: &CudaStreams,
streams: &CudaStreams,
) -> CudaBooleanBlock
where
Scalar: DecomposableInto<u64>,
@@ -852,37 +854,37 @@ impl CudaServerKey {
let lhs = if ct.block_carries_are_empty() {
ct
} else {
tmp_lhs = ct.duplicate_async(stream);
self.full_propagate_assign_async(&mut tmp_lhs, stream);
tmp_lhs = ct.duplicate_async(streams);
self.full_propagate_assign_async(&mut tmp_lhs, streams);
&tmp_lhs
};
self.unchecked_scalar_lt_async(lhs, scalar, stream)
self.unchecked_scalar_lt_async(lhs, scalar, streams)
}
pub fn scalar_lt<Scalar, T>(
&self,
ct: &T,
scalar: Scalar,
stream: &CudaStreams,
streams: &CudaStreams,
) -> CudaBooleanBlock
where
Scalar: DecomposableInto<u64>,
T: CudaIntegerRadixCiphertext,
{
let result = unsafe { self.scalar_lt_async(ct, scalar, stream) };
stream.synchronize();
let result = unsafe { self.scalar_lt_async(ct, scalar, streams) };
streams.synchronize();
result
}
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until stream is synchronised
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronised
pub unsafe fn scalar_le_async<Scalar, T>(
&self,
ct: &T,
scalar: Scalar,
stream: &CudaStreams,
streams: &CudaStreams,
) -> CudaBooleanBlock
where
Scalar: DecomposableInto<u64>,
@@ -892,92 +894,102 @@ impl CudaServerKey {
let lhs = if ct.block_carries_are_empty() {
ct
} else {
tmp_lhs = ct.duplicate_async(stream);
self.full_propagate_assign_async(&mut tmp_lhs, stream);
tmp_lhs = ct.duplicate_async(streams);
self.full_propagate_assign_async(&mut tmp_lhs, streams);
&tmp_lhs
};
self.unchecked_scalar_le_async(lhs, scalar, stream)
self.unchecked_scalar_le_async(lhs, scalar, streams)
}
pub fn scalar_le<Scalar, T>(
&self,
ct: &T,
scalar: Scalar,
stream: &CudaStreams,
streams: &CudaStreams,
) -> CudaBooleanBlock
where
Scalar: DecomposableInto<u64>,
T: CudaIntegerRadixCiphertext,
{
let result = unsafe { self.scalar_le_async(ct, scalar, stream) };
stream.synchronize();
let result = unsafe { self.scalar_le_async(ct, scalar, streams) };
streams.synchronize();
result
}
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until stream is synchronised
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronised
pub unsafe fn unchecked_scalar_max_async<Scalar, T>(
&self,
ct: &T,
scalar: Scalar,
stream: &CudaStreams,
streams: &CudaStreams,
) -> T
where
Scalar: DecomposableInto<u64>,
T: CudaIntegerRadixCiphertext,
{
self.unchecked_scalar_minmax_async(ct, scalar, ComparisonType::MAX, stream)
self.unchecked_scalar_minmax_async(ct, scalar, ComparisonType::MAX, streams)
}
pub fn unchecked_scalar_max<Scalar, T>(&self, ct: &T, scalar: Scalar, stream: &CudaStreams) -> T
pub fn unchecked_scalar_max<Scalar, T>(
&self,
ct: &T,
scalar: Scalar,
streams: &CudaStreams,
) -> T
where
Scalar: DecomposableInto<u64>,
T: CudaIntegerRadixCiphertext,
{
let result = unsafe { self.unchecked_scalar_max_async(ct, scalar, stream) };
stream.synchronize();
let result = unsafe { self.unchecked_scalar_max_async(ct, scalar, streams) };
streams.synchronize();
result
}
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until stream is synchronised
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronised
pub unsafe fn unchecked_scalar_min_async<Scalar, T>(
&self,
ct: &T,
scalar: Scalar,
stream: &CudaStreams,
streams: &CudaStreams,
) -> T
where
Scalar: DecomposableInto<u64>,
T: CudaIntegerRadixCiphertext,
{
self.unchecked_scalar_minmax_async(ct, scalar, ComparisonType::MIN, stream)
self.unchecked_scalar_minmax_async(ct, scalar, ComparisonType::MIN, streams)
}
pub fn unchecked_scalar_min<Scalar, T>(&self, ct: &T, scalar: Scalar, stream: &CudaStreams) -> T
pub fn unchecked_scalar_min<Scalar, T>(
&self,
ct: &T,
scalar: Scalar,
streams: &CudaStreams,
) -> T
where
Scalar: DecomposableInto<u64>,
T: CudaIntegerRadixCiphertext,
{
let result = unsafe { self.unchecked_scalar_min_async(ct, scalar, stream) };
stream.synchronize();
let result = unsafe { self.unchecked_scalar_min_async(ct, scalar, streams) };
streams.synchronize();
result
}
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until stream is synchronised
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronised
pub unsafe fn scalar_max_async<Scalar, T>(
&self,
ct: &T,
scalar: Scalar,
stream: &CudaStreams,
streams: &CudaStreams,
) -> T
where
Scalar: DecomposableInto<u64>,
@@ -987,33 +999,33 @@ impl CudaServerKey {
let lhs = if ct.block_carries_are_empty() {
ct
} else {
tmp_lhs = ct.duplicate_async(stream);
self.full_propagate_assign_async(&mut tmp_lhs, stream);
tmp_lhs = ct.duplicate_async(streams);
self.full_propagate_assign_async(&mut tmp_lhs, streams);
&tmp_lhs
};
self.unchecked_scalar_max_async(lhs, scalar, stream)
self.unchecked_scalar_max_async(lhs, scalar, streams)
}
pub fn scalar_max<Scalar, T>(&self, ct: &T, scalar: Scalar, stream: &CudaStreams) -> T
pub fn scalar_max<Scalar, T>(&self, ct: &T, scalar: Scalar, streams: &CudaStreams) -> T
where
Scalar: DecomposableInto<u64>,
T: CudaIntegerRadixCiphertext,
{
let result = unsafe { self.scalar_max_async(ct, scalar, stream) };
stream.synchronize();
let result = unsafe { self.scalar_max_async(ct, scalar, streams) };
streams.synchronize();
result
}
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until stream is synchronised
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronised
pub unsafe fn scalar_min_async<Scalar, T>(
&self,
ct: &T,
scalar: Scalar,
stream: &CudaStreams,
streams: &CudaStreams,
) -> T
where
Scalar: DecomposableInto<u64>,
@@ -1023,21 +1035,21 @@ impl CudaServerKey {
let lhs = if ct.block_carries_are_empty() {
ct
} else {
tmp_lhs = ct.duplicate_async(stream);
self.full_propagate_assign_async(&mut tmp_lhs, stream);
tmp_lhs = ct.duplicate_async(streams);
self.full_propagate_assign_async(&mut tmp_lhs, streams);
&tmp_lhs
};
self.unchecked_scalar_min_async(lhs, scalar, stream)
self.unchecked_scalar_min_async(lhs, scalar, streams)
}
pub fn scalar_min<Scalar, T>(&self, ct: &T, scalar: Scalar, stream: &CudaStreams) -> T
pub fn scalar_min<Scalar, T>(&self, ct: &T, scalar: Scalar, streams: &CudaStreams) -> T
where
Scalar: DecomposableInto<u64>,
T: CudaIntegerRadixCiphertext,
{
let result = unsafe { self.scalar_min_async(ct, scalar, stream) };
stream.synchronize();
let result = unsafe { self.scalar_min_async(ct, scalar, streams) };
streams.synchronize();
result
}
}

View File

@@ -26,50 +26,59 @@ impl CudaServerKey {
/// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS;
///
/// let gpu_index = 0;
/// let mut stream = CudaStreams::new_single_gpu(gpu_index);
/// let mut streams = CudaStreams::new_single_gpu(gpu_index);
///
/// // We have 4 * 2 = 8 bits of message
/// let size = 4;
/// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut stream);
/// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut streams);
///
/// let msg = 30;
/// let scalar = 3;
///
/// let ct = cks.encrypt(msg);
/// let mut d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &mut stream);
/// let mut d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &mut streams);
///
/// // Compute homomorphically a scalar multiplication:
/// let d_ct_res = sks.unchecked_scalar_mul(&d_ct, scalar, &mut stream);
/// let ct_res = d_ct_res.to_radix_ciphertext(&mut stream);
/// let d_ct_res = sks.unchecked_scalar_mul(&d_ct, scalar, &mut streams);
/// let ct_res = d_ct_res.to_radix_ciphertext(&mut streams);
///
/// let clear: u64 = cks.decrypt(&ct_res);
/// assert_eq!(scalar * msg, clear);
/// ```
pub fn unchecked_scalar_mul<Scalar, T>(&self, ct: &T, scalar: Scalar, stream: &CudaStreams) -> T
pub fn unchecked_scalar_mul<Scalar, T>(
&self,
ct: &T,
scalar: Scalar,
streams: &CudaStreams,
) -> T
where
Scalar: ScalarMultiplier + DecomposableInto<u8> + CastInto<u64>,
T: CudaIntegerRadixCiphertext,
{
let mut result = unsafe { ct.duplicate_async(stream) };
self.unchecked_scalar_mul_assign(&mut result, scalar, stream);
let mut result = unsafe { ct.duplicate_async(streams) };
self.unchecked_scalar_mul_assign(&mut result, scalar, streams);
result
}
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until stream is synchronised
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronised
pub unsafe fn unchecked_scalar_mul_assign_async<Scalar, T>(
&self,
ct: &mut T,
scalar: Scalar,
stream: &CudaStreams,
streams: &CudaStreams,
) where
Scalar: ScalarMultiplier + DecomposableInto<u8> + CastInto<u64>,
T: CudaIntegerRadixCiphertext,
{
if scalar == Scalar::ZERO {
ct.as_mut().d_blocks.0.d_vec.memset_async(0, stream, 0);
ct.as_mut()
.d_blocks
.0
.d_vec
.memset_async(0, streams, streams.gpu_indexes[0]);
return;
}
@@ -80,7 +89,7 @@ impl CudaServerKey {
if scalar.is_power_of_two() {
// Shifting cost one bivariate PBS so its always faster
// than multiplying
self.unchecked_scalar_left_shift_assign_async(ct, scalar.ilog2() as u64, stream);
self.unchecked_scalar_left_shift_assign_async(ct, scalar.ilog2() as u64, streams);
return;
}
let ciphertext = ct.as_mut();
@@ -104,7 +113,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_scalar_mul_integer_radix_kb_async(
stream,
streams,
&mut ct.as_mut().d_blocks.0.d_vec,
decomposed_scalar.as_slice(),
has_at_least_one_set.as_slice(),
@@ -129,7 +138,7 @@ impl CudaServerKey {
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_scalar_mul_integer_radix_kb_async(
stream,
streams,
&mut ct.as_mut().d_blocks.0.d_vec,
decomposed_scalar.as_slice(),
has_at_least_one_set.as_slice(),
@@ -161,15 +170,15 @@ impl CudaServerKey {
&self,
ct: &mut T,
scalar: Scalar,
stream: &CudaStreams,
streams: &CudaStreams,
) where
Scalar: ScalarMultiplier + DecomposableInto<u8> + CastInto<u64>,
T: CudaIntegerRadixCiphertext,
{
unsafe {
self.unchecked_scalar_mul_assign_async(ct, scalar, stream);
self.unchecked_scalar_mul_assign_async(ct, scalar, streams);
}
stream.synchronize();
streams.synchronize();
}
/// Computes homomorphically a multiplication between a scalar and a ciphertext.
@@ -189,63 +198,63 @@ impl CudaServerKey {
/// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS;
///
/// let gpu_index = 0;
/// let mut stream = CudaStreams::new_single_gpu(gpu_index);
/// let mut streams = CudaStreams::new_single_gpu(gpu_index);
///
/// // We have 4 * 2 = 8 bits of message
/// let size = 4;
/// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut stream);
/// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut streams);
///
/// let msg = 30;
/// let scalar = 3;
///
/// let ct = cks.encrypt(msg);
/// let mut d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &mut stream);
/// let mut d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &mut streams);
///
/// // Compute homomorphically a scalar multiplication:
/// let d_ct_res = sks.scalar_mul(&d_ct, scalar, &mut stream);
/// let ct_res = d_ct_res.to_radix_ciphertext(&mut stream);
/// let d_ct_res = sks.scalar_mul(&d_ct, scalar, &mut streams);
/// let ct_res = d_ct_res.to_radix_ciphertext(&mut streams);
///
/// let clear: u64 = cks.decrypt(&ct_res);
/// assert_eq!(scalar * msg, clear);
/// ```
pub fn scalar_mul<Scalar, T>(&self, ct: &T, scalar: Scalar, stream: &CudaStreams) -> T
pub fn scalar_mul<Scalar, T>(&self, ct: &T, scalar: Scalar, streams: &CudaStreams) -> T
where
Scalar: ScalarMultiplier + DecomposableInto<u8> + CastInto<u64>,
T: CudaIntegerRadixCiphertext,
{
let mut result = unsafe { ct.duplicate_async(stream) };
self.scalar_mul_assign(&mut result, scalar, stream);
let mut result = unsafe { ct.duplicate_async(streams) };
self.scalar_mul_assign(&mut result, scalar, streams);
result
}
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until stream is synchronised
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronised
pub unsafe fn scalar_mul_assign_async<Scalar, T>(
&self,
ct: &mut T,
scalar: Scalar,
stream: &CudaStreams,
streams: &CudaStreams,
) where
Scalar: ScalarMultiplier + DecomposableInto<u8> + CastInto<u64>,
T: CudaIntegerRadixCiphertext,
{
if !ct.block_carries_are_empty() {
self.full_propagate_assign_async(ct, stream);
self.full_propagate_assign_async(ct, streams);
};
self.unchecked_scalar_mul_assign_async(ct, scalar, stream);
self.unchecked_scalar_mul_assign_async(ct, scalar, streams);
}
pub fn scalar_mul_assign<Scalar, T>(&self, ct: &mut T, scalar: Scalar, stream: &CudaStreams)
pub fn scalar_mul_assign<Scalar, T>(&self, ct: &mut T, scalar: Scalar, streams: &CudaStreams)
where
Scalar: ScalarMultiplier + DecomposableInto<u8> + CastInto<u64>,
T: CudaIntegerRadixCiphertext,
{
unsafe {
self.scalar_mul_assign_async(ct, scalar, stream);
self.scalar_mul_assign_async(ct, scalar, streams);
}
stream.synchronize();
streams.synchronize();
}
}