mirror of
https://github.com/zama-ai/tfhe-rs.git
synced 2026-04-28 03:01:21 -04:00
Compare commits
58 Commits
as/lut_cac
...
tfhe-cuda-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c8b19c3f10 | ||
|
|
7408ca1126 | ||
|
|
8712746296 | ||
|
|
271644f482 | ||
|
|
5a7650a2de | ||
|
|
a3a5b6fe40 | ||
|
|
2210d637c4 | ||
|
|
1e5a38acb0 | ||
|
|
f6b08c45e9 | ||
|
|
6145f6b680 | ||
|
|
0d6602df84 | ||
|
|
a006d5d71f | ||
|
|
a7c5f2dbf8 | ||
|
|
0816c331c5 | ||
|
|
89df44917b | ||
|
|
b218c98194 | ||
|
|
2355cf4d89 | ||
|
|
3b793273cf | ||
|
|
d3b52d92cb | ||
|
|
af7d69e16c | ||
|
|
6bb211f2fd | ||
|
|
0fcce501a2 | ||
|
|
204555e11c | ||
|
|
b00e8bafe5 | ||
|
|
140a780bb6 | ||
|
|
e646130324 | ||
|
|
f79204e23e | ||
|
|
b6b24f4fb9 | ||
|
|
41621dc4e3 | ||
|
|
662038da19 | ||
|
|
9d21aecae9 | ||
|
|
58dbdf7dd4 | ||
|
|
1a7b7ace47 | ||
|
|
7797b60ef2 | ||
|
|
56c0a9fa5e | ||
|
|
37bcb7763d | ||
|
|
ae978bedc6 | ||
|
|
43ebb14b9b | ||
|
|
641b394423 | ||
|
|
72c18cc2b2 | ||
|
|
71e95c435d | ||
|
|
057217b7d2 | ||
|
|
f006039916 | ||
|
|
69df8c0310 | ||
|
|
482e49eac5 | ||
|
|
0dcf2ace3b | ||
|
|
0483c95941 | ||
|
|
f9b292ca77 | ||
|
|
8e194c4b65 | ||
|
|
f02bba151c | ||
|
|
a477548161 | ||
|
|
45a7d5217c | ||
|
|
865c0887ee | ||
|
|
0b9df1c8cd | ||
|
|
cf20e337ef | ||
|
|
d7aea61eba | ||
|
|
9352ecce54 | ||
|
|
179b52ab26 |
@@ -41,7 +41,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -141,7 +141,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
4
.github/workflows/aws_tfhe_fast_tests.yml
vendored
4
.github/workflows/aws_tfhe_fast_tests.yml
vendored
@@ -146,7 +146,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -299,7 +299,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
4
.github/workflows/aws_tfhe_integer_tests.yml
vendored
4
.github/workflows/aws_tfhe_integer_tests.yml
vendored
@@ -86,7 +86,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -168,7 +168,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
4
.github/workflows/aws_tfhe_noise_checks.yml
vendored
4
.github/workflows/aws_tfhe_noise_checks.yml
vendored
@@ -35,7 +35,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -100,7 +100,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
@@ -87,7 +87,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -172,7 +172,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
4
.github/workflows/aws_tfhe_tests.yml
vendored
4
.github/workflows/aws_tfhe_tests.yml
vendored
@@ -155,7 +155,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -279,7 +279,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
4
.github/workflows/aws_tfhe_wasm_tests.yml
vendored
4
.github/workflows/aws_tfhe_wasm_tests.yml
vendored
@@ -39,7 +39,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -147,7 +147,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
4
.github/workflows/benchmark_cpu_common.yml
vendored
4
.github/workflows/benchmark_cpu_common.yml
vendored
@@ -126,7 +126,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -261,7 +261,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
4
.github/workflows/benchmark_ct_key_sizes.yml
vendored
4
.github/workflows/benchmark_ct_key_sizes.yml
vendored
@@ -33,7 +33,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -137,7 +137,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
4
.github/workflows/benchmark_gpu_common.yml
vendored
4
.github/workflows/benchmark_gpu_common.yml
vendored
@@ -134,7 +134,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
continue-on-error: true
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -324,7 +324,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
@@ -92,7 +92,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -324,7 +324,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
2
.github/workflows/benchmark_hpu_common.yml
vendored
2
.github/workflows/benchmark_hpu_common.yml
vendored
@@ -187,7 +187,7 @@ jobs:
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
|
||||
with:
|
||||
name: ${{ github.sha }}_${{ matrix.bench_type }}_${{ matrix.command }}_benchmarks
|
||||
name: ${{ github.sha }}_${{ matrix.bench_type }}_integer_benchmarks
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
|
||||
@@ -143,7 +143,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -383,7 +383,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
4
.github/workflows/benchmark_tfhe_fft.yml
vendored
4
.github/workflows/benchmark_tfhe_fft.yml
vendored
@@ -37,7 +37,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -134,7 +134,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
4
.github/workflows/benchmark_tfhe_ntt.yml
vendored
4
.github/workflows/benchmark_tfhe_ntt.yml
vendored
@@ -37,7 +37,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -134,7 +134,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
4
.github/workflows/benchmark_wasm_client.yml
vendored
4
.github/workflows/benchmark_wasm_client.yml
vendored
@@ -71,7 +71,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -213,7 +213,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
4
.github/workflows/cargo_build_common.yml
vendored
4
.github/workflows/cargo_build_common.yml
vendored
@@ -80,7 +80,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -242,7 +242,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
4
.github/workflows/cargo_test_ntt.yml
vendored
4
.github/workflows/cargo_test_ntt.yml
vendored
@@ -60,7 +60,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -143,7 +143,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
4
.github/workflows/code_coverage.yml
vendored
4
.github/workflows/code_coverage.yml
vendored
@@ -31,7 +31,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -130,7 +130,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
@@ -37,7 +37,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -93,7 +93,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
@@ -43,7 +43,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -137,7 +137,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
4
.github/workflows/gpu_fast_h100_tests.yml
vendored
4
.github/workflows/gpu_fast_h100_tests.yml
vendored
@@ -87,7 +87,7 @@ jobs:
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
continue-on-error: true
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -198,7 +198,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
4
.github/workflows/gpu_fast_tests.yml
vendored
4
.github/workflows/gpu_fast_tests.yml
vendored
@@ -79,7 +79,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -184,7 +184,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
4
.github/workflows/gpu_full_h100_tests.yml
vendored
4
.github/workflows/gpu_full_h100_tests.yml
vendored
@@ -35,7 +35,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
continue-on-error: true
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -124,7 +124,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
@@ -81,7 +81,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -187,7 +187,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
@@ -38,7 +38,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -112,7 +112,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
4
.github/workflows/gpu_memory_sanitizer.yml
vendored
4
.github/workflows/gpu_memory_sanitizer.yml
vendored
@@ -42,7 +42,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -134,7 +134,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
@@ -42,7 +42,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -134,7 +134,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
4
.github/workflows/gpu_pcc.yml
vendored
4
.github/workflows/gpu_pcc.yml
vendored
@@ -38,7 +38,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -159,7 +159,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
@@ -81,7 +81,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -170,7 +170,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
@@ -87,7 +87,7 @@ jobs:
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
continue-on-error: true
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -184,7 +184,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
@@ -82,7 +82,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -179,7 +179,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
@@ -81,7 +81,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -170,7 +170,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
@@ -87,7 +87,7 @@ jobs:
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
continue-on-error: true
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -184,7 +184,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
@@ -82,7 +82,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -179,7 +179,7 @@ jobs:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
4
.github/workflows/hpu_hlapi_tests.yml
vendored
4
.github/workflows/hpu_hlapi_tests.yml
vendored
@@ -62,7 +62,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -114,7 +114,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
4
.github/workflows/integer_long_run_tests.yml
vendored
4
.github/workflows/integer_long_run_tests.yml
vendored
@@ -34,7 +34,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -83,7 +83,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
4
.github/workflows/make_release_cuda.yml
vendored
4
.github/workflows/make_release_cuda.yml
vendored
@@ -37,7 +37,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -221,7 +221,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
4
.github/workflows/parameters_check.yml
vendored
4
.github/workflows/parameters_check.yml
vendored
@@ -38,7 +38,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -137,7 +137,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
|
||||
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
67
.github/workflows/pr_milestone_check.yml
vendored
67
.github/workflows/pr_milestone_check.yml
vendored
@@ -1,67 +0,0 @@
|
||||
name: pr_milestone_check
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
types: [opened, edited, synchronize, reopened, milestoned, demilestoned]
|
||||
|
||||
permissions: {}
|
||||
|
||||
# zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow
|
||||
# external contributors workflows are manually approved
|
||||
|
||||
jobs:
|
||||
check-empty-milestone:
|
||||
name: pr_milestone_check/check-empty-milestone
|
||||
runs-on: ubuntu-latest
|
||||
if: github.event.pull_request.milestone == null
|
||||
permissions:
|
||||
pull-requests: write # Need write access on pull requests to post comment
|
||||
|
||||
steps:
|
||||
- name: Post Reminder Comment
|
||||
uses: octokit/request-action@dad4362715b7fb2ddedf9772c8670824af564f0d # v2.4.0
|
||||
with:
|
||||
route: POST /repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments
|
||||
body: |
|
||||
'### ❌ Milestone Missing
|
||||
|
||||
Please assign a milestone to this pull request. If your PR targets the next version of
|
||||
TFHE-rs please use the current quarter milestone, e.g. "Q1 26".
|
||||
|
||||
If your PR targets a patch version for previous releases: consider creating a dedicated
|
||||
milestone e.g. v1.5.1 if it does not exist yet.'
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Check Final Status
|
||||
run: |
|
||||
echo "::error::Milestone is missing. This check is failing."
|
||||
exit 1
|
||||
|
||||
check-milestone-open:
|
||||
name: pr_milestone_check/check-milestone-open
|
||||
runs-on: ubuntu-latest
|
||||
if: github.event.pull_request.milestone != null && github.event.pull_request.milestone.state == 'closed'
|
||||
permissions:
|
||||
pull-requests: write # Need write access on pull requests to post comment
|
||||
|
||||
steps:
|
||||
- name: Post Reminder Comment
|
||||
uses: octokit/request-action@dad4362715b7fb2ddedf9772c8670824af564f0d # v2.4.0
|
||||
with:
|
||||
route: POST /repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments
|
||||
body: |
|
||||
'### ❌ Milestone is closed
|
||||
|
||||
Please assign an open milestone to this pull request. If your PR targets the next version of
|
||||
TFHE-rs please use the current quarter milestone, e.g. "Q1 26".
|
||||
|
||||
If your PR targets a patch version for previous releases: consider creating a dedicated
|
||||
milestone e.g. v1.5.1 if it does not exist yet.'
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Check Final Status
|
||||
run: |
|
||||
echo "::error::Milestone is closed. This check is failing."
|
||||
exit 1
|
||||
7
Makefile
7
Makefile
@@ -1454,13 +1454,6 @@ bench_integer_aes256_gpu: install_rs_check_toolchain
|
||||
--bench integer-aes256 \
|
||||
--features=integer,internal-keycache,gpu, -p tfhe-benchmark --profile release_lto_off --
|
||||
|
||||
.PHONY: bench_integer_trivium_gpu # Run benchmarks for trivium on GPU backend
|
||||
bench_integer_trivium_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench integer-trivium \
|
||||
--features=integer,internal-keycache,gpu, -p tfhe-benchmark --profile release_lto_off --
|
||||
|
||||
.PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters
|
||||
bench_integer_multi_bit: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=MULTI_BIT __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
||||
|
||||
@@ -79,7 +79,7 @@ tfhe = { version = "*", features = ["boolean", "shortint", "integer"] }
|
||||
```
|
||||
|
||||
> [!Note]
|
||||
> Note: You need Rust version 1.91.1 or newer to compile TFHE-rs. You can check your version with `rustc --version`.
|
||||
> Note: You need Rust version 1.84 or newer to compile TFHE-rs. You can check your version with `rustc --version`.
|
||||
|
||||
> [!Note]
|
||||
> Note: AArch64-based machines are not supported for Windows as it's currently missing an entropy source to be able to seed the [CSPRNGs](https://en.wikipedia.org/wiki/Cryptographically_secure_pseudorandom_number_generator) used in TFHE-rs.
|
||||
|
||||
@@ -1,43 +1,43 @@
|
||||
# Test vectors for TFHE
|
||||
These test vectors are generated using [TFHE-rs](https://github.com/zama-ai/tfhe-rs), with the git tag `tfhe-test-vectors-0.2.0`.
|
||||
|
||||
They are TFHE-rs objects serialized in the [cbor format](https://cbor.io/). These can be deserialized using any cbor library for any programming language. For example, using the [cbor2](https://pypi.org/project/cbor2/) program, the command to run is: `cbor2 --pretty toy_params/lwe_a.cbor`.
|
||||
They are TFHE-rs objects serialized in the [cbor format](https://cbor.io/). You can deserialize them using any cbor library for the language of your choice. For example, using the [cbor2](https://pypi.org/project/cbor2/) program, run: `cbor2 --pretty toy_params/lwe_a.cbor`.
|
||||
|
||||
There are 2 folders with test vectors for different parameter sets:
|
||||
- `valid_params_128`: valid classical PBS parameters using a Gaussian noise distribution, providing 128 bits of security in the IND-CPA model (i.e., the probability of failure is smaller than 2^{-64}).
|
||||
- `toy_params`: insecure parameters that yield smaller values to simplify the bit comparison of the results.
|
||||
You will find 2 folders with test vectors for different parameter sets:
|
||||
- `valid_params_128`: valid classical PBS parameters using a Gaussian noise distribution, providing 128 bits of security in the IND-CPA model and a bootstrapping probability of failure of 2^{-64}.
|
||||
- `toy_params`: insecure parameters that yield smaller values
|
||||
|
||||
The values are generated to compute a keyswitch (KS) followed by a bootstrap (PBS). The cleartext inputs are 2 values, A and B defined below.
|
||||
The values are generated for the keyswitch -> bootstrap (KS-PBS) atomic pattern. The cleartext inputs are 2 values, A and B defined below.
|
||||
|
||||
All the random values are generated from a fixed seed, which can be found in the `RAND_SEED` constant below. The PRNG used is the one based on the AES block cipher in counter mode, from the `tfhe-csprng` crate.
|
||||
|
||||
The bootstrap is applied twice, with 2 different LUTs: the identity LUT and a specific one computing double the input value (i.e., f(x) = 2*x).
|
||||
The programmable bootstrap is applied twice, with 2 different LUTs: the identity LUT and a specific one (currently a x2 operation).
|
||||
|
||||
## Vectors
|
||||
The following values are generated:
|
||||
|
||||
### Keys
|
||||
| name | description | TFHE-rs type |
|
||||
|------------------------|-----------------------------------------------------------------------------------------|-----------------------------|
|
||||
| `large_lwe_secret_key` | Encryption secret key, before the KS and after the PBS | `LweSecretKey<Vec<u64>>` |
|
||||
| `small_lwe_secret_key` | Secret key encrypting ciphertexts between the KS and the PBS | `LweSecretKey<Vec<u64>>` |
|
||||
| `ksk` | The keyswitching key to convert a ct from the large key to the small one | `LweKeyswitchKey<Vec<u64>>` |
|
||||
| name | description | TFHE-rs type |
|
||||
|------------------------|---------------------------------------------------------------------------------------|-----------------------------|
|
||||
| `large_lwe_secret_key` | Encryption secret key, before the KS and after the PBS | `LweSecretKey<Vec<u64>>` |
|
||||
| `small_lwe_secret_key` | Secret key encrypting ciphertexts between the KS and the PBS | `LweSecretKey<Vec<u64>>` |
|
||||
| `ksk` | The keyswitching key to convert a ct from the large key to the small one | `LweKeyswitchKey<Vec<u64>>` |
|
||||
| `bsk` | The bootstrapping key to perform a programmable bootstrap on the keyswitched ciphertext | `LweBootstrapKey<Vec<u64>>` |
|
||||
|
||||
|
||||
### Ciphertexts
|
||||
| name | description | TFHE-rs type | Cleartext |
|
||||
|----------------------|-----------------------------------------------------------------------------------------------------|----------------------------|----------------------|
|
||||
| `lwe_a` | LWE Ciphertext encrypting A | `LweCiphertext<Vec<u64>>` | `A` |
|
||||
| `lwe_b` | LWE Ciphertext encrypting B | `LweCiphertext<Vec<u64>>` | `B` |
|
||||
| `lwe_sum` | LWE Ciphertext encrypting A plus lwe encryption of B | `LweCiphertext<Vec<u64>>` | `A+B` |
|
||||
| `lwe_prod` | LWE Ciphertext encrypting A times cleartext B | `LweCiphertext<Vec<u64>>` | `A*B` |
|
||||
| `lwe_ms` | LWE Ciphertext encrypting A after a Modulus Switch from q to 2*N ([note](#non-native-encoding)) | `LweCiphertext<Vec<u64>>` | `A` |
|
||||
| `lwe_ks` | LWE Ciphertext encrypting A after a keyswitch from `large_lwe_secret_key` to `small_lwe_secret_key` | `LweCiphertext<Vec<u64>>` | `A` |
|
||||
| `glwe_after_id_br` | GLWE Ciphertext encrypting A after the application of the identity blind rotation on `lwe_ms` | `GlweCiphertext<Vec<u64>>` | rotation of id LUT |
|
||||
| `lwe_after_id_pbs` | LWE Ciphertext encrypting A after the sample extract operation on `glwe_after_id_br` | `LweCiphertext<Vec<u64>>` | `A` |
|
||||
| `glwe_after_spec_br` | GLWE Ciphertext encrypting spec(A) after the application of the spec blind rotation on `lwe_ms` | `GlweCiphertext<Vec<u64>>` | rotation of spec LUT |
|
||||
| `lwe_after_spec_pbs` | LWE Ciphertext encrypting spec(A) after the sample extract operation on `glwe_after_spec_br` | `LweCiphertext<Vec<u64>>` | `spec(A)` |
|
||||
| name | description | TFHE-rs type | Cleartext |
|
||||
|----------------------|--------------------------------------------------------------------------------------------------------------|----------------------------|--------------|
|
||||
| `lwe_a` | Lwe encryption of A | `LweCiphertext<Vec<u64>>` | `A` |
|
||||
| `lwe_b` | Lwe encryption of B | `LweCiphertext<Vec<u64>>` | `B` |
|
||||
| `lwe_sum` | Lwe encryption of A plus lwe encryption of B | `LweCiphertext<Vec<u64>>` | `A+B` |
|
||||
| `lwe_prod` | Lwe encryption of A times cleartext B | `LweCiphertext<Vec<u64>>` | `A*B` |
|
||||
| `lwe_ms` | The lwe ciphertext after the modswitch part of the PBS ([note](#non-native-encoding)) | `LweCiphertext<Vec<u64>>` | `A` |
|
||||
| `lwe_ks` | The lwe ciphertext after the keyswitch | `LweCiphertext<Vec<u64>>` | `A` |
|
||||
| `glwe_after_id_br` | The glwe returned by the application of the identity blind rotation on the mod switched ciphertexts. | `GlweCiphertext<Vec<u64>>` | rot id LUT |
|
||||
| `lwe_after_id_pbs` | The lwe returned by the application of the sample extract operation on the output of the id blind rotation | `LweCiphertext<Vec<u64>>` | `A` |
|
||||
| `glwe_after_spec_br` | The glwe returned by the application of the spec blind rotation on the mod switched ciphertexts. | `GlweCiphertext<Vec<u64>>` | rot spec LUT |
|
||||
| `lwe_after_spec_pbs` | The lwe returned by the application of the sample extract operation on the output of the spec blind rotation | `LweCiphertext<Vec<u64>>` | `spec(A)` |
|
||||
|
||||
Ciphertexts with the `_karatsuba` suffix are generated using the Karatsuba polynomial multiplication algorithm in the blind rotation, while default ciphertexts are generated using an FFT multiplication.
|
||||
This makes it easier to reproduce bit exact results.
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "tfhe-cuda-backend"
|
||||
version = "0.13.0"
|
||||
version = "0.13.2"
|
||||
edition = "2021"
|
||||
authors = ["Zama team"]
|
||||
license = "BSD-3-Clause-Clear"
|
||||
|
||||
@@ -86,7 +86,6 @@ fn main() {
|
||||
"cuda/include/integer/integer.h",
|
||||
"cuda/include/integer/rerand.h",
|
||||
"cuda/include/aes/aes.h",
|
||||
"cuda/include/trivium/trivium.h",
|
||||
"cuda/include/zk/zk.h",
|
||||
"cuda/include/keyswitch/keyswitch.h",
|
||||
"cuda/include/keyswitch/ks_enums.h",
|
||||
|
||||
@@ -29,13 +29,15 @@ template <typename Torus> struct int_aes_lut_buffers {
|
||||
allocate_gpu_memory, size_tracker);
|
||||
std::function<Torus(Torus, Torus)> and_lambda =
|
||||
[](Torus a, Torus b) -> Torus { return a & b; };
|
||||
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), this->and_lut->get_lut(0, 0),
|
||||
this->and_lut->get_degree(0), this->and_lut->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, and_lambda, allocate_gpu_memory);
|
||||
auto active_streams_and_lut = streams.active_gpu_subset(
|
||||
SBOX_MAX_AND_GATES * num_aes_inputs * sbox_parallelism,
|
||||
params.pbs_type);
|
||||
this->and_lut->generate_and_broadcast_bivariate_lut(
|
||||
active_streams_and_lut, {0}, {and_lambda}, allocate_gpu_memory);
|
||||
|
||||
this->and_lut->broadcast_lut(active_streams_and_lut);
|
||||
this->and_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
|
||||
|
||||
this->flush_lut = new int_radix_lut<Torus>(
|
||||
@@ -44,11 +46,14 @@ template <typename Torus> struct int_aes_lut_buffers {
|
||||
std::function<Torus(Torus)> flush_lambda = [](Torus x) -> Torus {
|
||||
return x & 1;
|
||||
};
|
||||
|
||||
generate_device_accumulator(
|
||||
streams.stream(0), streams.gpu_index(0), this->flush_lut->get_lut(0, 0),
|
||||
this->flush_lut->get_degree(0), this->flush_lut->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, flush_lambda, allocate_gpu_memory);
|
||||
auto active_streams_flush_lut = streams.active_gpu_subset(
|
||||
AES_STATE_BITS * num_aes_inputs, params.pbs_type);
|
||||
this->flush_lut->generate_and_broadcast_lut(
|
||||
active_streams_flush_lut, {0}, {flush_lambda}, allocate_gpu_memory);
|
||||
this->flush_lut->broadcast_lut(active_streams_flush_lut);
|
||||
this->flush_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
|
||||
|
||||
this->carry_lut = new int_radix_lut<Torus>(
|
||||
@@ -56,11 +61,14 @@ template <typename Torus> struct int_aes_lut_buffers {
|
||||
std::function<Torus(Torus)> carry_lambda = [](Torus x) -> Torus {
|
||||
return (x >> 1) & 1;
|
||||
};
|
||||
|
||||
generate_device_accumulator(
|
||||
streams.stream(0), streams.gpu_index(0), this->carry_lut->get_lut(0, 0),
|
||||
this->carry_lut->get_degree(0), this->carry_lut->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, carry_lambda, allocate_gpu_memory);
|
||||
auto active_streams_carry_lut =
|
||||
streams.active_gpu_subset(num_aes_inputs, params.pbs_type);
|
||||
this->carry_lut->generate_and_broadcast_lut(
|
||||
active_streams_carry_lut, {0}, {carry_lambda}, allocate_gpu_memory);
|
||||
this->carry_lut->broadcast_lut(active_streams_carry_lut);
|
||||
this->carry_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
|
||||
}
|
||||
|
||||
|
||||
@@ -10,11 +10,7 @@ extern std::mutex m;
|
||||
extern bool p2p_enabled;
|
||||
extern const int THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS;
|
||||
extern const int THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
|
||||
|
||||
extern "C" {
|
||||
int32_t cuda_setup_multi_gpu(int device_0_id);
|
||||
}
|
||||
|
||||
extern const int THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS_U128;
|
||||
// Define a variant type that can be either a vector or a single pointer
|
||||
template <typename Torus>
|
||||
using LweArrayVariant = std::variant<std::vector<Torus *>, Torus *>;
|
||||
@@ -42,6 +38,8 @@ get_variant_element(const std::variant<std::vector<Torus>, Torus> &variant,
|
||||
|
||||
uint32_t get_active_gpu_count(uint32_t num_inputs, uint32_t gpu_count,
|
||||
PBS_TYPE pbs_type);
|
||||
uint32_t get_active_gpu_count_u128(uint32_t num_inputs, uint32_t gpu_count,
|
||||
PBS_TYPE pbs_type);
|
||||
|
||||
int get_num_inputs_on_gpu(int total_num_inputs, int gpu_index, int gpu_count);
|
||||
|
||||
@@ -70,7 +68,7 @@ public:
|
||||
// Construct an empty set. Invalid use of an empty set should raise an error
|
||||
// right away through asserts or because of a nullptr dereference
|
||||
CudaStreams()
|
||||
: _streams(nullptr), _gpu_indexes(nullptr), _gpu_count((uint32_t)-1),
|
||||
: _streams(nullptr), _gpu_indexes(nullptr), _gpu_count(0),
|
||||
_owns_streams(false) {}
|
||||
|
||||
// Returns a subset of this set as an active subset. An active subset is one
|
||||
@@ -80,7 +78,15 @@ public:
|
||||
_streams, _gpu_indexes,
|
||||
get_active_gpu_count(num_radix_blocks, _gpu_count, pbs_type));
|
||||
}
|
||||
|
||||
// Returns a subset of this set as an active subset for pbs128. An active
|
||||
// subset is one that is temporarily used to perform some computation. For
|
||||
// pbs128, the threshold is different, because the original threshold was
|
||||
// designed for 2_2 params.
//
// The returned CudaStreams is built from this set's own `_streams` and
// `_gpu_indexes` pointers; only the GPU count differs, and it is whatever
// get_active_gpu_count_u128(num_radix_blocks, _gpu_count, pbs_type) returns.
// NOTE(review): presumably the returned subset does not own the streams, since
// it reuses this set's pointers -- confirm against the 3-argument constructor.
|
||||
CudaStreams active_gpu_subset_u128(int num_radix_blocks, PBS_TYPE pbs_type) {
|
||||
return CudaStreams(
|
||||
_streams, _gpu_indexes,
|
||||
get_active_gpu_count_u128(num_radix_blocks, _gpu_count, pbs_type));
|
||||
}
|
||||
// Returns a CudaStreams struct containing only the ith stream
|
||||
CudaStreams get_ith(int i) const {
|
||||
return CudaStreams(&_streams[i], &_gpu_indexes[i], 1);
|
||||
@@ -114,11 +120,13 @@ public:
|
||||
// streams on the same GPU
|
||||
void create_on_same_gpus(const CudaStreams &other) {
|
||||
PANIC_IF_FALSE(_streams == nullptr,
|
||||
"Assign clone to non-empty cudastreams");
|
||||
"Cuda error: Assign clone to non-empty CudaStreams");
|
||||
PANIC_IF_FALSE(_gpu_count <= 8,
|
||||
"Cuda error: GPU count should be in the interval [0, 8]");
|
||||
|
||||
cudaStream_t *new_streams = new cudaStream_t[other._gpu_count];
|
||||
|
||||
uint32_t *gpu_indexes_clone = new uint32_t[_gpu_count];
|
||||
uint32_t *gpu_indexes_clone = new uint32_t[other._gpu_count];
|
||||
for (uint32_t i = 0; i < other._gpu_count; ++i) {
|
||||
new_streams[i] = cuda_create_stream(other._gpu_indexes[i]);
|
||||
gpu_indexes_clone[i] = other._gpu_indexes[i];
|
||||
@@ -170,6 +178,7 @@ public:
|
||||
_streams = nullptr;
|
||||
delete[] _gpu_indexes;
|
||||
_gpu_indexes = nullptr;
|
||||
_gpu_count = 0;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -483,4 +492,38 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
// Event pool for managing temporary CUDA events in scatter/gather operations
//
// Each event handed out by request_event() is recorded together with the GPU
// index it was created for. release() destroys every recorded event; the
// destructor only asserts that release() has already emptied the pool.
|
||||
struct CudaEventPool {
|
||||
private:
|
||||
// Events created through request_event(), in creation order.
std::vector<cudaEvent_t> _events;
|
||||
// _gpu_indices[i] is the GPU index that was passed when _events[i] was
// created; the two vectors are always pushed to and cleared together.
std::vector<uint32_t> _gpu_indices;
|
||||
|
||||
public:
|
||||
// Starts with no events.
CudaEventPool() {}
|
||||
|
||||
// Requests a new event from the pool (creates and stores it)
// A fresh event is created via cuda_create_event(gpu_index) on every call;
// previously created events are not handed out again.
|
||||
cudaEvent_t request_event(uint32_t gpu_index) {
|
||||
cudaEvent_t event = cuda_create_event(gpu_index);
|
||||
_events.push_back(event);
|
||||
_gpu_indices.push_back(gpu_index);
|
||||
return event;
|
||||
}
|
||||
|
||||
// Releases all pooled events
|
||||
// This should always be called in the release of the LUT, so streams
|
||||
// are already synchronized
// Each event is destroyed with the GPU index recorded for it, then both
// vectors are cleared, so calling release() again finds nothing to destroy.
|
||||
void release() {
|
||||
for (size_t i = 0; i < _events.size(); i++) {
|
||||
cuda_event_destroy(_events[i], _gpu_indices[i]);
|
||||
}
|
||||
_events.clear();
|
||||
_gpu_indices.clear();
|
||||
}
|
||||
|
||||
// The destructor does not destroy events itself: it only checks that
// release() was called beforehand (the pool must be empty).
// NOTE(review): what happens when GPU_ASSERT fails, and in which build
// configurations it is active, is defined elsewhere -- confirm.
~CudaEventPool() {
|
||||
GPU_ASSERT(_events.empty(),
|
||||
"CudaEventPool: must call release before destruction");
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
@@ -65,8 +65,14 @@ template <typename Torus> struct boolean_bitop_buffer {
|
||||
return x % params.message_modulus;
|
||||
};
|
||||
|
||||
message_extract_lut->generate_and_broadcast_lut(
|
||||
active_streams, {0}, {lut_f_message_extract}, gpu_memory_allocated);
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
message_extract_lut->get_lut(0, 0),
|
||||
message_extract_lut->get_degree(0),
|
||||
message_extract_lut->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
lut_f_message_extract, gpu_memory_allocated);
|
||||
message_extract_lut->broadcast_lut(active_streams);
|
||||
}
|
||||
tmp_lwe_left = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
@@ -136,8 +142,12 @@ template <typename Torus> struct int_bitop_buffer {
|
||||
}
|
||||
};
|
||||
|
||||
lut->generate_and_broadcast_bivariate_lut(
|
||||
active_streams, {0}, {lut_bivariate_f}, gpu_memory_allocated);
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
|
||||
lut->get_degree(0), lut->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, lut_bivariate_f, gpu_memory_allocated);
|
||||
lut->broadcast_lut(active_streams);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
@@ -146,8 +156,6 @@ template <typename Torus> struct int_bitop_buffer {
|
||||
num_radix_blocks, allocate_gpu_memory,
|
||||
size_tracker);
|
||||
|
||||
std::vector<std::function<Torus(Torus)>> lut_funcs;
|
||||
std::vector<uint32_t> lut_indices;
|
||||
for (int i = 0; i < params.message_modulus; i++) {
|
||||
auto rhs = i;
|
||||
|
||||
@@ -163,13 +171,14 @@ template <typename Torus> struct int_bitop_buffer {
|
||||
return x ^ rhs;
|
||||
}
|
||||
};
|
||||
|
||||
lut_funcs.push_back(lut_univariate_scalar_f);
|
||||
lut_indices.push_back(i);
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, i),
|
||||
lut->get_degree(i), lut->get_max_degree(i), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, lut_univariate_scalar_f,
|
||||
gpu_memory_allocated);
|
||||
lut->broadcast_lut(active_streams);
|
||||
}
|
||||
|
||||
lut->generate_and_broadcast_lut(active_streams, lut_indices, lut_funcs,
|
||||
gpu_memory_allocated);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -202,11 +211,16 @@ template <typename Torus> struct boolean_bitnot_buffer {
|
||||
return x % message_modulus;
|
||||
};
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
message_extract_lut->get_lut(0, 0),
|
||||
message_extract_lut->get_degree(0),
|
||||
message_extract_lut->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
lut_f_message_extract, gpu_memory_allocated);
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(lwe_ciphertext_count, params.pbs_type);
|
||||
|
||||
message_extract_lut->generate_and_broadcast_lut(
|
||||
active_streams, {0}, {lut_f_message_extract}, gpu_memory_allocated);
|
||||
message_extract_lut->broadcast_lut(active_streams);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -28,17 +28,21 @@ template <typename Torus> struct int_extend_radix_with_sign_msb_buffer {
|
||||
uint32_t bits_per_block = std::log2(params.message_modulus);
|
||||
uint32_t msg_modulus = params.message_modulus;
|
||||
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
|
||||
|
||||
lut->generate_and_broadcast_lut(
|
||||
active_streams, {0}, {[msg_modulus, bits_per_block](Torus x) {
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
|
||||
lut->get_degree(0), lut->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
[msg_modulus, bits_per_block](Torus x) {
|
||||
const auto xm = x % msg_modulus;
|
||||
const auto sign_bit = (xm >> (bits_per_block - 1)) & 1;
|
||||
return (Torus)((msg_modulus - 1) * sign_bit);
|
||||
}},
|
||||
},
|
||||
allocate_gpu_memory);
|
||||
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
|
||||
lut->broadcast_lut(active_streams);
|
||||
|
||||
this->last_block = new CudaRadixCiphertextFFI;
|
||||
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
|
||||
@@ -85,6 +85,24 @@ template <typename Torus> struct int_cmux_buffer {
|
||||
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), predicate_lut->get_lut(0, 0),
|
||||
predicate_lut->get_degree(0), predicate_lut->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, inverted_lut_f, gpu_memory_allocated);
|
||||
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), predicate_lut->get_lut(0, 1),
|
||||
predicate_lut->get_degree(1), predicate_lut->get_max_degree(1),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, lut_f, gpu_memory_allocated);
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
message_extract_lut->get_lut(0, 0), message_extract_lut->get_degree(0),
|
||||
message_extract_lut->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
message_extract_lut_f, gpu_memory_allocated);
|
||||
Torus *h_lut_indexes = predicate_lut->h_lut_indexes;
|
||||
for (int index = 0; index < 2 * num_radix_blocks; index++) {
|
||||
if (index < num_radix_blocks) {
|
||||
@@ -97,18 +115,12 @@ template <typename Torus> struct int_cmux_buffer {
|
||||
predicate_lut->get_lut_indexes(0, 0), h_lut_indexes,
|
||||
2 * num_radix_blocks * sizeof(Torus), streams.stream(0),
|
||||
streams.gpu_index(0), allocate_gpu_memory);
|
||||
|
||||
auto active_streams_pred =
|
||||
streams.active_gpu_subset(2 * num_radix_blocks, params.pbs_type);
|
||||
predicate_lut->generate_and_broadcast_bivariate_lut(
|
||||
active_streams_pred, {0, 1}, {inverted_lut_f, lut_f},
|
||||
gpu_memory_allocated);
|
||||
|
||||
predicate_lut->broadcast_lut(active_streams_pred);
|
||||
auto active_streams_msg =
|
||||
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
|
||||
|
||||
message_extract_lut->generate_and_broadcast_lut(
|
||||
active_streams_msg, {0}, {message_extract_lut_f}, gpu_memory_allocated);
|
||||
message_extract_lut->broadcast_lut(active_streams_msg);
|
||||
}
|
||||
|
||||
void release(CudaStreams streams) {
|
||||
|
||||
@@ -28,7 +28,7 @@ template <typename Torus> struct int_are_all_block_true_buffer {
|
||||
Torus total_modulus = params.message_modulus * params.carry_modulus;
|
||||
uint32_t max_value = (total_modulus - 1) / (params.message_modulus - 1);
|
||||
|
||||
int max_chunks = (num_radix_blocks + max_value - 1) / max_value;
|
||||
int max_chunks = CEIL_DIV(num_radix_blocks, max_value);
|
||||
tmp_out = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), tmp_out, num_radix_blocks,
|
||||
@@ -39,21 +39,22 @@ template <typename Torus> struct int_are_all_block_true_buffer {
|
||||
max_chunks, params.big_lwe_dimension, size_tracker,
|
||||
allocate_gpu_memory);
|
||||
|
||||
preallocated_h_lut = (Torus *)malloc(
|
||||
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus));
|
||||
|
||||
is_max_value = new int_radix_lut<Torus>(streams, params, 2, max_chunks,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(max_chunks, params.pbs_type);
|
||||
|
||||
auto is_max_value_f = [max_value](Torus x) -> Torus {
|
||||
return x == max_value;
|
||||
};
|
||||
preallocated_h_lut = (Torus *)malloc(
|
||||
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus));
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), is_max_value->get_lut(0, 0),
|
||||
is_max_value->get_degree(0), is_max_value->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, is_max_value_f, gpu_memory_allocated);
|
||||
|
||||
is_max_value->generate_and_broadcast_lut(
|
||||
active_streams, {0}, {is_max_value_f}, gpu_memory_allocated);
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(max_chunks, params.pbs_type);
|
||||
is_max_value->broadcast_lut(active_streams);
|
||||
}
|
||||
|
||||
void release(CudaStreams streams) {
|
||||
@@ -102,10 +103,15 @@ template <typename Torus> struct int_comparison_eq_buffer {
|
||||
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), is_non_zero_lut->get_lut(0, 0),
|
||||
is_non_zero_lut->get_degree(0), is_non_zero_lut->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, is_non_zero_lut_f, gpu_memory_allocated);
|
||||
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
|
||||
is_non_zero_lut->generate_and_broadcast_lut(
|
||||
active_streams, {0}, {is_non_zero_lut_f}, gpu_memory_allocated);
|
||||
is_non_zero_lut->broadcast_lut(active_streams);
|
||||
|
||||
// Scalar may have up to num_radix_blocks blocks
|
||||
scalar_comparison_luts = new int_radix_lut<Torus>(
|
||||
@@ -123,28 +129,32 @@ template <typename Torus> struct int_comparison_eq_buffer {
|
||||
return (lhs == rhs);
|
||||
}
|
||||
};
|
||||
|
||||
std::vector<std::function<Torus(Torus)>> lut_funcs;
|
||||
std::vector<uint32_t> lut_indices;
|
||||
for (int i = 0; i < total_modulus; i++) {
|
||||
auto lut_f = [i, operator_f](Torus x) -> Torus {
|
||||
return operator_f(i, x);
|
||||
};
|
||||
lut_funcs.push_back(lut_f);
|
||||
lut_indices.push_back(i);
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
scalar_comparison_luts->get_lut(0, i),
|
||||
scalar_comparison_luts->get_degree(i),
|
||||
scalar_comparison_luts->get_max_degree(i), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
lut_f, gpu_memory_allocated);
|
||||
}
|
||||
|
||||
scalar_comparison_luts->generate_and_broadcast_lut(
|
||||
active_streams, lut_indices, lut_funcs, gpu_memory_allocated);
|
||||
|
||||
scalar_comparison_luts->broadcast_lut(active_streams);
|
||||
if (op == COMPARISON_TYPE::EQ || op == COMPARISON_TYPE::NE) {
|
||||
operator_lut =
|
||||
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
|
||||
operator_lut->generate_and_broadcast_bivariate_lut(
|
||||
active_streams, {0}, {operator_f}, gpu_memory_allocated);
|
||||
// operator_lut->broadcast_lut(active_streams);
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), operator_lut->get_lut(0, 0),
|
||||
operator_lut->get_degree(0), operator_lut->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, operator_f, gpu_memory_allocated);
|
||||
|
||||
operator_lut->broadcast_lut(active_streams);
|
||||
} else {
|
||||
operator_lut = nullptr;
|
||||
}
|
||||
@@ -211,6 +221,9 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
|
||||
streams.stream(0), streams.gpu_index(0), tmp_y, num_radix_blocks,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
// LUTs
|
||||
tree_inner_leaf_lut =
|
||||
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
|
||||
tree_last_leaf_lut = new int_radix_lut<Torus>(
|
||||
streams, params, 1, 1, allocate_gpu_memory, size_tracker);
|
||||
@@ -221,14 +234,15 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
|
||||
tree_last_leaf_scalar_lut = new int_radix_lut<Torus>(
|
||||
streams, params, 1, 1, allocate_gpu_memory, size_tracker);
|
||||
|
||||
tree_inner_leaf_lut =
|
||||
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
tree_inner_leaf_lut->get_lut(0, 0), tree_inner_leaf_lut->get_degree(0),
|
||||
tree_inner_leaf_lut->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
block_selector_f, gpu_memory_allocated);
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
|
||||
tree_inner_leaf_lut->generate_and_broadcast_bivariate_lut(
|
||||
active_streams, {0}, {block_selector_f}, allocate_gpu_memory);
|
||||
tree_inner_leaf_lut->broadcast_lut(active_streams);
|
||||
}
|
||||
|
||||
void release(CudaStreams streams) {
|
||||
@@ -412,8 +426,12 @@ template <typename Torus> struct int_comparison_buffer {
|
||||
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
|
||||
identity_lut->generate_and_broadcast_lut(
|
||||
active_streams, {0}, {identity_lut_f}, gpu_memory_allocated);
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), identity_lut->get_lut(0, 0),
|
||||
identity_lut->get_degree(0), identity_lut->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, identity_lut_f, gpu_memory_allocated);
|
||||
identity_lut->broadcast_lut(active_streams);
|
||||
|
||||
uint32_t total_modulus = params.message_modulus * params.carry_modulus;
|
||||
auto is_zero_f = [total_modulus](Torus x) -> Torus {
|
||||
@@ -423,8 +441,13 @@ template <typename Torus> struct int_comparison_buffer {
|
||||
is_zero_lut = new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
|
||||
is_zero_lut->generate_and_broadcast_lut(active_streams, {0}, {is_zero_f},
|
||||
gpu_memory_allocated);
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), is_zero_lut->get_lut(0, 0),
|
||||
is_zero_lut->get_degree(0), is_zero_lut->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, is_zero_f, gpu_memory_allocated);
|
||||
|
||||
is_zero_lut->broadcast_lut(active_streams);
|
||||
|
||||
switch (op) {
|
||||
case COMPARISON_TYPE::MAX:
|
||||
@@ -499,9 +522,13 @@ template <typename Torus> struct int_comparison_buffer {
|
||||
PANIC("Cuda error: sign_lut creation failed due to wrong function.")
|
||||
};
|
||||
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), signed_lut->get_lut(0, 0),
|
||||
signed_lut->get_degree(0), signed_lut->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, signed_lut_f, gpu_memory_allocated);
|
||||
auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
|
||||
signed_lut->generate_and_broadcast_bivariate_lut(
|
||||
active_streams, {0}, {signed_lut_f}, gpu_memory_allocated);
|
||||
signed_lut->broadcast_lut(active_streams);
|
||||
}
|
||||
preallocated_h_lut = (Torus *)malloc(
|
||||
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus));
|
||||
|
||||
@@ -11,16 +11,26 @@ template <typename Torus> struct int_compression {
|
||||
Torus *tmp_glwe_array_out;
|
||||
bool gpu_memory_allocated;
|
||||
uint32_t lwe_per_glwe;
|
||||
uint32_t max_num_glwes;
|
||||
|
||||
// num_radix_blocks: total number of LWE ciphertexts (radix blocks) to
|
||||
// compress lwe_per_glwe: max LWEs packed per GLWE (= polynomial_size),
|
||||
// defined by the chosen parameter set
|
||||
int_compression(CudaStreams streams, int_radix_params compression_params,
|
||||
uint32_t num_radix_blocks, uint32_t lwe_per_glwe,
|
||||
bool allocate_gpu_memory, uint64_t &size_tracker) {
|
||||
gpu_memory_allocated = allocate_gpu_memory;
|
||||
this->compression_params = compression_params;
|
||||
this->lwe_per_glwe = lwe_per_glwe;
|
||||
|
||||
uint64_t glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
|
||||
compression_params.polynomial_size;
|
||||
|
||||
// Calculate the actual number of GLWEs needed based on total radix blocks.
|
||||
// This ensures we allocate enough memory when num_radix_blocks >
|
||||
// lwe_per_glwe.
|
||||
max_num_glwes = CEIL_DIV(num_radix_blocks, lwe_per_glwe);
|
||||
|
||||
tmp_lwe = static_cast<Torus *>(cuda_malloc_with_size_tracking_async(
|
||||
num_radix_blocks * (compression_params.small_lwe_dimension + 1) *
|
||||
sizeof(Torus),
|
||||
@@ -28,7 +38,7 @@ template <typename Torus> struct int_compression {
|
||||
allocate_gpu_memory));
|
||||
tmp_glwe_array_out =
|
||||
static_cast<Torus *>(cuda_malloc_with_size_tracking_async(
|
||||
lwe_per_glwe * glwe_accumulator_size * sizeof(Torus),
|
||||
max_num_glwes * glwe_accumulator_size * sizeof(Torus),
|
||||
streams.stream(0), streams.gpu_index(0), size_tracker,
|
||||
allocate_gpu_memory));
|
||||
|
||||
|
||||
@@ -283,9 +283,12 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
|
||||
zero_out_if_not_1_lut_2};
|
||||
size_t lut_gpu_indexes[2] = {0, 3};
|
||||
for (int j = 0; j < 2; j++) {
|
||||
luts[j]->generate_and_broadcast_lut(streams.get_ith(lut_gpu_indexes[j]),
|
||||
{0}, {zero_out_if_not_1_lut_f},
|
||||
gpu_memory_allocated);
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(lut_gpu_indexes[j]),
|
||||
streams.gpu_index(lut_gpu_indexes[j]), luts[j]->get_lut(0, 0),
|
||||
luts[j]->get_degree(0), luts[j]->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, zero_out_if_not_1_lut_f, gpu_memory_allocated);
|
||||
}
|
||||
|
||||
luts[0] = zero_out_if_not_2_lut_1;
|
||||
@@ -293,9 +296,12 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
|
||||
lut_gpu_indexes[0] = 1;
|
||||
lut_gpu_indexes[1] = 2;
|
||||
for (int j = 0; j < 2; j++) {
|
||||
luts[j]->generate_and_broadcast_lut(streams.get_ith(lut_gpu_indexes[j]),
|
||||
{0}, {zero_out_if_not_2_lut_f},
|
||||
gpu_memory_allocated);
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(lut_gpu_indexes[j]),
|
||||
streams.gpu_index(lut_gpu_indexes[j]), luts[j]->get_lut(0, 0),
|
||||
luts[j]->get_degree(0), luts[j]->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, zero_out_if_not_2_lut_f, gpu_memory_allocated);
|
||||
}
|
||||
|
||||
quotient_lut_1 =
|
||||
@@ -315,12 +321,21 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
|
||||
};
|
||||
auto quotient_lut_3_f = [](Torus cond) -> Torus { return cond * 3; };
|
||||
|
||||
quotient_lut_1->generate_and_broadcast_lut(
|
||||
streams.get_ith(2), {0}, {quotient_lut_1_f}, gpu_memory_allocated);
|
||||
quotient_lut_2->generate_and_broadcast_lut(
|
||||
streams.get_ith(1), {0}, {quotient_lut_2_f}, gpu_memory_allocated);
|
||||
quotient_lut_3->generate_and_broadcast_lut(
|
||||
streams.get_ith(0), {0}, {quotient_lut_3_f}, gpu_memory_allocated);
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(2), streams.gpu_index(2), quotient_lut_1->get_lut(0, 0),
|
||||
quotient_lut_1->get_degree(0), quotient_lut_1->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, quotient_lut_1_f, gpu_memory_allocated);
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(1), streams.gpu_index(1), quotient_lut_2->get_lut(0, 0),
|
||||
quotient_lut_2->get_degree(0), quotient_lut_2->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, quotient_lut_2_f, gpu_memory_allocated);
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), quotient_lut_3->get_lut(0, 0),
|
||||
quotient_lut_3->get_degree(0), quotient_lut_3->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, quotient_lut_3_f, gpu_memory_allocated);
|
||||
|
||||
message_extract_lut_1 = new int_radix_lut<Torus>(
|
||||
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
|
||||
@@ -335,12 +350,15 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
|
||||
luts[0] = message_extract_lut_1;
|
||||
luts[1] = message_extract_lut_2;
|
||||
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(num_blocks, params.pbs_type);
|
||||
|
||||
for (int j = 0; j < 2; j++) {
|
||||
luts[j]->generate_and_broadcast_lut(
|
||||
active_streams, {0}, {lut_f_message_extract}, gpu_memory_allocated);
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), luts[j]->get_lut(0, 0),
|
||||
luts[j]->get_degree(0), luts[j]->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, lut_f_message_extract, gpu_memory_allocated);
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(num_blocks, params.pbs_type);
|
||||
luts[j]->broadcast_lut(active_streams);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -989,14 +1007,24 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
|
||||
masking_luts_2[i] = new int_radix_lut<Torus>(
|
||||
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
masking_luts_1[i]->get_lut(0, 0), masking_luts_1[i]->get_degree(0),
|
||||
masking_luts_1[i]->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
lut_f_masking, gpu_memory_allocated);
|
||||
auto active_streams_1 = streams.active_gpu_subset(1, params.pbs_type);
|
||||
masking_luts_1[i]->generate_and_broadcast_lut(
|
||||
active_streams_1, {0}, {lut_f_masking}, gpu_memory_allocated);
|
||||
masking_luts_1[i]->broadcast_lut(active_streams_1);
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
masking_luts_2[i]->get_lut(0, 0), masking_luts_2[i]->get_degree(0),
|
||||
masking_luts_2[i]->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
lut_f_masking, gpu_memory_allocated);
|
||||
auto active_streams_2 =
|
||||
streams.active_gpu_subset(num_blocks, params.pbs_type);
|
||||
masking_luts_2[i]->generate_and_broadcast_lut(
|
||||
active_streams_2, {0}, {lut_f_masking}, gpu_memory_allocated);
|
||||
masking_luts_2[i]->broadcast_lut(active_streams_2);
|
||||
}
|
||||
|
||||
// create and generate message_extract_lut_1 and message_extract_lut_2
|
||||
@@ -1014,12 +1042,15 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
|
||||
|
||||
int_radix_lut<Torus> *luts[2] = {message_extract_lut_1,
|
||||
message_extract_lut_2};
|
||||
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(num_blocks, params.pbs_type);
|
||||
for (int j = 0; j < 2; j++) {
|
||||
luts[j]->generate_and_broadcast_lut(
|
||||
active_streams, {0}, {lut_f_message_extract}, gpu_memory_allocated);
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), luts[j]->get_lut(0, 0),
|
||||
luts[j]->get_degree(0), luts[j]->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, lut_f_message_extract, gpu_memory_allocated);
|
||||
luts[j]->broadcast_lut(active_streams);
|
||||
}
|
||||
|
||||
// Give name to closures to improve readability
|
||||
@@ -1110,8 +1141,14 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
|
||||
merge_overflow_flags_luts[i] = new int_radix_lut<Torus>(
|
||||
streams, params, 1, 1, allocate_gpu_memory, size_tracker);
|
||||
|
||||
merge_overflow_flags_luts[i]->generate_and_broadcast_bivariate_lut(
|
||||
active_gpu_count_for_bits, {0}, {lut_f_bit}, gpu_memory_allocated);
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
merge_overflow_flags_luts[i]->get_lut(0, 0),
|
||||
merge_overflow_flags_luts[i]->get_degree(0),
|
||||
merge_overflow_flags_luts[i]->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, lut_f_bit, gpu_memory_allocated);
|
||||
merge_overflow_flags_luts[i]->broadcast_lut(active_gpu_count_for_bits);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1520,12 +1557,16 @@ template <typename Torus> struct int_div_rem_memory {
|
||||
compare_signed_bits_lut = new int_radix_lut<Torus>(
|
||||
streams, params, 1, 1, allocate_gpu_memory, size_tracker);
|
||||
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
compare_signed_bits_lut->get_lut(0, 0),
|
||||
compare_signed_bits_lut->get_degree(0),
|
||||
compare_signed_bits_lut->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
f_compare_extracted_signed_bits, gpu_memory_allocated);
|
||||
auto active_gpu_count_cmp =
|
||||
streams.active_gpu_subset(1, params.pbs_type); // only 1 block needed
|
||||
|
||||
compare_signed_bits_lut->generate_and_broadcast_bivariate_lut(
|
||||
active_gpu_count_cmp, {0}, {f_compare_extracted_signed_bits},
|
||||
gpu_memory_allocated);
|
||||
compare_signed_bits_lut->broadcast_lut(active_gpu_count_cmp);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -53,8 +53,13 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
|
||||
return count;
|
||||
};
|
||||
|
||||
univ_lut_mem->generate_and_broadcast_lut(
|
||||
active_streams, {0}, {generate_uni_lut_lambda}, allocate_gpu_memory);
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), univ_lut_mem->get_lut(0, 0),
|
||||
univ_lut_mem->get_degree(0), univ_lut_mem->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, generate_uni_lut_lambda, allocate_gpu_memory);
|
||||
|
||||
univ_lut_mem->broadcast_lut(active_streams);
|
||||
|
||||
auto generate_bi_lut_lambda =
|
||||
[num_bits](Torus block_num_bit_count,
|
||||
@@ -65,8 +70,13 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
|
||||
return 0;
|
||||
};
|
||||
|
||||
biv_lut_mem->generate_and_broadcast_bivariate_lut(
|
||||
active_streams, {0}, {generate_bi_lut_lambda}, allocate_gpu_memory);
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), biv_lut_mem->get_lut(0, 0),
|
||||
biv_lut_mem->get_degree(0), biv_lut_mem->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, generate_bi_lut_lambda, allocate_gpu_memory);
|
||||
|
||||
biv_lut_mem->broadcast_lut(active_streams);
|
||||
|
||||
this->tmp_ct = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
@@ -222,7 +232,7 @@ template <typename Torus> struct int_ilog2_buffer {
|
||||
this->sum_output_not_propagated, counter_num_blocks,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
lut_message_not =
|
||||
this->lut_message_not =
|
||||
new int_radix_lut<Torus>(streams, params, 1, counter_num_blocks,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
std::function<Torus(Torus)> lut_message_lambda =
|
||||
@@ -230,11 +240,16 @@ template <typename Torus> struct int_ilog2_buffer {
|
||||
uint64_t message = x % this->params.message_modulus;
|
||||
return (~message) % this->params.message_modulus;
|
||||
};
|
||||
|
||||
generate_device_accumulator(streams.stream(0), streams.gpu_index(0),
|
||||
this->lut_message_not->get_lut(0, 0),
|
||||
this->lut_message_not->get_degree(0),
|
||||
this->lut_message_not->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size,
|
||||
params.message_modulus, params.carry_modulus,
|
||||
lut_message_lambda, allocate_gpu_memory);
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(counter_num_blocks, params.pbs_type);
|
||||
lut_message_not->generate_and_broadcast_lut(
|
||||
active_streams, {0}, {lut_message_lambda}, allocate_gpu_memory);
|
||||
lut_message_not->broadcast_lut(active_streams);
|
||||
|
||||
this->lut_carry_not =
|
||||
new int_radix_lut<Torus>(streams, params, 1, counter_num_blocks,
|
||||
@@ -244,8 +259,13 @@ template <typename Torus> struct int_ilog2_buffer {
|
||||
uint64_t carry = x / this->params.message_modulus;
|
||||
return (~carry) % this->params.message_modulus;
|
||||
};
|
||||
lut_carry_not->generate_and_broadcast_lut(
|
||||
active_streams, {0}, {lut_carry_lambda}, allocate_gpu_memory);
|
||||
generate_device_accumulator(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
this->lut_carry_not->get_lut(0, 0), this->lut_carry_not->get_degree(0),
|
||||
this->lut_carry_not->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
lut_carry_lambda, allocate_gpu_memory);
|
||||
lut_carry_not->broadcast_lut(active_streams);
|
||||
|
||||
this->message_blocks_not = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
|
||||
@@ -9,7 +9,6 @@
|
||||
#include "utils/helper_multi_gpu.cuh"
|
||||
#include <cmath>
|
||||
#include <functional>
|
||||
#include <map>
|
||||
#include <queue>
|
||||
|
||||
#include <stdio.h>
|
||||
@@ -35,8 +34,10 @@ public:
|
||||
} else if ((msg_mod) == 4 && (carry_mod) == 4) { \
|
||||
constexpr int max_noise_level = 5; \
|
||||
if ((noise_level_expr) > max_noise_level) \
|
||||
PANIC("Cuda error: noise exceeds maximum authorized value for 2_2 " \
|
||||
"parameters"); \
|
||||
PANIC( \
|
||||
"Cuda error: noise %d exceeds maximum authorized value 5 for 2_2" \
|
||||
" parameters", \
|
||||
noise_level_expr); \
|
||||
} else if ((msg_mod) == 8 && (carry_mod) == 8) { \
|
||||
constexpr int max_noise_level = 9; \
|
||||
if ((noise_level_expr) > max_noise_level) \
|
||||
@@ -350,6 +351,7 @@ struct int_radix_lut_custom_input_output {
|
||||
|
||||
CudaStreamsBarrier multi_gpu_scatter_barrier, multi_gpu_broadcast_barrier;
|
||||
CudaStreamsBarrier multi_gpu_gather_barrier;
|
||||
CudaEventPool event_pool;
|
||||
|
||||
// Setup the LUT configuration:
|
||||
// input_big_lwe_dimension: BIG LWE dimension of the KS output / PBS input
|
||||
@@ -372,8 +374,13 @@ struct int_radix_lut_custom_input_output {
|
||||
this->num_input_blocks = num_input_blocks;
|
||||
this->gpu_memory_allocated = allocate_gpu_memory;
|
||||
|
||||
this->active_streams =
|
||||
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
|
||||
if (sizeof(OutputTorus) == 16) {
|
||||
this->active_streams =
|
||||
streams.active_gpu_subset_u128(num_radix_blocks, params.pbs_type);
|
||||
} else {
|
||||
this->active_streams =
|
||||
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
|
||||
}
|
||||
}
|
||||
|
||||
void setup_degrees() {
|
||||
@@ -385,9 +392,13 @@ struct int_radix_lut_custom_input_output {
|
||||
void allocate_pbs_buffers(int_radix_params params, uint32_t num_radix_blocks,
|
||||
bool allocate_gpu_memory, uint64_t &size_tracker) {
|
||||
|
||||
int classical_threshold =
|
||||
sizeof(OutputTorus) == 16
|
||||
? THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS_U128
|
||||
: THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
|
||||
int threshold = (params.pbs_type == PBS_TYPE::MULTI_BIT)
|
||||
? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
|
||||
: THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
|
||||
: classical_threshold;
|
||||
|
||||
for (uint i = 0; i < active_streams.count(); i++) {
|
||||
cuda_set_device(active_streams.gpu_index(i));
|
||||
@@ -459,11 +470,14 @@ struct int_radix_lut_custom_input_output {
|
||||
lwe_trivial_indexes, num_radix_blocks,
|
||||
allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void setup_gemm_batch_ks_temp_buffers(uint64_t &size_tracker) {
|
||||
int classical_threshold =
|
||||
sizeof(OutputTorus) == 16
|
||||
? THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS_U128
|
||||
: THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
|
||||
int threshold = (params.pbs_type == PBS_TYPE::MULTI_BIT)
|
||||
? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
|
||||
: THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
|
||||
: classical_threshold;
|
||||
|
||||
auto inputs_on_gpu = std::min(
|
||||
(int)num_input_blocks,
|
||||
@@ -810,10 +824,13 @@ struct int_radix_lut_custom_input_output {
|
||||
void allocate_lwe_vector_for_non_trivial_indexes(
|
||||
CudaStreams streams, uint64_t max_num_radix_blocks,
|
||||
uint64_t &size_tracker, bool allocate_gpu_memory) {
|
||||
|
||||
int classical_threshold =
|
||||
sizeof(OutputTorus) == 16
|
||||
? THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS_U128
|
||||
: THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
|
||||
int threshold = (params.pbs_type == PBS_TYPE::MULTI_BIT)
|
||||
? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
|
||||
: THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
|
||||
: classical_threshold;
|
||||
|
||||
// We need to create the auxiliary array only in GPU 0
|
||||
if (active_streams.count() > 1) {
|
||||
@@ -836,56 +853,6 @@ struct int_radix_lut_custom_input_output {
|
||||
}
|
||||
}
|
||||
|
||||
void generate_and_broadcast_lut(
|
||||
const CudaStreams &streams, std::vector<uint32_t> lut_indexes,
|
||||
std::vector<std::function<OutputTorus(OutputTorus)>> f,
|
||||
bool gpu_memory_allocated) {
|
||||
// streams should be a subset of active_streams
|
||||
|
||||
for (uint32_t i = 0; i < lut_indexes.size(); ++i) {
|
||||
generate_device_accumulator<OutputTorus>(
|
||||
streams.stream(0), streams.gpu_index(0), get_lut(0, lut_indexes[i]),
|
||||
get_degree(lut_indexes[i]), get_max_degree(lut_indexes[i]),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, f[i], gpu_memory_allocated);
|
||||
}
|
||||
//broadcast_lut(streams);
|
||||
}
|
||||
|
||||
void generate_and_broadcast_bivariate_lut(
|
||||
const CudaStreams &streams, std::vector<uint32_t> lut_indexes,
|
||||
std::vector<std::function<OutputTorus(OutputTorus, OutputTorus)>> f,
|
||||
bool gpu_memory_allocated) {
|
||||
// streams should be a subset of active_streams
|
||||
|
||||
/* for (int fidx = 0; fidx < f.size(); ++fidx) {
|
||||
__int128_t f_hash = 0;
|
||||
uint32_t bits_per_lut_val = 5;
|
||||
uint32_t input_modulus_sup =
|
||||
params.message_modulus * params.carry_modulus;
|
||||
for (uint32_t i = 0; i < input_modulus_sup; ++i) {
|
||||
OutputTorus f_eval =
|
||||
f[fidx](i / params.message_modulus, i % params.message_modulus);
|
||||
GPU_ASSERT(f_eval < (1 << bits_per_lut_val),
|
||||
"LUT value expected bitwidth overflow");
|
||||
f_hash |= f_eval;
|
||||
f_hash <<= bits_per_lut_val;
|
||||
}
|
||||
printf("%016llX%016llX\n",
|
||||
(unsigned long long)((f_hash >> 64) & 0xFFFFFFFFFFFFFFFF),
|
||||
(unsigned long long)(f_hash & 0xFFFFFFFFFFFFFFFF));
|
||||
}
|
||||
*/
|
||||
for (uint32_t i = 0; i < lut_indexes.size(); ++i) {
|
||||
generate_device_accumulator_bivariate<InputTorus>(
|
||||
streams.stream(0), streams.gpu_index(0), get_lut(0, lut_indexes[i]),
|
||||
get_degree(lut_indexes[i]), get_max_degree(lut_indexes[i]),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, f[i], gpu_memory_allocated);
|
||||
}
|
||||
//broadcast_lut(streams);
|
||||
}
|
||||
|
||||
void release(CudaStreams streams) {
|
||||
PANIC_IF_FALSE(lut_indexes_vec.size() == lut_vec.size(),
|
||||
"Lut vec and Lut vec indexes must have the same size");
|
||||
@@ -916,6 +883,7 @@ struct int_radix_lut_custom_input_output {
|
||||
|
||||
if (active_streams.count() > 1) {
|
||||
active_streams.synchronize();
|
||||
event_pool.release();
|
||||
multi_gpu_gather_barrier.release();
|
||||
multi_gpu_broadcast_barrier.release();
|
||||
multi_gpu_scatter_barrier.release();
|
||||
@@ -1036,15 +1004,18 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
|
||||
bits_per_block * num_radix_blocks,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
|
||||
std::vector<std::function<Torus(Torus)>> lut_funs;
|
||||
std::vector<uint32_t> lut_indices;
|
||||
for (int i = 0; i < bits_per_block; i++) {
|
||||
|
||||
auto operator_f = [i, final_offset](Torus x) -> Torus {
|
||||
Torus y = (x >> i) & 1;
|
||||
return y << final_offset;
|
||||
};
|
||||
lut_funs.push_back(operator_f);
|
||||
lut_indices.push_back(i);
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, i),
|
||||
lut->get_degree(i), lut->get_max_degree(i), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
operator_f, gpu_memory_allocated);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -1063,10 +1034,7 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
|
||||
|
||||
auto active_streams = streams.active_gpu_subset(
|
||||
bits_per_block * num_radix_blocks, params.pbs_type);
|
||||
|
||||
lut->generate_and_broadcast_lut(active_streams, lut_indices, lut_funs,
|
||||
gpu_memory_allocated);
|
||||
// lut->broadcast_lut(active_streams);
|
||||
lut->broadcast_lut(active_streams);
|
||||
|
||||
/**
|
||||
* the input indexes should take the first bits_per_block PBS to target
|
||||
@@ -1142,6 +1110,24 @@ template <typename Torus> struct int_fullprop_buffer {
|
||||
};
|
||||
|
||||
//
|
||||
Torus *lut_buffer_message = lut->get_lut(0, 0);
|
||||
uint64_t *message_degree = lut->get_degree(0);
|
||||
uint64_t *message_max_degree = lut->get_max_degree(0);
|
||||
Torus *lut_buffer_carry = lut->get_lut(0, 1);
|
||||
uint64_t *carry_degree = lut->get_degree(1);
|
||||
uint64_t *carry_max_degree = lut->get_max_degree(1);
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), lut_buffer_message,
|
||||
message_degree, message_max_degree, params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
lut_f_message, gpu_memory_allocated);
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), lut_buffer_carry, carry_degree,
|
||||
carry_max_degree, params.glwe_dimension, params.polynomial_size,
|
||||
params.message_modulus, params.carry_modulus, lut_f_carry,
|
||||
gpu_memory_allocated);
|
||||
|
||||
uint64_t lwe_indexes_size = 2 * sizeof(Torus);
|
||||
Torus *h_lwe_indexes = (Torus *)malloc(lwe_indexes_size);
|
||||
@@ -1151,15 +1137,9 @@ template <typename Torus> struct int_fullprop_buffer {
|
||||
cuda_memcpy_with_size_tracking_async_to_gpu(
|
||||
lwe_indexes, h_lwe_indexes, lwe_indexes_size, streams.stream(0),
|
||||
streams.gpu_index(0), allocate_gpu_memory);
|
||||
|
||||
//
|
||||
// No broadcast is needed because full prop is done on 1 single GPU.
|
||||
// By passing a single-GPU CudaStreams with streams.get_ith(0) the LUT is
|
||||
// not broadcast.
|
||||
//
|
||||
lut->generate_and_broadcast_lut(streams.get_ith(0), {0, 1},
|
||||
{lut_f_message, lut_f_carry},
|
||||
gpu_memory_allocated);
|
||||
|
||||
tmp_small_lwe_vector = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
@@ -1277,10 +1257,9 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
|
||||
if (total_ciphertexts > 0 ||
|
||||
reduce_degrees_for_single_carry_propagation) {
|
||||
uint64_t size_tracker = 0;
|
||||
allocated_luts_message_carry = true;
|
||||
luts_message_carry = new int_radix_lut<Torus>(
|
||||
streams, params, 2, pbs_count, true, size_tracker);
|
||||
|
||||
allocated_luts_message_carry = true;
|
||||
uint64_t message_modulus_bits =
|
||||
(uint64_t)std::log2(params.message_modulus);
|
||||
uint64_t carry_modulus_bits = (uint64_t)std::log2(params.carry_modulus);
|
||||
@@ -1296,9 +1275,7 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
|
||||
streams, upper_bound_num_blocks, size_tracker, true);
|
||||
}
|
||||
}
|
||||
|
||||
if (allocated_luts_message_carry) {
|
||||
|
||||
auto message_acc = luts_message_carry->get_lut(0, 0);
|
||||
auto carry_acc = luts_message_carry->get_lut(0, 1);
|
||||
|
||||
@@ -1310,11 +1287,22 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
|
||||
return x / message_modulus;
|
||||
};
|
||||
|
||||
// generate accumulators
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), message_acc,
|
||||
luts_message_carry->get_degree(0),
|
||||
luts_message_carry->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, message_modulus, params.carry_modulus,
|
||||
lut_f_message, gpu_memory_allocated);
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), carry_acc,
|
||||
luts_message_carry->get_degree(1),
|
||||
luts_message_carry->get_max_degree(1), params.glwe_dimension,
|
||||
params.polynomial_size, message_modulus, params.carry_modulus,
|
||||
lut_f_carry, gpu_memory_allocated);
|
||||
auto active_gpu_count_mc =
|
||||
streams.active_gpu_subset(pbs_count, params.pbs_type);
|
||||
luts_message_carry->generate_and_broadcast_lut(
|
||||
active_gpu_count_mc, {0, 1}, {lut_f_message, lut_f_carry},
|
||||
gpu_memory_allocated);
|
||||
luts_message_carry->broadcast_lut(active_gpu_count_mc);
|
||||
}
|
||||
}
|
||||
int_sum_ciphertexts_vec_memory(
|
||||
@@ -1449,6 +1437,10 @@ template <typename Torus> struct int_seq_group_prop_memory {
|
||||
uint32_t group_size, uint32_t big_lwe_size_bytes,
|
||||
bool allocate_gpu_memory, uint64_t &size_tracker) {
|
||||
gpu_memory_allocated = allocate_gpu_memory;
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
auto polynomial_size = params.polynomial_size;
|
||||
auto message_modulus = params.message_modulus;
|
||||
auto carry_modulus = params.carry_modulus;
|
||||
|
||||
grouping_size = group_size;
|
||||
group_resolved_carries = new CudaRadixCiphertextFFI;
|
||||
@@ -1458,20 +1450,22 @@ template <typename Torus> struct int_seq_group_prop_memory {
|
||||
allocate_gpu_memory);
|
||||
|
||||
int num_seq_luts = grouping_size - 1;
|
||||
Torus *h_seq_lut_indexes = (Torus *)malloc(num_seq_luts * sizeof(Torus));
|
||||
lut_sequential_algorithm =
|
||||
new int_radix_lut<Torus>(streams, params, num_seq_luts, num_seq_luts,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
std::vector<std::function<Torus(Torus)>> lut_funcs;
|
||||
std::vector<uint32_t> lut_indices;
|
||||
Torus *h_seq_lut_indexes = (Torus *)malloc(num_seq_luts * sizeof(Torus));
|
||||
|
||||
for (int index = 0; index < num_seq_luts; index++) {
|
||||
auto f_lut_sequential = [index](Torus propa_cum_sum_block) {
|
||||
return (propa_cum_sum_block >> (index + 1)) & 1;
|
||||
};
|
||||
lut_funcs.push_back(f_lut_sequential);
|
||||
auto seq_lut = lut_sequential_algorithm->get_lut(0, index);
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), seq_lut,
|
||||
lut_sequential_algorithm->get_degree(index),
|
||||
lut_sequential_algorithm->get_max_degree(index), glwe_dimension,
|
||||
polynomial_size, message_modulus, carry_modulus, f_lut_sequential,
|
||||
gpu_memory_allocated);
|
||||
h_seq_lut_indexes[index] = index;
|
||||
lut_indices.push_back(index);
|
||||
}
|
||||
Torus *seq_lut_indexes = lut_sequential_algorithm->get_lut_indexes(0, 0);
|
||||
cuda_memcpy_with_size_tracking_async_to_gpu(
|
||||
@@ -1479,12 +1473,9 @@ template <typename Torus> struct int_seq_group_prop_memory {
|
||||
streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(num_seq_luts, params.pbs_type);
|
||||
lut_sequential_algorithm->generate_and_broadcast_lut(
|
||||
active_streams, lut_indices, lut_funcs, gpu_memory_allocated);
|
||||
// lut_sequential_algorithm->broadcast_lut(active_streams);
|
||||
lut_sequential_algorithm->broadcast_lut(active_streams);
|
||||
free(h_seq_lut_indexes);
|
||||
}
|
||||
|
||||
};
|
||||
void release(CudaStreams streams) {
|
||||
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
||||
group_resolved_carries,
|
||||
@@ -1506,6 +1497,10 @@ template <typename Torus> struct int_hs_group_prop_memory {
|
||||
uint32_t num_groups, uint32_t big_lwe_size_bytes,
|
||||
bool allocate_gpu_memory, uint64_t &size_tracker) {
|
||||
gpu_memory_allocated = allocate_gpu_memory;
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
auto polynomial_size = params.polynomial_size;
|
||||
auto message_modulus = params.message_modulus;
|
||||
auto carry_modulus = params.carry_modulus;
|
||||
|
||||
auto f_lut_hillis_steele = [](Torus msb, Torus lsb) -> Torus {
|
||||
if (msb == 2) {
|
||||
@@ -1525,11 +1520,16 @@ template <typename Torus> struct int_hs_group_prop_memory {
|
||||
lut_hillis_steele = new int_radix_lut<Torus>(
|
||||
streams, params, 1, num_groups, allocate_gpu_memory, size_tracker);
|
||||
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
lut_hillis_steele->get_lut(0, 0), lut_hillis_steele->get_degree(0),
|
||||
lut_hillis_steele->get_max_degree(0), glwe_dimension, polynomial_size,
|
||||
message_modulus, carry_modulus, f_lut_hillis_steele,
|
||||
gpu_memory_allocated);
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(num_groups, params.pbs_type);
|
||||
lut_hillis_steele->generate_and_broadcast_bivariate_lut(
|
||||
active_streams, {0}, {f_lut_hillis_steele}, gpu_memory_allocated);
|
||||
}
|
||||
lut_hillis_steele->broadcast_lut(active_streams);
|
||||
};
|
||||
void release(CudaStreams streams) {
|
||||
|
||||
lut_hillis_steele->release(streams);
|
||||
@@ -1819,6 +1819,112 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
|
||||
num_extra_luts = 1;
|
||||
}
|
||||
|
||||
uint32_t num_luts_second_step = 2 * grouping_size + num_extra_luts;
|
||||
luts_array_second_step = new int_radix_lut<Torus>(
|
||||
streams, params, num_luts_second_step, num_radix_blocks,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
|
||||
// luts for first group inner propagation
|
||||
for (int lut_id = 0; lut_id < grouping_size - 1; lut_id++) {
|
||||
auto f_first_grouping_inner_propagation =
|
||||
[lut_id](Torus propa_cum_sum_block) -> Torus {
|
||||
uint64_t carry = (propa_cum_sum_block >> lut_id) & 1;
|
||||
|
||||
if (carry != 0) {
|
||||
return 2ull; // Generates Carry
|
||||
} else {
|
||||
return 0ull; // Does not generate carry
|
||||
}
|
||||
};
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
luts_array_second_step->get_lut(0, lut_id),
|
||||
luts_array_second_step->get_degree(lut_id),
|
||||
luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
|
||||
polynomial_size, message_modulus, carry_modulus,
|
||||
f_first_grouping_inner_propagation, gpu_memory_allocated);
|
||||
}
|
||||
|
||||
auto f_first_grouping_outer_propagation =
|
||||
[num_bits_in_block](Torus block) -> Torus {
|
||||
return (block >> (num_bits_in_block - 1)) & 1;
|
||||
};
|
||||
|
||||
int lut_id = grouping_size - 1;
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
luts_array_second_step->get_lut(0, lut_id),
|
||||
luts_array_second_step->get_degree(lut_id),
|
||||
luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
|
||||
polynomial_size, message_modulus, carry_modulus,
|
||||
f_first_grouping_outer_propagation, gpu_memory_allocated);
|
||||
|
||||
// for other groupings inner propagation
|
||||
for (int index = 0; index < grouping_size; index++) {
|
||||
uint32_t lut_id = index + grouping_size;
|
||||
|
||||
auto f_other_groupings_inner_propagation =
|
||||
[index](Torus propa_cum_sum_block) -> Torus {
|
||||
uint64_t mask = (2 << index) - 1;
|
||||
if (propa_cum_sum_block >= (2 << index)) {
|
||||
return 2ull; // Generates
|
||||
} else if ((propa_cum_sum_block & mask) == mask) {
|
||||
return 1ull; // Propagate
|
||||
} else {
|
||||
return 0ull; // Nothing
|
||||
}
|
||||
};
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
luts_array_second_step->get_lut(0, lut_id),
|
||||
luts_array_second_step->get_degree(lut_id),
|
||||
luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
|
||||
polynomial_size, message_modulus, carry_modulus,
|
||||
f_other_groupings_inner_propagation, gpu_memory_allocated);
|
||||
}
|
||||
|
||||
if (use_sequential_algorithm_to_resolve_group_carries) {
|
||||
for (int index = 0; index < grouping_size - 1; index++) {
|
||||
uint32_t lut_id = index + 2 * grouping_size;
|
||||
|
||||
auto f_group_propagation = [index, block_modulus,
|
||||
num_bits_in_block](Torus block) -> Torus {
|
||||
if (block == (block_modulus - 1)) {
|
||||
return 0ull;
|
||||
} else {
|
||||
return ((UINT64_MAX << index) % (1ull << (num_bits_in_block + 1)));
|
||||
}
|
||||
};
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
luts_array_second_step->get_lut(0, lut_id),
|
||||
luts_array_second_step->get_degree(lut_id),
|
||||
luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
|
||||
polynomial_size, message_modulus, carry_modulus,
|
||||
f_group_propagation, gpu_memory_allocated);
|
||||
}
|
||||
} else {
|
||||
uint32_t lut_id = 2 * grouping_size;
|
||||
auto f_group_propagation = [block_modulus](Torus block) {
|
||||
if (block == (block_modulus - 1)) {
|
||||
return 2ull;
|
||||
} else {
|
||||
return UINT64_MAX % (block_modulus * 2ull);
|
||||
}
|
||||
};
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
luts_array_second_step->get_lut(0, lut_id),
|
||||
luts_array_second_step->get_degree(lut_id),
|
||||
luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
|
||||
polynomial_size, message_modulus, carry_modulus, f_group_propagation,
|
||||
gpu_memory_allocated);
|
||||
}
|
||||
|
||||
Torus *h_second_lut_indexes = (Torus *)malloc(lut_indexes_size);
|
||||
|
||||
for (int index = 0; index < num_radix_blocks; index++) {
|
||||
@@ -1854,11 +1960,6 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t num_luts_second_step = 2 * grouping_size + num_extra_luts;
|
||||
luts_array_second_step = new int_radix_lut<Torus>(
|
||||
streams, params, num_luts_second_step, num_radix_blocks,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
|
||||
// copy the indexes to the gpu
|
||||
Torus *second_lut_indexes = luts_array_second_step->get_lut_indexes(0, 0);
|
||||
cuda_memcpy_with_size_tracking_async_to_gpu(
|
||||
@@ -1869,92 +1970,9 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
|
||||
scalar_array_cum_sum, h_scalar_array_cum_sum,
|
||||
num_radix_blocks * sizeof(Torus), streams.stream(0),
|
||||
streams.gpu_index(0), allocate_gpu_memory);
|
||||
|
||||
std::vector<std::function<Torus(Torus)>> lut_funcs;
|
||||
std::vector<uint32_t> lut_ids;
|
||||
|
||||
// luts for first group inner propagation
|
||||
for (int lut_id = 0; lut_id < grouping_size - 1; lut_id++) {
|
||||
auto f_first_grouping_inner_propagation =
|
||||
[lut_id](Torus propa_cum_sum_block) -> Torus {
|
||||
uint64_t carry = (propa_cum_sum_block >> lut_id) & 1;
|
||||
|
||||
if (carry != 0) {
|
||||
return 2ull; // Generates Carry
|
||||
} else {
|
||||
return 0ull; // Does not generate carry
|
||||
}
|
||||
};
|
||||
lut_funcs.push_back(f_first_grouping_inner_propagation);
|
||||
lut_ids.push_back(lut_id);
|
||||
}
|
||||
|
||||
auto f_first_grouping_outer_propagation =
|
||||
[num_bits_in_block](Torus block) -> Torus {
|
||||
return (block >> (num_bits_in_block - 1)) & 1;
|
||||
};
|
||||
|
||||
int lut_id = grouping_size - 1;
|
||||
|
||||
lut_funcs.push_back(f_first_grouping_outer_propagation);
|
||||
lut_ids.push_back(lut_id);
|
||||
|
||||
// for other groupings inner propagation
|
||||
for (int index = 0; index < grouping_size; index++) {
|
||||
uint32_t lut_id = index + grouping_size;
|
||||
|
||||
auto f_other_groupings_inner_propagation =
|
||||
[index](Torus propa_cum_sum_block) -> Torus {
|
||||
uint64_t mask = (2 << index) - 1;
|
||||
if (propa_cum_sum_block >= (2 << index)) {
|
||||
return 2ull; // Generates
|
||||
} else if ((propa_cum_sum_block & mask) == mask) {
|
||||
return 1ull; // Propagate
|
||||
} else {
|
||||
return 0ull; // Nothing
|
||||
}
|
||||
};
|
||||
|
||||
lut_funcs.push_back(f_other_groupings_inner_propagation);
|
||||
lut_ids.push_back(lut_id);
|
||||
}
|
||||
|
||||
if (use_sequential_algorithm_to_resolve_group_carries) {
|
||||
for (int index = 0; index < grouping_size - 1; index++) {
|
||||
uint32_t lut_id = index + 2 * grouping_size;
|
||||
|
||||
auto f_group_propagation = [index, block_modulus,
|
||||
num_bits_in_block](Torus block) -> Torus {
|
||||
if (block == (block_modulus - 1)) {
|
||||
return 0ull;
|
||||
} else {
|
||||
return ((UINT64_MAX << index) % (1ull << (num_bits_in_block + 1)));
|
||||
}
|
||||
};
|
||||
|
||||
lut_funcs.push_back(f_group_propagation);
|
||||
lut_ids.push_back(lut_id);
|
||||
}
|
||||
} else {
|
||||
uint32_t lut_id = 2 * grouping_size;
|
||||
auto f_group_propagation = [block_modulus](Torus block) {
|
||||
if (block == (block_modulus - 1)) {
|
||||
return 2ull;
|
||||
} else {
|
||||
return UINT64_MAX % (block_modulus * 2ull);
|
||||
}
|
||||
};
|
||||
|
||||
lut_funcs.push_back(f_group_propagation);
|
||||
lut_ids.push_back(lut_id);
|
||||
}
|
||||
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
|
||||
luts_array_second_step->generate_and_broadcast_lut(
|
||||
active_streams, lut_ids, lut_funcs, gpu_memory_allocated);
|
||||
|
||||
// luts_array_second_step->broadcast_lut(active_streams);
|
||||
luts_array_second_step->broadcast_lut(active_streams);
|
||||
|
||||
if (use_sequential_algorithm_to_resolve_group_carries) {
|
||||
|
||||
@@ -2042,28 +2060,12 @@ template <typename Torus> struct int_sc_prop_memory {
|
||||
uint32_t requested_flag;
|
||||
bool gpu_memory_allocated;
|
||||
|
||||
void setup_message_extract_indices_for_carry_async(CudaStreams streams,
|
||||
uint32_t num_radix_blocks,
|
||||
bool allocate_gpu_memory) {
|
||||
Torus *h_lut_indexes = lut_message_extract->h_lut_indexes;
|
||||
for (int index = 0; index < num_radix_blocks + 1; index++) {
|
||||
if (index < num_radix_blocks) {
|
||||
h_lut_indexes[index] = 0;
|
||||
} else {
|
||||
h_lut_indexes[index] = 1;
|
||||
}
|
||||
}
|
||||
cuda_memcpy_with_size_tracking_async_to_gpu(
|
||||
lut_message_extract->get_lut_indexes(0, 0), h_lut_indexes,
|
||||
(num_radix_blocks + 1) * sizeof(Torus), streams.stream(0),
|
||||
streams.gpu_index(0), allocate_gpu_memory);
|
||||
}
|
||||
|
||||
int_sc_prop_memory(CudaStreams streams, int_radix_params params,
|
||||
uint32_t num_radix_blocks, uint32_t requested_flag_in,
|
||||
bool allocate_gpu_memory, uint64_t &size_tracker) {
|
||||
gpu_memory_allocated = allocate_gpu_memory;
|
||||
this->params = params;
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
auto polynomial_size = params.polynomial_size;
|
||||
auto message_modulus = params.message_modulus;
|
||||
auto carry_modulus = params.carry_modulus;
|
||||
@@ -2072,7 +2074,7 @@ template <typename Torus> struct int_sc_prop_memory {
|
||||
uint32_t block_modulus = message_modulus * carry_modulus;
|
||||
uint32_t num_bits_in_block = std::log2(block_modulus);
|
||||
uint32_t grouping_size = num_bits_in_block;
|
||||
num_groups = (num_radix_blocks + grouping_size - 1) / grouping_size;
|
||||
num_groups = CEIL_DIV(num_radix_blocks, grouping_size);
|
||||
|
||||
num_many_lut = 2; // many luts apply 2 luts
|
||||
uint32_t box_size = polynomial_size / block_modulus;
|
||||
@@ -2086,6 +2088,24 @@ template <typename Torus> struct int_sc_prop_memory {
|
||||
streams, params, num_radix_blocks, grouping_size, num_groups,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
|
||||
// Step 3 elements
|
||||
int num_luts_message_extract =
|
||||
requested_flag == outputFlag::FLAG_NONE ? 1 : 2;
|
||||
lut_message_extract = new int_radix_lut<Torus>(
|
||||
streams, params, num_luts_message_extract, num_radix_blocks + 1,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
// lut for the first block in the first grouping
|
||||
auto f_message_extract = [message_modulus](Torus block) -> Torus {
|
||||
return (block >> 1) % message_modulus;
|
||||
};
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
lut_message_extract->get_lut(0, 0), lut_message_extract->get_degree(0),
|
||||
lut_message_extract->get_max_degree(0), glwe_dimension, polynomial_size,
|
||||
message_modulus, carry_modulus, f_message_extract,
|
||||
gpu_memory_allocated);
|
||||
|
||||
// This store a single block that with be used to store the overflow or
|
||||
// carry results
|
||||
output_flag = new CudaRadixCiphertextFFI;
|
||||
@@ -2136,30 +2156,22 @@ template <typename Torus> struct int_sc_prop_memory {
|
||||
return output1 << 3 | output2 << 2;
|
||||
};
|
||||
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
lut_overflow_flag_prep->get_lut(0, 0),
|
||||
lut_overflow_flag_prep->get_degree(0),
|
||||
lut_overflow_flag_prep->get_max_degree(0), glwe_dimension,
|
||||
polynomial_size, message_modulus, carry_modulus, f_overflow_fp,
|
||||
gpu_memory_allocated);
|
||||
|
||||
auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
|
||||
lut_overflow_flag_prep->generate_and_broadcast_bivariate_lut(
|
||||
active_streams, {0}, {f_overflow_fp}, gpu_memory_allocated);
|
||||
lut_overflow_flag_prep->broadcast_lut(active_streams);
|
||||
}
|
||||
|
||||
// Step 3 elements
|
||||
int num_luts_message_extract =
|
||||
requested_flag == outputFlag::FLAG_NONE ? 1 : 2;
|
||||
lut_message_extract = new int_radix_lut<Torus>(
|
||||
streams, params, num_luts_message_extract, num_radix_blocks + 1,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
// lut for the first block in the first grouping
|
||||
auto f_message_extract = [message_modulus](Torus block) -> Torus {
|
||||
return (block >> 1) % message_modulus;
|
||||
};
|
||||
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(num_radix_blocks + 1, params.pbs_type);
|
||||
|
||||
// For the final cleanup in case of overflow or carry (it seems that I can)
|
||||
// It seems that this lut could be apply together with the other one but for
|
||||
// now we won't do it
|
||||
switch (requested_flag) {
|
||||
case outputFlag::FLAG_OVERFLOW: { // Overflow case
|
||||
if (requested_flag == outputFlag::FLAG_OVERFLOW) { // Overflow case
|
||||
auto f_overflow_last = [num_radix_blocks,
|
||||
requested_flag_in](Torus block) -> Torus {
|
||||
uint32_t position = (num_radix_blocks == 1 &&
|
||||
@@ -2171,38 +2183,62 @@ template <typename Torus> struct int_sc_prop_memory {
|
||||
Torus does_overflow_if_carry_is_0 = (block >> 2) & 1;
|
||||
if (input_carry == outputFlag::FLAG_OVERFLOW) {
|
||||
return does_overflow_if_carry_is_1;
|
||||
} else {
|
||||
return does_overflow_if_carry_is_0;
|
||||
}
|
||||
return does_overflow_if_carry_is_0;
|
||||
};
|
||||
setup_message_extract_indices_for_carry_async(streams, num_radix_blocks,
|
||||
allocate_gpu_memory);
|
||||
|
||||
lut_message_extract->generate_and_broadcast_lut(
|
||||
active_streams, {0, 1}, {f_message_extract, f_overflow_last},
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
lut_message_extract->get_lut(0, 1),
|
||||
lut_message_extract->get_degree(1),
|
||||
lut_message_extract->get_max_degree(1), glwe_dimension,
|
||||
polynomial_size, message_modulus, carry_modulus, f_overflow_last,
|
||||
gpu_memory_allocated);
|
||||
break;
|
||||
}
|
||||
case outputFlag::FLAG_CARRY: { // Carry case
|
||||
|
||||
setup_message_extract_indices_for_carry_async(streams, num_radix_blocks,
|
||||
allocate_gpu_memory);
|
||||
Torus *h_lut_indexes = lut_message_extract->h_lut_indexes;
|
||||
for (int index = 0; index < num_radix_blocks + 1; index++) {
|
||||
if (index < num_radix_blocks) {
|
||||
h_lut_indexes[index] = 0;
|
||||
} else {
|
||||
h_lut_indexes[index] = 1;
|
||||
}
|
||||
}
|
||||
cuda_memcpy_with_size_tracking_async_to_gpu(
|
||||
lut_message_extract->get_lut_indexes(0, 0), h_lut_indexes,
|
||||
(num_radix_blocks + 1) * sizeof(Torus), streams.stream(0),
|
||||
streams.gpu_index(0), allocate_gpu_memory);
|
||||
}
|
||||
if (requested_flag == outputFlag::FLAG_CARRY) { // Carry case
|
||||
|
||||
auto f_carry_last = [](Torus block) -> Torus {
|
||||
return ((block >> 2) & 1);
|
||||
};
|
||||
|
||||
lut_message_extract->generate_and_broadcast_lut(
|
||||
active_streams, {0, 1}, {f_message_extract, f_carry_last},
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
lut_message_extract->get_lut(0, 1),
|
||||
lut_message_extract->get_degree(1),
|
||||
lut_message_extract->get_max_degree(1), glwe_dimension,
|
||||
polynomial_size, message_modulus, carry_modulus, f_carry_last,
|
||||
gpu_memory_allocated);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
lut_message_extract->generate_and_broadcast_lut(
|
||||
active_streams, {0}, {f_message_extract}, gpu_memory_allocated);
|
||||
break;
|
||||
}
|
||||
|
||||
// lut_message_extract->broadcast_lut(active_streams);
|
||||
Torus *h_lut_indexes = lut_message_extract->h_lut_indexes;
|
||||
for (int index = 0; index < num_radix_blocks + 1; index++) {
|
||||
if (index < num_radix_blocks) {
|
||||
h_lut_indexes[index] = 0;
|
||||
} else {
|
||||
h_lut_indexes[index] = 1;
|
||||
}
|
||||
}
|
||||
cuda_memcpy_with_size_tracking_async_to_gpu(
|
||||
lut_message_extract->get_lut_indexes(0, 0), h_lut_indexes,
|
||||
(num_radix_blocks + 1) * sizeof(Torus), streams.stream(0),
|
||||
streams.gpu_index(0), allocate_gpu_memory);
|
||||
}
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(num_radix_blocks + 1, params.pbs_type);
|
||||
lut_message_extract->broadcast_lut(active_streams);
|
||||
};
|
||||
|
||||
void release(CudaStreams streams) {
|
||||
@@ -2472,7 +2508,7 @@ template <typename Torus> struct int_borrow_prop_memory {
|
||||
uint32_t num_bits_in_block = std::log2(block_modulus);
|
||||
uint32_t grouping_size = num_bits_in_block;
|
||||
group_size = grouping_size;
|
||||
num_groups = (num_radix_blocks + grouping_size - 1) / grouping_size;
|
||||
num_groups = CEIL_DIV(num_radix_blocks, grouping_size);
|
||||
|
||||
num_many_lut = 2; // many luts apply 2 luts
|
||||
uint32_t box_size = polynomial_size / block_modulus;
|
||||
@@ -2500,11 +2536,16 @@ template <typename Torus> struct int_borrow_prop_memory {
|
||||
return (block >> 1) % message_modulus;
|
||||
};
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
lut_message_extract->get_lut(0, 0), lut_message_extract->get_degree(0),
|
||||
lut_message_extract->get_max_degree(0), glwe_dimension, polynomial_size,
|
||||
message_modulus, carry_modulus, f_message_extract,
|
||||
gpu_memory_allocated);
|
||||
active_streams =
|
||||
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
|
||||
|
||||
lut_message_extract->generate_and_broadcast_lut(
|
||||
active_streams, {0}, {f_message_extract}, gpu_memory_allocated);
|
||||
lut_message_extract->broadcast_lut(active_streams);
|
||||
|
||||
if (compute_overflow) {
|
||||
lut_borrow_flag =
|
||||
@@ -2515,8 +2556,12 @@ template <typename Torus> struct int_borrow_prop_memory {
|
||||
return ((block >> 2) & 1);
|
||||
};
|
||||
|
||||
lut_borrow_flag->generate_and_broadcast_lut(
|
||||
active_streams, {0}, {f_borrow_flag}, gpu_memory_allocated);
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
lut_borrow_flag->get_lut(0, 0), lut_borrow_flag->get_degree(0),
|
||||
lut_borrow_flag->get_max_degree(0), glwe_dimension, polynomial_size,
|
||||
message_modulus, carry_modulus, f_borrow_flag, gpu_memory_allocated);
|
||||
lut_borrow_flag->broadcast_lut(active_streams);
|
||||
}
|
||||
|
||||
active_streams =
|
||||
@@ -2537,7 +2582,9 @@ template <typename Torus> struct int_borrow_prop_memory {
|
||||
void release(CudaStreams streams) {
|
||||
|
||||
shifted_blocks_borrow_state_mem->release(streams);
|
||||
delete shifted_blocks_borrow_state_mem;
|
||||
prop_simu_group_carries_mem->release(streams);
|
||||
delete prop_simu_group_carries_mem;
|
||||
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
||||
overflow_block, gpu_memory_allocated);
|
||||
|
||||
|
||||
@@ -37,14 +37,17 @@ template <typename Torus> struct int_mul_memory {
|
||||
zero_out_predicate_lut =
|
||||
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
zero_out_predicate_lut->get_lut(0, 0),
|
||||
zero_out_predicate_lut->get_degree(0),
|
||||
zero_out_predicate_lut->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
zero_out_predicate_lut_f, gpu_memory_allocated);
|
||||
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
|
||||
zero_out_predicate_lut->generate_and_broadcast_bivariate_lut(
|
||||
active_streams, {0}, {zero_out_predicate_lut_f},
|
||||
gpu_memory_allocated);
|
||||
|
||||
// zero_out_predicate_lut->broadcast_lut(active_streams);
|
||||
zero_out_predicate_lut->broadcast_lut(active_streams);
|
||||
|
||||
zero_out_mem = new int_zero_out_if_buffer<Torus>(
|
||||
streams, params, num_radix_blocks, allocate_gpu_memory, size_tracker);
|
||||
@@ -52,7 +55,10 @@ template <typename Torus> struct int_mul_memory {
|
||||
return;
|
||||
}
|
||||
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
auto polynomial_size = params.polynomial_size;
|
||||
auto message_modulus = params.message_modulus;
|
||||
auto carry_modulus = params.carry_modulus;
|
||||
|
||||
// 'vector_result_lsb' contains blocks from all possible shifts of
|
||||
// radix_lwe_left excluding zero ciphertext blocks
|
||||
@@ -96,6 +102,18 @@ template <typename Torus> struct int_mul_memory {
|
||||
return (x * y) / message_modulus;
|
||||
};
|
||||
|
||||
// generate accumulators
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), lsb_acc,
|
||||
luts_array->get_degree(0), luts_array->get_max_degree(0),
|
||||
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
|
||||
lut_f_lsb, gpu_memory_allocated);
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), msb_acc,
|
||||
luts_array->get_degree(1), luts_array->get_max_degree(1),
|
||||
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
|
||||
lut_f_msb, gpu_memory_allocated);
|
||||
|
||||
// lut_indexes_vec for luts_array should be reinitialized
|
||||
// first lsb_vector_block_count value should reference to lsb_acc
|
||||
// last msb_vector_block_count values should reference to msb_acc
|
||||
@@ -105,12 +123,9 @@ template <typename Torus> struct int_mul_memory {
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
luts_array->get_lut_indexes(0, lsb_vector_block_count), 1,
|
||||
msb_vector_block_count);
|
||||
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(total_block_count, params.pbs_type);
|
||||
luts_array->generate_and_broadcast_bivariate_lut(
|
||||
active_streams, {0, 1}, {lut_f_lsb, lut_f_msb}, gpu_memory_allocated);
|
||||
|
||||
luts_array->broadcast_lut(active_streams);
|
||||
// create memory object for sum ciphertexts
|
||||
sum_ciphertexts_mem = new int_sum_ciphertexts_vec_memory<Torus>(
|
||||
streams, params, num_radix_blocks, 2 * num_radix_blocks,
|
||||
|
||||
@@ -22,8 +22,7 @@ template <typename Torus> struct int_grouped_oprf_memory {
|
||||
uint32_t calculated_active_blocks =
|
||||
total_random_bits == 0
|
||||
? 0
|
||||
: (total_random_bits + message_bits_per_block - 1) /
|
||||
message_bits_per_block;
|
||||
: CEIL_DIV(total_random_bits, message_bits_per_block);
|
||||
if (num_blocks_to_process != calculated_active_blocks) {
|
||||
PANIC(
|
||||
"num_blocks_to_process should be equal to calculated_active_blocks");
|
||||
@@ -170,8 +169,7 @@ template <typename Torus> struct int_grouped_oprf_custom_range_memory {
|
||||
this->allocate_gpu_memory = allocate_gpu_memory;
|
||||
|
||||
this->num_random_input_blocks =
|
||||
(num_input_random_bits + message_bits_per_block - 1) /
|
||||
message_bits_per_block;
|
||||
CEIL_DIV(num_input_random_bits, message_bits_per_block);
|
||||
|
||||
this->grouped_oprf_memory = new int_grouped_oprf_memory<Torus>(
|
||||
streams, params, this->num_random_input_blocks, message_bits_per_block,
|
||||
|
||||
@@ -85,11 +85,15 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
|
||||
}
|
||||
|
||||
// right shift
|
||||
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
cur_lut_bivariate->get_lut(0, 0), cur_lut_bivariate->get_degree(0),
|
||||
cur_lut_bivariate->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
shift_lut_f, gpu_memory_allocated);
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
|
||||
cur_lut_bivariate->generate_and_broadcast_bivariate_lut(
|
||||
active_streams, {0}, {shift_lut_f}, gpu_memory_allocated);
|
||||
cur_lut_bivariate->broadcast_lut(active_streams);
|
||||
|
||||
lut_buffers_bivariate.push_back(cur_lut_bivariate);
|
||||
}
|
||||
@@ -168,10 +172,16 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
|
||||
}
|
||||
|
||||
// right shift
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
cur_lut_bivariate->get_lut(0, 0), cur_lut_bivariate->get_degree(0),
|
||||
cur_lut_bivariate->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
shift_lut_f, gpu_memory_allocated);
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
|
||||
cur_lut_bivariate->generate_and_broadcast_bivariate_lut(
|
||||
active_streams, {0}, {shift_lut_f}, gpu_memory_allocated);
|
||||
cur_lut_bivariate->broadcast_lut(active_streams);
|
||||
|
||||
lut_buffers_bivariate.push_back(cur_lut_bivariate);
|
||||
}
|
||||
}
|
||||
@@ -261,11 +271,16 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
|
||||
return shifted | padding;
|
||||
};
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
shift_last_block_lut_univariate->get_lut(0, 0),
|
||||
shift_last_block_lut_univariate->get_degree(0),
|
||||
shift_last_block_lut_univariate->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, last_block_lut_f, gpu_memory_allocated);
|
||||
auto active_streams_shift_last =
|
||||
streams.active_gpu_subset(1, params.pbs_type);
|
||||
shift_last_block_lut_univariate->generate_and_broadcast_lut(
|
||||
active_streams_shift_last, {0}, {last_block_lut_f},
|
||||
gpu_memory_allocated);
|
||||
shift_last_block_lut_univariate->broadcast_lut(active_streams_shift_last);
|
||||
|
||||
lut_buffers_univariate.push_back(shift_last_block_lut_univariate);
|
||||
}
|
||||
@@ -283,8 +298,15 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
|
||||
return (params.message_modulus - 1) * x_sign_bit;
|
||||
};
|
||||
|
||||
padding_block_lut_univariate->generate_and_broadcast_lut(
|
||||
active_streams, {0}, {padding_block_lut_f}, gpu_memory_allocated);
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
padding_block_lut_univariate->get_lut(0, 0),
|
||||
padding_block_lut_univariate->get_degree(0),
|
||||
padding_block_lut_univariate->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
padding_block_lut_f, gpu_memory_allocated);
|
||||
// auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
|
||||
padding_block_lut_univariate->broadcast_lut(active_streams);
|
||||
|
||||
lut_buffers_univariate.push_back(padding_block_lut_univariate);
|
||||
|
||||
@@ -317,11 +339,16 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
|
||||
return message_of_current_block + carry_of_previous_block;
|
||||
};
|
||||
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
shift_blocks_lut_bivariate->get_lut(0, 0),
|
||||
shift_blocks_lut_bivariate->get_degree(0),
|
||||
shift_blocks_lut_bivariate->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
blocks_lut_f, gpu_memory_allocated);
|
||||
auto active_streams_shift_blocks =
|
||||
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
|
||||
shift_blocks_lut_bivariate->generate_and_broadcast_bivariate_lut(
|
||||
active_streams_shift_blocks, {0}, {blocks_lut_f},
|
||||
gpu_memory_allocated);
|
||||
shift_blocks_lut_bivariate->broadcast_lut(active_streams_shift_blocks);
|
||||
|
||||
lut_buffers_bivariate.push_back(shift_blocks_lut_bivariate);
|
||||
}
|
||||
|
||||
@@ -113,21 +113,27 @@ template <typename Torus> struct int_shift_and_rotate_buffer {
|
||||
else
|
||||
return current_bit;
|
||||
};
|
||||
;
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), mux_lut->get_lut(0, 0),
|
||||
mux_lut->get_degree(0), mux_lut->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, mux_lut_f, gpu_memory_allocated);
|
||||
auto active_gpu_count_mux = streams.active_gpu_subset(
|
||||
bits_per_block * num_radix_blocks, params.pbs_type);
|
||||
|
||||
mux_lut->generate_and_broadcast_lut(active_gpu_count_mux, {0}, {mux_lut_f},
|
||||
gpu_memory_allocated);
|
||||
mux_lut->broadcast_lut(active_gpu_count_mux);
|
||||
|
||||
auto cleaning_lut_f = [params](Torus x) -> Torus {
|
||||
return x % params.message_modulus;
|
||||
};
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), cleaning_lut->get_lut(0, 0),
|
||||
cleaning_lut->get_degree(0), cleaning_lut->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, cleaning_lut_f, gpu_memory_allocated);
|
||||
auto active_gpu_count_cleaning =
|
||||
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
|
||||
cleaning_lut->generate_and_broadcast_lut(
|
||||
active_gpu_count_cleaning, {0}, {cleaning_lut_f}, gpu_memory_allocated);
|
||||
cleaning_lut->broadcast_lut(active_gpu_count_cleaning);
|
||||
}
|
||||
|
||||
void release(CudaStreams streams) {
|
||||
|
||||
@@ -74,26 +74,45 @@ template <typename Torus> struct int_overflowing_sub_memory {
|
||||
luts_array, size_tracker,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
|
||||
auto lut_does_block_generate_carry = luts_array->get_lut(0, 0);
|
||||
auto lut_does_block_generate_or_propagate = luts_array->get_lut(0, 1);
|
||||
|
||||
// generate luts (aka accumulators)
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), lut_does_block_generate_carry,
|
||||
luts_array->get_degree(0), luts_array->get_max_degree(0),
|
||||
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
|
||||
f_lut_does_block_generate_carry, gpu_memory_allocated);
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
lut_does_block_generate_or_propagate, luts_array->get_degree(1),
|
||||
luts_array->get_max_degree(1), glwe_dimension, polynomial_size,
|
||||
message_modulus, carry_modulus, f_lut_does_block_generate_or_propagate,
|
||||
gpu_memory_allocated);
|
||||
if (allocate_gpu_memory)
|
||||
cuda_set_value_async<Torus>(streams.stream(0), streams.gpu_index(0),
|
||||
luts_array->get_lut_indexes(0, 1), 1,
|
||||
num_radix_blocks - 1);
|
||||
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
luts_borrow_propagation_sum->get_lut(0, 0),
|
||||
luts_borrow_propagation_sum->get_degree(0),
|
||||
luts_borrow_propagation_sum->get_max_degree(0), glwe_dimension,
|
||||
polynomial_size, message_modulus, carry_modulus,
|
||||
f_luts_borrow_propagation_sum, gpu_memory_allocated);
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), message_acc->get_lut(0, 0),
|
||||
message_acc->get_degree(0), message_acc->get_max_degree(0),
|
||||
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
|
||||
f_message_acc, gpu_memory_allocated);
|
||||
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
|
||||
luts_borrow_propagation_sum->generate_and_broadcast_bivariate_lut(
|
||||
active_streams, {0}, {f_luts_borrow_propagation_sum},
|
||||
gpu_memory_allocated);
|
||||
|
||||
luts_array->generate_and_broadcast_lut(
|
||||
active_streams, {0, 1},
|
||||
{f_lut_does_block_generate_carry,
|
||||
f_lut_does_block_generate_or_propagate},
|
||||
gpu_memory_allocated);
|
||||
// generate luts (aka accumulators)
|
||||
|
||||
message_acc->generate_and_broadcast_lut(
|
||||
active_streams, {0}, {f_message_acc}, gpu_memory_allocated);
|
||||
luts_array->broadcast_lut(active_streams);
|
||||
luts_borrow_propagation_sum->broadcast_lut(active_streams);
|
||||
message_acc->broadcast_lut(active_streams);
|
||||
}
|
||||
|
||||
void release(CudaStreams streams) {
|
||||
|
||||
@@ -7,7 +7,8 @@
|
||||
#include <functional>
|
||||
#include <vector>
|
||||
|
||||
const uint32_t MAX_STREAMS_FOR_VECTOR_FIND = 10;
|
||||
// If we use more than 5 streams the result is incorrect
|
||||
const uint32_t MAX_STREAMS_FOR_VECTOR_FIND = 5;
|
||||
|
||||
template <typename Torus> struct int_equality_selectors_buffer {
|
||||
int_radix_params params;
|
||||
@@ -175,8 +176,7 @@ template <typename Torus> struct int_possible_results_buffer {
|
||||
this->lut_stride =
|
||||
(ciphertext_modulus / this->max_luts_per_call) * box_size;
|
||||
|
||||
this->num_lut_accumulators =
|
||||
(total_luts_needed + max_luts_per_call - 1) / max_luts_per_call;
|
||||
this->num_lut_accumulators = CEIL_DIV(total_luts_needed, max_luts_per_call);
|
||||
|
||||
stream_luts =
|
||||
new int_radix_lut<Torus> *[num_streams * num_lut_accumulators];
|
||||
@@ -298,10 +298,14 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {
|
||||
int_radix_lut<Torus> *lut = new int_radix_lut<Torus>(
|
||||
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
|
||||
|
||||
lut->generate_and_broadcast_lut(
|
||||
streams.active_gpu_subset(num_blocks, params.pbs_type), {0}, {id_fn},
|
||||
allocate_gpu_memory);
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
|
||||
lut->get_degree(0), lut->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
id_fn, allocate_gpu_memory);
|
||||
|
||||
lut->broadcast_lut(
|
||||
streams.active_gpu_subset(num_blocks, params.pbs_type));
|
||||
this->stream_identity_luts[i] = lut;
|
||||
}
|
||||
|
||||
@@ -314,17 +318,27 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {
|
||||
|
||||
this->message_extract_lut = new int_radix_lut<Torus>(
|
||||
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
|
||||
|
||||
this->message_extract_lut->generate_and_broadcast_lut(
|
||||
streams.active_gpu_subset(num_blocks, params.pbs_type), {0}, {msg_fn},
|
||||
allocate_gpu_memory);
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
this->message_extract_lut->get_lut(0, 0),
|
||||
this->message_extract_lut->get_degree(0),
|
||||
this->message_extract_lut->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
msg_fn, allocate_gpu_memory);
|
||||
this->message_extract_lut->broadcast_lut(
|
||||
streams.active_gpu_subset(num_blocks, params.pbs_type));
|
||||
|
||||
this->carry_extract_lut = new int_radix_lut<Torus>(
|
||||
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
|
||||
|
||||
this->carry_extract_lut->generate_and_broadcast_lut(
|
||||
streams.active_gpu_subset(num_blocks, params.pbs_type), {0}, {carry_fn},
|
||||
allocate_gpu_memory);
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
this->carry_extract_lut->get_lut(0, 0),
|
||||
this->carry_extract_lut->get_degree(0),
|
||||
this->carry_extract_lut->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
carry_fn, allocate_gpu_memory);
|
||||
this->carry_extract_lut->broadcast_lut(
|
||||
streams.active_gpu_subset(num_blocks, params.pbs_type));
|
||||
|
||||
this->partial_aggregated_vectors =
|
||||
new CudaRadixCiphertextFFI *[num_streams];
|
||||
@@ -1171,9 +1185,15 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
|
||||
this->prefix_sum_lut = new int_radix_lut<Torus>(
|
||||
streams, params, 2, num_inputs, allocate_gpu_memory, size_tracker);
|
||||
|
||||
this->prefix_sum_lut->generate_and_broadcast_bivariate_lut(
|
||||
streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
|
||||
{prefix_sum_fn}, allocate_gpu_memory);
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
this->prefix_sum_lut->get_lut(0, 0),
|
||||
this->prefix_sum_lut->get_degree(0),
|
||||
this->prefix_sum_lut->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
prefix_sum_fn, allocate_gpu_memory);
|
||||
this->prefix_sum_lut->broadcast_lut(
|
||||
streams.active_gpu_subset(num_inputs, params.pbs_type));
|
||||
|
||||
auto cleanup_fn = [ALREADY_SEEN, params](Torus x) -> Torus {
|
||||
Torus val = x % params.message_modulus;
|
||||
@@ -1183,9 +1203,14 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
|
||||
};
|
||||
this->cleanup_lut = new int_radix_lut<Torus>(
|
||||
streams, params, 1, num_inputs, allocate_gpu_memory, size_tracker);
|
||||
this->cleanup_lut->generate_and_broadcast_lut(
|
||||
streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
|
||||
{cleanup_fn}, allocate_gpu_memory);
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
this->cleanup_lut->get_lut(0, 0), this->cleanup_lut->get_degree(0),
|
||||
this->cleanup_lut->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
cleanup_fn, allocate_gpu_memory);
|
||||
this->cleanup_lut->broadcast_lut(
|
||||
streams.active_gpu_subset(num_inputs, params.pbs_type));
|
||||
}
|
||||
|
||||
void release(CudaStreams streams) {
|
||||
@@ -1351,9 +1376,15 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
|
||||
this->prefix_sum_lut = new int_radix_lut<Torus>(
|
||||
streams, params, 2, num_inputs, allocate_gpu_memory, size_tracker);
|
||||
|
||||
this->prefix_sum_lut->generate_and_broadcast_bivariate_lut(
|
||||
streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
|
||||
{prefix_sum_fn}, allocate_gpu_memory);
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
this->prefix_sum_lut->get_lut(0, 0),
|
||||
this->prefix_sum_lut->get_degree(0),
|
||||
this->prefix_sum_lut->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
prefix_sum_fn, allocate_gpu_memory);
|
||||
this->prefix_sum_lut->broadcast_lut(
|
||||
streams.active_gpu_subset(num_inputs, params.pbs_type));
|
||||
|
||||
auto cleanup_fn = [ALREADY_SEEN, params](Torus x) -> Torus {
|
||||
Torus val = x % params.message_modulus;
|
||||
@@ -1363,9 +1394,14 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
|
||||
};
|
||||
this->cleanup_lut = new int_radix_lut<Torus>(
|
||||
streams, params, 1, num_inputs, allocate_gpu_memory, size_tracker);
|
||||
this->cleanup_lut->generate_and_broadcast_lut(
|
||||
streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
|
||||
{cleanup_fn}, allocate_gpu_memory);
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
this->cleanup_lut->get_lut(0, 0), this->cleanup_lut->get_degree(0),
|
||||
this->cleanup_lut->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
cleanup_fn, allocate_gpu_memory);
|
||||
this->cleanup_lut->broadcast_lut(
|
||||
streams.active_gpu_subset(num_inputs, params.pbs_type));
|
||||
}
|
||||
|
||||
void release(CudaStreams streams) {
|
||||
|
||||
@@ -73,9 +73,10 @@ void cleanup_packing_keyswitch_lwe_list_to_glwe(void *stream,
|
||||
int8_t **fp_ks_buffer,
|
||||
bool gpu_memory_allocated);
|
||||
|
||||
void cuda_closest_representable_64(void *stream, uint32_t gpu_index,
|
||||
void const *input, void *output,
|
||||
uint32_t base_log, uint32_t level_count);
|
||||
void cuda_closest_representable_64_async(void *stream, uint32_t gpu_index,
|
||||
void const *input, void *output,
|
||||
uint32_t base_log,
|
||||
uint32_t level_count);
|
||||
}
|
||||
|
||||
#endif // CNCRT_KS_H_
|
||||
|
||||
@@ -1,24 +0,0 @@
|
||||
#ifndef TRIVIUM_H
|
||||
#define TRIVIUM_H
|
||||
|
||||
#include "../integer/integer.h"
|
||||
|
||||
extern "C" {
|
||||
uint64_t scratch_cuda_trivium_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type, uint32_t num_inputs);
|
||||
|
||||
void cuda_trivium_generate_keystream_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *keystream_output,
|
||||
const CudaRadixCiphertextFFI *key, const CudaRadixCiphertextFFI *iv,
|
||||
uint32_t num_inputs, uint32_t num_steps, int8_t *mem_ptr, void *const *bsks,
|
||||
void *const *ksks);
|
||||
|
||||
void cleanup_cuda_trivium_64(CudaStreamsFFI streams, int8_t **mem_ptr_void);
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,295 +0,0 @@
|
||||
#ifndef TRIVIUM_UTILITIES_H
|
||||
#define TRIVIUM_UTILITIES_H
|
||||
#include "../integer/integer_utilities.h"
|
||||
|
||||
/// Struct to hold the LUTs.
|
||||
template <typename Torus> struct int_trivium_lut_buffers {
|
||||
// Bivariate AND Gate LUT:
|
||||
// AND operation: f(a, b) = (a & 1) & (b & 1).
|
||||
// This is a Bivariate PBS used for the non-linear parts of Trivium.
|
||||
int_radix_lut<Torus> *and_lut;
|
||||
|
||||
// Univariate Identity LUT:
|
||||
// MESSAGE EXTRACTION operation: f(x) = x & 1.
|
||||
// This is a Univariate PBS used to "flush" the state: it resets the noise
|
||||
// after additions and ensures the message stays within the binary message
|
||||
// space.
|
||||
int_radix_lut<Torus> *flush_lut;
|
||||
|
||||
int_trivium_lut_buffers(CudaStreams streams, const int_radix_params ¶ms,
|
||||
bool allocate_gpu_memory, uint32_t num_trivium_inputs,
|
||||
uint64_t &size_tracker) {
|
||||
|
||||
constexpr uint32_t BATCH_SIZE = 64;
|
||||
constexpr uint32_t MAX_AND_PER_STEP = 3;
|
||||
uint32_t total_lut_ops = num_trivium_inputs * BATCH_SIZE * MAX_AND_PER_STEP;
|
||||
|
||||
this->and_lut = new int_radix_lut<Torus>(streams, params, 1, total_lut_ops,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
|
||||
std::function<Torus(Torus, Torus)> and_lambda =
|
||||
[](Torus a, Torus b) -> Torus { return (a & 1) & (b & 1); };
|
||||
|
||||
auto active_streams_and =
|
||||
streams.active_gpu_subset(total_lut_ops, params.pbs_type);
|
||||
this->and_lut->generate_and_broadcast_bivariate_lut(
|
||||
active_streams_and, {0}, {and_lambda}, allocate_gpu_memory);
|
||||
this->and_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
|
||||
|
||||
uint32_t total_flush_ops = num_trivium_inputs * BATCH_SIZE * 4;
|
||||
|
||||
this->flush_lut = new int_radix_lut<Torus>(
|
||||
streams, params, 1, total_flush_ops, allocate_gpu_memory, size_tracker);
|
||||
|
||||
std::function<Torus(Torus)> flush_lambda = [](Torus x) -> Torus {
|
||||
return x & 1;
|
||||
};
|
||||
|
||||
auto active_streams_flush =
|
||||
streams.active_gpu_subset(total_flush_ops, params.pbs_type);
|
||||
this->flush_lut->generate_and_broadcast_lut(
|
||||
active_streams_flush, {0}, {flush_lambda}, allocate_gpu_memory);
|
||||
this->flush_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
|
||||
}
|
||||
|
||||
void release(CudaStreams streams) {
|
||||
this->and_lut->release(streams);
|
||||
delete this->and_lut;
|
||||
this->and_lut = nullptr;
|
||||
|
||||
this->flush_lut->release(streams);
|
||||
delete this->flush_lut;
|
||||
this->flush_lut = nullptr;
|
||||
}
|
||||
};
|
||||
|
||||
/// Struct to hold the state and temporary workspaces required for
|
||||
/// Trivium execution on the GPU.
|
||||
///
|
||||
/// This struct manages the memory for the internal registers (A, B, C),
|
||||
/// temporary buffers used during the update function, and buffers used for
|
||||
/// packing data before and after PBS.
|
||||
template <typename Torus> struct int_trivium_state_workspaces {
|
||||
// Trivium Internal State Registers:
|
||||
// Register A: 93 bits
|
||||
CudaRadixCiphertextFFI *a_reg;
|
||||
// Register B: 84 bits
|
||||
CudaRadixCiphertextFFI *b_reg;
|
||||
// Register C: 111 bits
|
||||
CudaRadixCiphertextFFI *c_reg;
|
||||
|
||||
// Shift Workspace:
|
||||
// Used to manage bitshifting operations on the registers
|
||||
CudaRadixCiphertextFFI *shift_workspace;
|
||||
|
||||
// Temporary Update Buffers:
|
||||
// Intermediate buffers for the trivium update logic (t1, t2, t3)
|
||||
CudaRadixCiphertextFFI *temp_t1;
|
||||
CudaRadixCiphertextFFI *temp_t2;
|
||||
CudaRadixCiphertextFFI *temp_t3;
|
||||
|
||||
// Buffers to hold the new values for the registers after an update step
|
||||
CudaRadixCiphertextFFI *new_a;
|
||||
CudaRadixCiphertextFFI *new_b;
|
||||
CudaRadixCiphertextFFI *new_c;
|
||||
|
||||
// PBS Packing Buffers:
|
||||
// Buffers for packing inputs into the bivariate lookup table (AND gate)
|
||||
CudaRadixCiphertextFFI *packed_pbs_lhs;
|
||||
CudaRadixCiphertextFFI *packed_pbs_rhs;
|
||||
// Buffer for the output of the bivariate PBS
|
||||
CudaRadixCiphertextFFI *packed_pbs_out;
|
||||
|
||||
// Flush/Cleanup Packing Buffers:
|
||||
// Buffers for the "flush" LUT which cleans up noise after additions
|
||||
CudaRadixCiphertextFFI *packed_flush_in;
|
||||
CudaRadixCiphertextFFI *packed_flush_out;
|
||||
|
||||
int_trivium_state_workspaces(CudaStreams streams,
|
||||
const int_radix_params ¶ms,
|
||||
bool allocate_gpu_memory, uint32_t num_inputs,
|
||||
uint64_t &size_tracker) {
|
||||
|
||||
this->a_reg = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), this->a_reg, 93 * num_inputs,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
this->b_reg = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), this->b_reg, 84 * num_inputs,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
this->c_reg = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), this->c_reg, 111 * num_inputs,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
this->shift_workspace = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), this->shift_workspace,
|
||||
128 * num_inputs, params.big_lwe_dimension, size_tracker,
|
||||
allocate_gpu_memory);
|
||||
|
||||
uint32_t batch_blocks = 64 * num_inputs;
|
||||
|
||||
this->temp_t1 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), this->temp_t1, batch_blocks,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
this->temp_t2 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), this->temp_t2, batch_blocks,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
this->temp_t3 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), this->temp_t3, batch_blocks,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
this->new_a = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), this->new_a, batch_blocks,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
this->new_b = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), this->new_b, batch_blocks,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
this->new_c = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), this->new_c, batch_blocks,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
this->packed_pbs_lhs = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), this->packed_pbs_lhs,
|
||||
3 * batch_blocks, params.big_lwe_dimension, size_tracker,
|
||||
allocate_gpu_memory);
|
||||
|
||||
this->packed_pbs_rhs = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), this->packed_pbs_rhs,
|
||||
3 * batch_blocks, params.big_lwe_dimension, size_tracker,
|
||||
allocate_gpu_memory);
|
||||
|
||||
this->packed_pbs_out = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), this->packed_pbs_out,
|
||||
3 * batch_blocks, params.big_lwe_dimension, size_tracker,
|
||||
allocate_gpu_memory);
|
||||
|
||||
this->packed_flush_in = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), this->packed_flush_in,
|
||||
4 * batch_blocks, params.big_lwe_dimension, size_tracker,
|
||||
allocate_gpu_memory);
|
||||
|
||||
this->packed_flush_out = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), this->packed_flush_out,
|
||||
4 * batch_blocks, params.big_lwe_dimension, size_tracker,
|
||||
allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void release(CudaStreams streams, bool allocate_gpu_memory) {
|
||||
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
||||
this->a_reg, allocate_gpu_memory);
|
||||
delete this->a_reg;
|
||||
|
||||
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
||||
this->b_reg, allocate_gpu_memory);
|
||||
delete this->b_reg;
|
||||
|
||||
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
||||
this->c_reg, allocate_gpu_memory);
|
||||
delete this->c_reg;
|
||||
|
||||
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
||||
this->shift_workspace, allocate_gpu_memory);
|
||||
delete this->shift_workspace;
|
||||
|
||||
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
||||
this->temp_t1, allocate_gpu_memory);
|
||||
delete this->temp_t1;
|
||||
|
||||
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
||||
this->temp_t2, allocate_gpu_memory);
|
||||
delete this->temp_t2;
|
||||
|
||||
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
||||
this->temp_t3, allocate_gpu_memory);
|
||||
delete this->temp_t3;
|
||||
|
||||
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
||||
this->new_a, allocate_gpu_memory);
|
||||
delete this->new_a;
|
||||
|
||||
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
||||
this->new_b, allocate_gpu_memory);
|
||||
delete this->new_b;
|
||||
|
||||
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
||||
this->new_c, allocate_gpu_memory);
|
||||
delete this->new_c;
|
||||
|
||||
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
||||
this->packed_pbs_lhs, allocate_gpu_memory);
|
||||
delete this->packed_pbs_lhs;
|
||||
|
||||
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
||||
this->packed_pbs_rhs, allocate_gpu_memory);
|
||||
delete this->packed_pbs_rhs;
|
||||
|
||||
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
||||
this->packed_pbs_out, allocate_gpu_memory);
|
||||
delete this->packed_pbs_out;
|
||||
|
||||
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
||||
this->packed_flush_in, allocate_gpu_memory);
|
||||
delete this->packed_flush_in;
|
||||
|
||||
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
||||
this->packed_flush_out, allocate_gpu_memory);
|
||||
delete this->packed_flush_out;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Torus> struct int_trivium_buffer {
|
||||
int_radix_params params;
|
||||
bool allocate_gpu_memory;
|
||||
uint32_t num_inputs;
|
||||
|
||||
int_trivium_lut_buffers<Torus> *luts;
|
||||
int_trivium_state_workspaces<Torus> *state;
|
||||
|
||||
int_trivium_buffer(CudaStreams streams, const int_radix_params ¶ms,
|
||||
bool allocate_gpu_memory, uint32_t num_inputs,
|
||||
uint64_t &size_tracker) {
|
||||
this->params = params;
|
||||
this->allocate_gpu_memory = allocate_gpu_memory;
|
||||
this->num_inputs = num_inputs;
|
||||
|
||||
this->luts = new int_trivium_lut_buffers<Torus>(
|
||||
streams, params, allocate_gpu_memory, num_inputs, size_tracker);
|
||||
|
||||
this->state = new int_trivium_state_workspaces<Torus>(
|
||||
streams, params, allocate_gpu_memory, num_inputs, size_tracker);
|
||||
}
|
||||
|
||||
void release(CudaStreams streams) {
|
||||
luts->release(streams);
|
||||
delete luts;
|
||||
luts = nullptr;
|
||||
|
||||
state->release(streams, allocate_gpu_memory);
|
||||
delete state;
|
||||
state = nullptr;
|
||||
|
||||
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
@@ -14,10 +14,10 @@ uint64_t scratch_cuda_expand_without_verification_64(
|
||||
uint32_t casting_output_dimension, uint32_t casting_ks_level,
|
||||
uint32_t casting_ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, const uint32_t *num_lwes_per_compact_list,
|
||||
const bool *is_boolean_array, uint32_t num_compact_lists,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
KS_TYPE casting_key_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
const bool *is_boolean_array, const uint32_t is_boolean_array_len,
|
||||
uint32_t num_compact_lists, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, KS_TYPE casting_key_type,
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_expand_without_verification_64(
|
||||
CudaStreamsFFI streams, void *lwe_array_out,
|
||||
|
||||
@@ -118,7 +118,8 @@ template <typename Torus> struct zk_expand_mem {
|
||||
zk_expand_mem(CudaStreams streams, int_radix_params computing_params,
|
||||
int_radix_params casting_params, KS_TYPE casting_key_type,
|
||||
const uint32_t *num_lwes_per_compact_list,
|
||||
const bool *is_boolean_array, uint32_t num_compact_lists,
|
||||
const bool *is_boolean_array,
|
||||
const uint32_t is_boolean_array_len, uint32_t num_compact_lists,
|
||||
bool allocate_gpu_memory, uint64_t &size_tracker)
|
||||
: computing_params(computing_params), casting_params(casting_params),
|
||||
num_compact_lists(num_compact_lists),
|
||||
@@ -174,6 +175,40 @@ template <typename Torus> struct zk_expand_mem {
|
||||
message_and_carry_extract_luts = new int_radix_lut<Torus>(
|
||||
streams, params, 4, 2 * num_lwes, allocate_gpu_memory, size_tracker);
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
message_and_carry_extract_luts->get_lut(0, 0),
|
||||
message_and_carry_extract_luts->get_degree(0),
|
||||
message_and_carry_extract_luts->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, message_extract_lut_f, gpu_memory_allocated);
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
message_and_carry_extract_luts->get_lut(0, 1),
|
||||
message_and_carry_extract_luts->get_degree(1),
|
||||
message_and_carry_extract_luts->get_max_degree(1),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, carry_extract_lut_f, gpu_memory_allocated);
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
message_and_carry_extract_luts->get_lut(0, 2),
|
||||
message_and_carry_extract_luts->get_degree(2),
|
||||
message_and_carry_extract_luts->get_max_degree(2),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, message_extract_and_sanitize_bool_lut_f,
|
||||
gpu_memory_allocated);
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
message_and_carry_extract_luts->get_lut(0, 3),
|
||||
message_and_carry_extract_luts->get_degree(3),
|
||||
message_and_carry_extract_luts->get_max_degree(3),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, carry_extract_and_sanitize_bool_lut_f,
|
||||
gpu_memory_allocated);
|
||||
|
||||
// We are always packing two LWEs. We just need to be sure we have enough
|
||||
// space in the carry part to store a message of the same size as is in the
|
||||
// message part.
|
||||
@@ -236,14 +271,36 @@ template <typename Torus> struct zk_expand_mem {
|
||||
for (int i = 0; i < num_packed_msgs * num_lwes_in_kth; i++) {
|
||||
auto lwe_index = i + num_packed_msgs * offset;
|
||||
auto lwe_index_in_list = i % num_lwes_in_kth;
|
||||
PANIC_IF_FALSE(lwe_index < num_packed_msgs * num_lwes,
|
||||
"Cuda error: index %d is beyond the max value %d",
|
||||
lwe_index, num_packed_msgs * num_lwes);
|
||||
h_indexes_in[lwe_index] = lwe_index_in_list + offset;
|
||||
h_indexes_out[lwe_index] =
|
||||
num_packed_msgs * h_indexes_in[lwe_index] + i / num_lwes_in_kth;
|
||||
// If the input relates to a boolean, shift the LUT so the correct one
|
||||
// with sanitization is used
|
||||
PANIC_IF_FALSE(h_indexes_in[lwe_index] < num_packed_msgs * num_lwes,
|
||||
"Cuda error: index %d is beyond the max value %d",
|
||||
h_indexes_in[lwe_index], num_packed_msgs * num_lwes);
|
||||
PANIC_IF_FALSE(h_indexes_out[lwe_index] < num_packed_msgs * num_lwes,
|
||||
"Cuda error: index %d is beyond the max value %d",
|
||||
h_indexes_out[lwe_index], num_packed_msgs * num_lwes);
|
||||
// is_boolean_array tells us which input is a boolean and thus the
|
||||
// related output needs boolean sanitization. It naturally has
|
||||
// total_blocks entries, but h_indexes_out reaches
|
||||
// message_modulus * ceil(total_blocks/2) - 1. When total_blocks is odd,
|
||||
// the ceiling causes out-of-bounds access. Reading garbage "true" would
|
||||
// set h_lut_indexes to an invalid index pointing to uninitialized
|
||||
// memory instead of a real LUT. Rust pads is_boolean_array with FALSE
|
||||
// to match.
|
||||
PANIC_IF_FALSE(h_indexes_out[lwe_index] < is_boolean_array_len,
|
||||
"Cuda error: index %d for is_boolean_array is out of "
|
||||
"bounds (len is %d)",
|
||||
h_indexes_out[lwe_index], is_boolean_array_len);
|
||||
auto boolean_offset =
|
||||
is_boolean_array[h_indexes_out[lwe_index]] ? num_packed_msgs : 0;
|
||||
h_lut_indexes[lwe_index] = i / num_lwes_in_kth + boolean_offset;
|
||||
PANIC_IF_FALSE(
|
||||
h_lut_indexes[lwe_index] < 4,
|
||||
"Cuda error: lut index is greater than the max possible value (3)");
|
||||
}
|
||||
offset += num_lwes_in_kth;
|
||||
}
|
||||
@@ -258,13 +315,7 @@ template <typename Torus> struct zk_expand_mem {
|
||||
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(2 * num_lwes, params.pbs_type);
|
||||
|
||||
message_and_carry_extract_luts->generate_and_broadcast_lut(
|
||||
active_streams, {0, 1, 2, 3},
|
||||
{message_extract_lut_f, carry_extract_lut_f,
|
||||
message_extract_and_sanitize_bool_lut_f,
|
||||
carry_extract_and_sanitize_bool_lut_f},
|
||||
gpu_memory_allocated);
|
||||
message_and_carry_extract_luts->broadcast_lut(active_streams);
|
||||
|
||||
message_and_carry_extract_luts->allocate_lwe_vector_for_non_trivial_indexes(
|
||||
active_streams, 2 * num_lwes, size_tracker, allocate_gpu_memory);
|
||||
|
||||
@@ -183,9 +183,10 @@ void cuda_packing_keyswitch_lwe_list_to_glwe_128(
|
||||
base_log, level_count, num_lwes);
|
||||
}
|
||||
|
||||
void cuda_closest_representable_64(void *stream, uint32_t gpu_index,
|
||||
void const *input, void *output,
|
||||
uint32_t base_log, uint32_t level_count) {
|
||||
void cuda_closest_representable_64_async(void *stream, uint32_t gpu_index,
|
||||
void const *input, void *output,
|
||||
uint32_t base_log,
|
||||
uint32_t level_count) {
|
||||
host_cuda_closest_representable(static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<const uint64_t *>(input),
|
||||
static_cast<uint64_t *>(output), base_log,
|
||||
|
||||
@@ -10,7 +10,6 @@
|
||||
#include "polynomial/polynomial_math.cuh"
|
||||
#include "torus.cuh"
|
||||
#include "utils/helper.cuh"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
@@ -351,6 +350,7 @@ keyswitch(KSTorus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
|
||||
Torus state =
|
||||
init_decomposer_state(block_lwe_array_in[i], base_log, level_count);
|
||||
uint32_t offset = i * level_count * (lwe_dimension_out + 1);
|
||||
#pragma unroll 1
|
||||
for (int j = 0; j < level_count; j++) {
|
||||
|
||||
KSTorus decomposed = decompose_one<Torus>(state, mask_mod_b, base_log);
|
||||
@@ -363,16 +363,15 @@ keyswitch(KSTorus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
|
||||
lwe_acc_out[shmem_index] = local_lwe_out;
|
||||
}
|
||||
|
||||
if (tid <= lwe_dimension_out) {
|
||||
for (int offset = blockDim.y / 2; offset > 0 && threadIdx.y < offset;
|
||||
offset /= 2) {
|
||||
__syncthreads();
|
||||
for (int offset = blockDim.y / 2; offset > 0; offset /= 2) {
|
||||
__syncthreads();
|
||||
if (tid <= lwe_dimension_out && threadIdx.y < offset) {
|
||||
lwe_acc_out[shmem_index] +=
|
||||
lwe_acc_out[shmem_index + offset * blockDim.x];
|
||||
}
|
||||
if (threadIdx.y == 0)
|
||||
block_lwe_array_out[tid] = -lwe_acc_out[shmem_index];
|
||||
}
|
||||
if (tid <= lwe_dimension_out && threadIdx.y == 0)
|
||||
block_lwe_array_out[tid] = -lwe_acc_out[shmem_index];
|
||||
}
|
||||
|
||||
template <typename Torus, typename KSTorus>
|
||||
|
||||
@@ -12,12 +12,9 @@
|
||||
#include "polynomial/polynomial_math.cuh"
|
||||
#include "torus.cuh"
|
||||
#include "utils/helper.cuh"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
#define CEIL_DIV(M, N) ((M) + (N)-1) / (N)
|
||||
|
||||
// Finish the keyswitching operation and prepare GLWEs for accumulation.
|
||||
// 1. Finish the keyswitching computation partially performed with a GEMM:
|
||||
// - negate the dot product between the GLWE and KSK polynomial
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
#include "helper_multi_gpu.h"
|
||||
#include "polynomial/parameters.cuh"
|
||||
#include "types/int128.cuh"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
#include "utils/helper.cuh"
|
||||
#include <limits>
|
||||
|
||||
template <typename T>
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
#include "device.h"
|
||||
#include "utils/helper.cuh"
|
||||
#include <cstdint>
|
||||
#include <cuda_runtime.h>
|
||||
#include <mutex>
|
||||
@@ -6,6 +7,27 @@
|
||||
#include <cuda_profiler_api.h>
|
||||
#endif
|
||||
|
||||
void validate_device_ptr_and_gpu_index(const void *ptr, uint32_t gpu_index) {
|
||||
GPU_ASSERT(ptr != nullptr, "Cuda error: null device ptr");
|
||||
|
||||
cudaPointerAttributes attr;
|
||||
check_cuda_error(cudaPointerGetAttributes(&attr, ptr));
|
||||
if (attr.device != gpu_index || attr.type != cudaMemoryTypeDevice) {
|
||||
PANIC("Cuda error: invalid device pointer.")
|
||||
}
|
||||
}
|
||||
|
||||
int validate_device_ptr(const void *ptr) {
|
||||
GPU_ASSERT(ptr != nullptr, "Cuda error: null device ptr");
|
||||
|
||||
cudaPointerAttributes attr;
|
||||
check_cuda_error(cudaPointerGetAttributes(&attr, ptr));
|
||||
if (attr.type != cudaMemoryTypeDevice) {
|
||||
PANIC("Cuda error: invalid device pointer.")
|
||||
}
|
||||
return attr.device;
|
||||
}
|
||||
|
||||
uint32_t cuda_get_device() {
|
||||
int device;
|
||||
check_cuda_error(cudaGetDevice(&device));
|
||||
@@ -30,8 +52,9 @@ bool mem_pools_enabled = false;
|
||||
// better results.
|
||||
void cuda_setup_mempool(uint32_t caller_gpu_index) {
|
||||
if (!mem_pools_enabled) {
|
||||
pool_mutex.lock();
|
||||
if (mem_pools_enabled)
|
||||
std::lock_guard lock(pool_mutex);
|
||||
if (mem_pools_enabled) // double-check - mem_pools_enabled might have been
|
||||
// changed in a different thread
|
||||
return; // If mem pools are already enabled, we don't need to do anything
|
||||
|
||||
// We do it only once for all GPUs
|
||||
@@ -78,7 +101,6 @@ void cuda_setup_mempool(uint32_t caller_gpu_index) {
|
||||
}
|
||||
// We return to the original gpu_index
|
||||
cuda_set_device(caller_gpu_index);
|
||||
pool_mutex.unlock();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -234,60 +256,61 @@ bool cuda_check_support_thread_block_clusters() {
|
||||
#endif
|
||||
}
|
||||
|
||||
/// Copy memory to the GPU asynchronously
|
||||
/// Copy memory from the CPU to a GPU with size tracking.
|
||||
/// This copy is asynchronous only if the CPU memory was pinned, i.e.
|
||||
/// allocated using cudaMallocHost. This was shown to come with a performance
|
||||
/// penalty if we allocate all CPU data in this way in the backend, so
|
||||
/// cudaMallocHost is only used in specific places where we need an
|
||||
/// asynchronous data copy from the CPU to all the GPUs simultaneously (for
|
||||
/// example to copy the bootstrapping key).
|
||||
/// The copy only happens if gpu_memory_allocated is true.
|
||||
void cuda_memcpy_with_size_tracking_async_to_gpu(void *dest, const void *src,
|
||||
uint64_t size,
|
||||
cudaStream_t stream,
|
||||
uint32_t gpu_index,
|
||||
bool gpu_memory_allocated) {
|
||||
|
||||
GPU_ASSERT(src != nullptr, "Cuda error: null device ptr");
|
||||
|
||||
if (size == 0 || !gpu_memory_allocated)
|
||||
return;
|
||||
cudaPointerAttributes attr;
|
||||
check_cuda_error(cudaPointerGetAttributes(&attr, dest));
|
||||
if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
|
||||
PANIC("Cuda error: invalid device pointer in async copy to GPU.")
|
||||
}
|
||||
validate_device_ptr_and_gpu_index(dest, gpu_index);
|
||||
|
||||
cuda_set_device(gpu_index);
|
||||
check_cuda_error(
|
||||
cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice, stream));
|
||||
}
|
||||
|
||||
/// Copy memory to the GPU asynchronously
|
||||
/// Copy memory from the CPU to a GPU.
|
||||
/// This copy is asynchronous only if the CPU memory was pinned, i.e.
|
||||
/// allocated using cudaMallocHost. This was shown to come with a performance
|
||||
/// penalty if we allocate all CPU data in this way in the backend, so
|
||||
/// cudaMallocHost is only used in specific places where we need an
|
||||
/// asynchronous data copy from the CPU to all the GPUs simultaneously (for
|
||||
/// example to copy the bootstrapping key).
|
||||
void cuda_memcpy_async_to_gpu(void *dest, const void *src, uint64_t size,
|
||||
cudaStream_t stream, uint32_t gpu_index) {
|
||||
cuda_memcpy_with_size_tracking_async_to_gpu(dest, src, size, stream,
|
||||
gpu_index, true);
|
||||
}
|
||||
|
||||
/// Copy memory within a GPU asynchronously
|
||||
/// Copy memory within a GPU asynchronously.
|
||||
/// The copy only happens if gpu_memory_allocated is true
|
||||
void cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
|
||||
void *dest, void const *src, uint64_t size, cudaStream_t stream,
|
||||
uint32_t gpu_index, bool gpu_memory_allocated) {
|
||||
if (size == 0 || !gpu_memory_allocated)
|
||||
return;
|
||||
GPU_ASSERT(dest != nullptr,
|
||||
"Cuda error: trying to copy gpu->gpu to null ptr");
|
||||
GPU_ASSERT(src != nullptr,
|
||||
"Cuda error: trying to copy gpu->gpu from null ptr");
|
||||
|
||||
cudaPointerAttributes attr_dest;
|
||||
check_cuda_error(cudaPointerGetAttributes(&attr_dest, dest));
|
||||
PANIC_IF_FALSE(
|
||||
attr_dest.type == cudaMemoryTypeDevice,
|
||||
"Cuda error: invalid dest device pointer in copy from GPU to GPU.");
|
||||
cudaPointerAttributes attr_src;
|
||||
check_cuda_error(cudaPointerGetAttributes(&attr_src, src));
|
||||
PANIC_IF_FALSE(
|
||||
attr_src.type == cudaMemoryTypeDevice,
|
||||
"Cuda error: invalid src device pointer in copy from GPU to GPU.");
|
||||
int src_gpu_index = validate_device_ptr(src);
|
||||
int dest_gpu_index = validate_device_ptr(dest);
|
||||
cuda_set_device(gpu_index);
|
||||
if (attr_src.device == attr_dest.device) {
|
||||
if (src_gpu_index == dest_gpu_index) {
|
||||
check_cuda_error(
|
||||
cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToDevice, stream));
|
||||
} else {
|
||||
check_cuda_error(cudaMemcpyPeerAsync(dest, attr_dest.device, src,
|
||||
attr_src.device, size, stream));
|
||||
check_cuda_error(cudaMemcpyPeerAsync(dest, dest_gpu_index, src,
|
||||
src_gpu_index, size, stream));
|
||||
}
|
||||
}
|
||||
void cuda_memcpy_async_gpu_to_gpu(void *dest, void const *src, uint64_t size,
|
||||
@@ -327,21 +350,20 @@ void cuda_synchronize_device(uint32_t gpu_index) {
|
||||
check_cuda_error(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
/// cuda_memset sets bytes, we basically only use it to initialize data to 0
|
||||
/// The memset only happens if gpu_memory_allocated is true
|
||||
void cuda_memset_with_size_tracking_async(void *dest, uint64_t val,
|
||||
uint64_t size, cudaStream_t stream,
|
||||
uint32_t gpu_index,
|
||||
bool gpu_memory_allocated) {
|
||||
if (size == 0 || !gpu_memory_allocated)
|
||||
return;
|
||||
cudaPointerAttributes attr;
|
||||
check_cuda_error(cudaPointerGetAttributes(&attr, dest));
|
||||
if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
|
||||
PANIC("Cuda error: invalid dest device pointer in cuda memset.")
|
||||
}
|
||||
validate_device_ptr_and_gpu_index(dest, gpu_index);
|
||||
cuda_set_device(gpu_index);
|
||||
check_cuda_error(cudaMemsetAsync(dest, val, size, stream));
|
||||
}
|
||||
|
||||
/// cuda_memset sets bytes, we basically only use it to initialize data to 0
|
||||
void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
|
||||
cudaStream_t stream, uint32_t gpu_index) {
|
||||
cuda_memset_with_size_tracking_async(dest, val, size, stream, gpu_index,
|
||||
@@ -366,7 +388,7 @@ void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
|
||||
}
|
||||
cuda_set_device(gpu_index);
|
||||
int block_size = 256;
|
||||
int num_blocks = (n + block_size - 1) / block_size;
|
||||
int num_blocks = CEIL_DIV(n, block_size);
|
||||
|
||||
// Launch the kernel
|
||||
cuda_set_value_kernel<Torus>
|
||||
@@ -384,15 +406,15 @@ template void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
|
||||
uint32_t n);
|
||||
|
||||
/// Copy memory to the CPU asynchronously
|
||||
/// This comes with a big penalty on performance even if the CPU
|
||||
/// memory is pinned (using cudaMallocHost for the CPU allocation),
|
||||
/// so it should be avoided at all costs
|
||||
void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
|
||||
cudaStream_t stream, uint32_t gpu_index) {
|
||||
GPU_ASSERT(dest != nullptr, "Cuda error: null host ptr");
|
||||
if (size == 0)
|
||||
return;
|
||||
cudaPointerAttributes attr;
|
||||
check_cuda_error(cudaPointerGetAttributes(&attr, src));
|
||||
if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
|
||||
PANIC("Cuda error: invalid src device pointer in copy to CPU async.")
|
||||
}
|
||||
validate_device_ptr_and_gpu_index(src, gpu_index);
|
||||
|
||||
cuda_set_device(gpu_index);
|
||||
check_cuda_error(
|
||||
|
||||
@@ -68,9 +68,15 @@ struct alignas(16) f128 {
|
||||
auto t = two_sum(a.lo, b.lo);
|
||||
|
||||
double hi = s.hi;
|
||||
#ifdef __CUDA_ARCH__
|
||||
double lo = __dadd_rn(s.lo, t.hi);
|
||||
hi = __dadd_rn(hi, lo);
|
||||
lo = __dsub_rn(lo, __dsub_rn(hi, s.hi));
|
||||
#else
|
||||
double lo = s.lo + t.hi;
|
||||
hi = hi + lo;
|
||||
lo = lo - (hi - s.hi);
|
||||
#endif
|
||||
|
||||
return f128(hi, lo + t.lo);
|
||||
}
|
||||
@@ -104,8 +110,13 @@ struct alignas(16) f128 {
|
||||
__host__ __device__ static f128 sub(const f128 &a, const f128 &b) {
|
||||
auto s = two_diff(a.hi, b.hi);
|
||||
auto t = two_diff(a.lo, b.lo);
|
||||
#ifdef __CUDA_ARCH__
|
||||
s = quick_two_sum(s.hi, __dadd_rn(s.lo, t.hi));
|
||||
return quick_two_sum(s.hi, __dadd_rn(s.lo, t.lo));
|
||||
#else
|
||||
s = quick_two_sum(s.hi, s.lo + t.hi);
|
||||
return quick_two_sum(s.hi, s.lo + t.lo);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Multiplication
|
||||
@@ -220,16 +231,16 @@ struct f128x2 {
|
||||
// Subtraction
|
||||
__host__ __device__ friend f128x2 operator-(const f128x2 &a,
|
||||
const f128x2 &b) {
|
||||
return f128x2(f128::add(a.re, f128(-b.re.hi, -b.re.lo)),
|
||||
f128::add(a.im, f128(-b.im.hi, -b.im.lo)));
|
||||
return f128x2(f128::sub_estimate(a.re, b.re),
|
||||
f128::sub_estimate(a.im, b.im));
|
||||
}
|
||||
|
||||
// Multiplication (complex multiplication)
|
||||
__host__ __device__ friend f128x2 operator*(const f128x2 &a,
|
||||
const f128x2 &b) {
|
||||
const f128 a_im_b_im = f128::mul(a.im, b.im);
|
||||
f128 real_part =
|
||||
f128::add(f128::mul(a.re, b.re),
|
||||
f128(-f128::mul(a.im, b.im).hi, -f128::mul(a.im, b.im).lo));
|
||||
f128::add(f128::mul(a.re, b.re), f128(-a_im_b_im.hi, -a_im_b_im.lo));
|
||||
f128 imag_part = f128::add(f128::mul(a.re, b.im), f128::mul(a.im, b.re));
|
||||
return f128x2(real_part, imag_part);
|
||||
}
|
||||
@@ -243,8 +254,8 @@ struct f128x2 {
|
||||
|
||||
// Subtraction-assignment operator
|
||||
__host__ __device__ f128x2 &operator-=(const f128x2 &other) {
|
||||
re = f128::add(re, f128(-other.re.hi, -other.re.lo));
|
||||
im = f128::add(im, f128(-other.im.hi, -other.im.lo));
|
||||
re = f128::sub_estimate(re, other.re);
|
||||
im = f128::sub_estimate(im, other.im);
|
||||
return *this;
|
||||
}
|
||||
|
||||
@@ -261,12 +272,20 @@ struct f128x2 {
|
||||
};
|
||||
|
||||
__host__ __device__ inline uint64_t double_to_bits(double d) {
|
||||
#ifdef __CUDA_ARCH__
|
||||
uint64_t bits = __double_as_longlong(d);
|
||||
#else
|
||||
uint64_t bits = *reinterpret_cast<uint64_t *>(&d);
|
||||
#endif
|
||||
return bits;
|
||||
}
|
||||
|
||||
__host__ __device__ inline double bits_to_double(uint64_t bits) {
|
||||
#ifdef __CUDA_ARCH__
|
||||
double d = __longlong_as_double(bits);
|
||||
#else
|
||||
double d = *reinterpret_cast<double *>(&bits);
|
||||
#endif
|
||||
return d;
|
||||
}
|
||||
|
||||
@@ -275,6 +294,8 @@ __host__ __device__ inline double u128_to_f64(__uint128_t x) {
|
||||
const double A = ONE << 52;
|
||||
const double B = ONE << 104;
|
||||
const double C = ONE << 76;
|
||||
// NOTE: for some reason __longlong_as_double(0x37f0000000000000ULL)
|
||||
// does not work here
|
||||
const double D = 340282366920938500000000000000000000000.;
|
||||
|
||||
const __uint128_t threshold = (ONE << 104);
|
||||
@@ -288,15 +309,20 @@ __host__ __device__ inline double u128_to_f64(__uint128_t x) {
|
||||
|
||||
uint64_t bits_l = A_bits | lower64;
|
||||
double l_temp = bits_to_double(bits_l);
|
||||
double l = l_temp - A;
|
||||
|
||||
uint64_t B_bits = double_to_bits(B);
|
||||
uint64_t top64 = static_cast<uint64_t>(x >> 52);
|
||||
uint64_t bits_h = B_bits | top64;
|
||||
double h_temp = bits_to_double(bits_h);
|
||||
|
||||
#ifdef __CUDA_ARCH__
|
||||
return __dadd_rn(__dsub_rn(l_temp, A), __dsub_rn(h_temp, B));
|
||||
#else
|
||||
double l = l_temp - A;
|
||||
double h = h_temp - B;
|
||||
|
||||
return (l + h);
|
||||
#endif
|
||||
|
||||
} else {
|
||||
uint64_t C_bits = double_to_bits(C);
|
||||
@@ -310,15 +336,20 @@ __host__ __device__ inline double u128_to_f64(__uint128_t x) {
|
||||
|
||||
uint64_t bits_l = C_bits | lower64 | mask_part;
|
||||
double l_temp = bits_to_double(bits_l);
|
||||
double l = l_temp - C;
|
||||
|
||||
uint64_t D_bits = double_to_bits(D);
|
||||
uint64_t top64 = static_cast<uint64_t>(x >> 76);
|
||||
uint64_t bits_h = D_bits | top64;
|
||||
double h_temp = bits_to_double(bits_h);
|
||||
|
||||
#ifdef __CUDA_ARCH__
|
||||
return __dadd_rn(__dsub_rn(l_temp, C), __dsub_rn(h_temp, D));
|
||||
#else
|
||||
double l = l_temp - C;
|
||||
double h = h_temp - D;
|
||||
|
||||
return (l + h);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
@@ -389,6 +420,8 @@ __host__ __device__ inline f128 u128_to_signed_to_f128(__uint128_t x) {
|
||||
|
||||
__host__ __device__ inline __uint128_t u128_from_torus_f128(const f128 &a) {
|
||||
auto x = f128::sub_estimate(a, f128::f128_floor(a));
|
||||
// NOTE: for some reason __longlong_as_double(0x37f0000000000000ULL)
|
||||
// does not work here
|
||||
const double normalization = 340282366920938500000000000000000000000.;
|
||||
#ifdef __CUDA_ARCH__
|
||||
x.hi = __dmul_rn(x.hi, normalization);
|
||||
@@ -398,7 +431,7 @@ __host__ __device__ inline __uint128_t u128_from_torus_f128(const f128 &a) {
|
||||
x.lo *= normalization;
|
||||
#endif
|
||||
|
||||
// TODO has to be round
|
||||
x = f128::add_estimate(x, f128(0.5, 0.0));
|
||||
x = f128::f128_floor(x);
|
||||
|
||||
__uint128_t x0 = f64_to_u128(x.hi);
|
||||
|
||||
@@ -12,8 +12,9 @@
|
||||
using Index = unsigned;
|
||||
|
||||
#define NEG_TWID(i) \
|
||||
f128x2(f128(neg_twiddles_re_hi[(i)], neg_twiddles_re_lo[(i)]), \
|
||||
f128(neg_twiddles_im_hi[(i)], neg_twiddles_im_lo[(i)]))
|
||||
f128x2( \
|
||||
f128(__ldg(&neg_twiddles_re_hi[(i)]), __ldg(&neg_twiddles_re_lo[(i)])), \
|
||||
f128(__ldg(&neg_twiddles_im_hi[(i)]), __ldg(&neg_twiddles_im_lo[(i)])))
|
||||
|
||||
#define F64x4_TO_F128x2(f128x2_reg, ind) \
|
||||
f128x2_reg.re.hi = dt_re_hi[ind]; \
|
||||
@@ -75,7 +76,11 @@ __device__ void negacyclic_forward_fft_f128(double *dt_re_hi, double *dt_re_lo,
|
||||
for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
|
||||
Index rank = tid & thread_mask;
|
||||
bool u_stays_in_register = rank < lane_mask;
|
||||
F128x2_TO_F64x4(((u_stays_in_register) ? v[i] : u[i]), tid);
|
||||
if (u_stays_in_register) {
|
||||
F128x2_TO_F64x4(v[i], tid);
|
||||
} else {
|
||||
F128x2_TO_F64x4(u[i], tid);
|
||||
}
|
||||
tid = tid + STRIDE;
|
||||
}
|
||||
__syncthreads();
|
||||
@@ -86,8 +91,11 @@ __device__ void negacyclic_forward_fft_f128(double *dt_re_hi, double *dt_re_lo,
|
||||
Index rank = tid & thread_mask;
|
||||
bool u_stays_in_register = rank < lane_mask;
|
||||
F64x4_TO_F128x2(w, tid ^ lane_mask);
|
||||
u[i] = (u_stays_in_register) ? u[i] : w;
|
||||
v[i] = (u_stays_in_register) ? w : v[i];
|
||||
if (u_stays_in_register) {
|
||||
v[i] = w;
|
||||
} else {
|
||||
u[i] = w;
|
||||
}
|
||||
w = NEG_TWID(tid / lane_mask + twiddle_shift);
|
||||
f128::cplx_f128_mul_assign(w.re, w.im, v[i].re, v[i].im, w.re, w.im);
|
||||
f128::cplx_f128_sub_assign(v[i].re, v[i].im, u[i].re, u[i].im, w.re,
|
||||
@@ -151,7 +159,11 @@ __device__ void negacyclic_backward_fft_f128(double *dt_re_hi, double *dt_re_lo,
|
||||
// keep one of the register for next iteration and store another one in sm
|
||||
Index rank = tid & thread_mask;
|
||||
bool u_stays_in_register = rank < lane_mask;
|
||||
F128x2_TO_F64x4(((u_stays_in_register) ? v[i] : u[i]), tid);
|
||||
if (u_stays_in_register) {
|
||||
F128x2_TO_F64x4(v[i], tid);
|
||||
} else {
|
||||
F128x2_TO_F64x4(u[i], tid);
|
||||
}
|
||||
|
||||
tid = tid + STRIDE;
|
||||
}
|
||||
@@ -165,8 +177,11 @@ __device__ void negacyclic_backward_fft_f128(double *dt_re_hi, double *dt_re_lo,
|
||||
bool u_stays_in_register = rank < lane_mask;
|
||||
F64x4_TO_F128x2(w, tid ^ lane_mask);
|
||||
|
||||
u[i] = (u_stays_in_register) ? u[i] : w;
|
||||
v[i] = (u_stays_in_register) ? w : v[i];
|
||||
if (u_stays_in_register) {
|
||||
v[i] = w;
|
||||
} else {
|
||||
u[i] = w;
|
||||
}
|
||||
|
||||
tid = tid + STRIDE;
|
||||
}
|
||||
@@ -218,7 +233,7 @@ __device__ void convert_u128_to_f128_as_torus(
|
||||
double *out_re_hi, double *out_re_lo, double *out_im_hi, double *out_im_lo,
|
||||
const __uint128_t *in_re, const __uint128_t *in_im) {
|
||||
|
||||
const double normalization = pow(2., -128.);
|
||||
const double normalization = __longlong_as_double(0x37f0000000000000ULL);
|
||||
Index tid = threadIdx.x;
|
||||
// #pragma unroll
|
||||
for (Index i = 0; i < params::opt / 2; i++) {
|
||||
@@ -241,7 +256,7 @@ __device__ void convert_u128_on_regs_to_f128_as_torus(
|
||||
double *out_re_hi, double *out_re_lo, double *out_im_hi, double *out_im_lo,
|
||||
const __uint128_t *in_re_on_regs, const __uint128_t *in_im_on_regs) {
|
||||
|
||||
const double normalization = pow(2., -128.);
|
||||
const double normalization = __longlong_as_double(0x37f0000000000000ULL);
|
||||
Index tid = threadIdx.x;
|
||||
// #pragma unroll
|
||||
for (Index i = 0; i < params::opt / 2; i++) {
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
#include "integer/subtraction.cuh"
|
||||
#include "pbs/programmable_bootstrap_classic.cuh"
|
||||
#include "pbs/programmable_bootstrap_multibit.cuh"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
#include "utils/helper.cuh"
|
||||
|
||||
// lwe_dimension + 1 threads
|
||||
// todo: This kernel MUST be refactored to a binary reduction
|
||||
@@ -98,7 +98,7 @@ __host__ void are_all_comparisons_block_true(
|
||||
|
||||
while (remaining_blocks > 0) {
|
||||
// Split in max_value chunks
|
||||
int num_chunks = (remaining_blocks + max_value - 1) / max_value;
|
||||
int num_chunks = CEIL_DIV(remaining_blocks, max_value);
|
||||
|
||||
// Since all blocks encrypt either 0 or 1, we can sum max_value of them
|
||||
// as in the worst case we will be adding `max_value` ones
|
||||
@@ -222,7 +222,7 @@ __host__ void is_at_least_one_comparisons_block_true(
|
||||
uint32_t remaining_blocks = num_radix_blocks;
|
||||
while (remaining_blocks > 0) {
|
||||
// Split in max_value chunks
|
||||
int num_chunks = (remaining_blocks + max_value - 1) / max_value;
|
||||
int num_chunks = CEIL_DIV(remaining_blocks, max_value);
|
||||
|
||||
// Since all blocks encrypt either 0 or 1, we can sum max_value of them
|
||||
// as in the worst case we will be adding `max_value` ones
|
||||
|
||||
@@ -10,7 +10,122 @@
|
||||
#include "integer/integer.cuh"
|
||||
#include "linearalgebra/multiplication.cuh"
|
||||
#include "polynomial/functions.cuh"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
#include "utils/helper.cuh"
|
||||
|
||||
/*
|
||||
* =============================================================================
|
||||
* GPU Compression/Decompression Algorithm: Overview
|
||||
* =============================================================================
|
||||
*
|
||||
* The compression algorithm transforms standard LWE ciphertexts into a compact
|
||||
* packed format. Decompression reverses this process.
|
||||
*
|
||||
* -----------------------------------------------------------------------------
|
||||
* COMPRESSION INPUT (lwe_array_in)
|
||||
* -----------------------------------------------------------------------------
|
||||
*
|
||||
* +-------------------------------------------------------------------------+
|
||||
* | lwe_array_in (GPU memory) |
|
||||
* +-------------------------------------------------------------------------+
|
||||
* +---------------------------+---------------------------+-----------------+
|
||||
* | LWE 0 | LWE 1 | ... |
|
||||
* | [mask, body] | [mask, body] | |
|
||||
* +---------------------------+---------------------------+-----------------+
|
||||
* |<-- lwe_dimension + 1 -->|
|
||||
*
|
||||
* Total LWEs: total_lwe_bodies_count (num_radix_blocks)
|
||||
*
|
||||
* -----------------------------------------------------------------------------
|
||||
* COMPRESSION PROCESS
|
||||
* -----------------------------------------------------------------------------
|
||||
*
|
||||
* 1. Message Shift (64-bit only):
|
||||
* Each LWE is multiplied by message_modulus to shift the message to MSB
|
||||
*
|
||||
* 2. Packing Keyswitch (LWE -> GLWE):
|
||||
* Groups of up to lwe_per_glwe LWEs are packed into a single GLWE:
|
||||
*
|
||||
* +--------------------------------------------------------------+
|
||||
* | lwe_per_glwe LWEs (input batch) |
|
||||
* | LWE[0], LWE[1], ..., LWE[lwe_per_glwe-1] |
|
||||
* +--------------------------------------------------------------+
|
||||
* |
|
||||
* Packing Keyswitch
|
||||
* v
|
||||
* +--------------------------------------------------------------+
|
||||
* | Single GLWE Ciphertext |
|
||||
* | [A_0, A_1, ..., A_{k-1}, B] |
|
||||
* | |<-- k * polynomial_size -->| |<-- polynomial_size -->| |
|
||||
* +--------------------------------------------------------------+
|
||||
*
|
||||
* Number of output GLWEs: num_glwes = ceil(total_lwe_bodies_count /
|
||||
* lwe_per_glwe)
|
||||
*
|
||||
* 3. Modulus Switch:
|
||||
* Reduce precision from 64-bit torus to storage_log_modulus bits
|
||||
*
|
||||
* 4. Bit Packing:
|
||||
* Pack multiple reduced-precision elements into dense bit representation
|
||||
*
|
||||
* -----------------------------------------------------------------------------
|
||||
* COMPRESSION MEMORY LAYOUT (tmp_glwe_array_out)
|
||||
* -----------------------------------------------------------------------------
|
||||
*
|
||||
* +-------------------------------------------------------------------------+
|
||||
* | tmp_glwe_array_out (intermediate buffer) |
|
||||
* +-------------------------------------------------------------------------+
|
||||
* +----------------------------+----------------------------+---------------+
|
||||
* | GLWE 0 | GLWE 1 | ... |
|
||||
* | [A_0..A_{k-1}, B_0..B_N] | [A_0..A_{k-1}, B_0..B_N] | |
|
||||
* +----------------------------+----------------------------+---------------+
|
||||
* |<-- glwe_accumulator_size = (k+1)*N -->|
|
||||
*
|
||||
* Total size needed: num_glwes * glwe_accumulator_size elements
|
||||
* Where: num_glwes = ceil(total_lwe_bodies_count / lwe_per_glwe)
|
||||
*
|
||||
* -----------------------------------------------------------------------------
|
||||
* PACKED OUTPUT (glwe_array_out)
|
||||
* -----------------------------------------------------------------------------
|
||||
*
|
||||
* +-------------------------------------------------------------------------+
|
||||
* | Packed GLWE Ciphertext List (bit-packed) |
|
||||
* +-------------------------------------------------------------------------+
|
||||
* +-------------------------------------------------------------------------+
|
||||
* | Elements packed with storage_log_modulus bits per original element |
|
||||
* | Total packed size: ceil(in_len * storage_log_modulus / 64) elements |
|
||||
* +-------------------------------------------------------------------------+
|
||||
*
|
||||
* =============================================================================
|
||||
* DECOMPRESSION (Extract) Algorithm
|
||||
* =============================================================================
|
||||
*
|
||||
* Decompression receives an array of LWE indexes. For each index, it identifies
|
||||
* the corresponding GLWE, extracts that GLWE from the packed representation,
|
||||
* and then sample-extracts the requested LWE from the GLWE.
|
||||
*
|
||||
* -----------------------------------------------------------------------------
|
||||
* EXTRACT OUTPUT LAYOUT (glwe_array_out in host_extract)
|
||||
* -----------------------------------------------------------------------------
|
||||
*
|
||||
* +-------------------------------------------------------------------------+
|
||||
* | Extracted GLWE Ciphertext |
|
||||
* +-------------------------------------------------------------------------+
|
||||
* +---------------------------------------+-----------------+---------------+
|
||||
* | Mask (A polynomials) | Body (B) | Tail |
|
||||
* | [A_0, ..., A_{k-1}] | (body_count) | (zeroed) |
|
||||
* | k * polynomial_size elements | elements | elements |
|
||||
* +---------------------------------------+-----------------+---------------+
|
||||
* |<------------------- initial_out_len ------------------->| |
|
||||
* |<------------------------ glwe_ciphertext_size ------------------------->|
|
||||
*
|
||||
* For the last GLWE, body_count may be less than polynomial_size (partial).
|
||||
* The tail region must be zeroed to ensure defined behavior.
|
||||
*
|
||||
* tail_size = glwe_ciphertext_size - initial_out_len
|
||||
* tail_offset = initial_out_len (NOT 0!)
|
||||
*
|
||||
* =============================================================================
|
||||
*/
|
||||
|
||||
template <typename Torus>
|
||||
__global__ void pack(Torus *array_out, Torus *array_in, uint32_t log_modulus,
|
||||
@@ -66,7 +181,7 @@ __host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,
|
||||
|
||||
// number_bits_to_pack.div_ceil(Scalar::BITS)
|
||||
auto nbits = sizeof(Torus) * 8;
|
||||
auto out_len = (number_bits_to_pack + nbits - 1) / nbits;
|
||||
auto out_len = CEIL_DIV(number_bits_to_pack, nbits);
|
||||
|
||||
int num_blocks = 0, num_threads = 0;
|
||||
getNumBlocksAndThreads(out_len, 1024, num_blocks, num_threads);
|
||||
@@ -108,6 +223,8 @@ host_integer_compress(CudaStreams streams,
|
||||
uint32_t num_glwes = (glwe_array_out->total_lwe_bodies_count +
|
||||
glwe_array_out->lwe_per_glwe - 1) /
|
||||
glwe_array_out->lwe_per_glwe;
|
||||
PANIC_IF_FALSE(num_glwes <= mem_ptr->max_num_glwes,
|
||||
"Invalid number of GLWEs");
|
||||
|
||||
// Keyswitch LWEs to GLWE
|
||||
auto tmp_glwe_array_out = mem_ptr->tmp_glwe_array_out;
|
||||
@@ -200,8 +317,7 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
|
||||
|
||||
auto glwe_ciphertext_size = (glwe_dimension + 1) * polynomial_size;
|
||||
|
||||
uint32_t num_glwes =
|
||||
(total_lwe_bodies_count + polynomial_size - 1) / polynomial_size;
|
||||
uint32_t num_glwes = CEIL_DIV(total_lwe_bodies_count, polynomial_size);
|
||||
|
||||
// Compressed length of the compressed GLWE we want to extract
|
||||
uint32_t body_count = 0;
|
||||
@@ -218,19 +334,21 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
|
||||
|
||||
uint32_t initial_out_len = glwe_dimension * polynomial_size + body_count;
|
||||
|
||||
// Calculates how many bits this particular GLWE shall use
|
||||
auto number_bits_to_unpack = initial_out_len * log_modulus;
|
||||
auto nbits = sizeof(Torus) * 8;
|
||||
|
||||
// Calculates how many bits a full-packed GLWE shall use
|
||||
number_bits_to_unpack = glwe_ciphertext_size * log_modulus;
|
||||
auto len = (number_bits_to_unpack + nbits - 1) / nbits;
|
||||
// Calculate how many bits a full-packed GLWE uses, to determine
|
||||
// the stride between consecutive packed GLWEs in the input buffer
|
||||
auto number_bits_to_unpack = glwe_ciphertext_size * log_modulus;
|
||||
auto len = CEIL_DIV(number_bits_to_unpack, nbits);
|
||||
// Uses that length to set the input pointer
|
||||
auto chunk_array_in = (Torus *)array_in->ptr + glwe_index * len;
|
||||
|
||||
// Ensure the tail of the GLWE is zeroed
|
||||
// The extract kernel writes initial_out_len elements starting at offset 0.
|
||||
// We must zero the tail region (from initial_out_len to
|
||||
// glwe_ciphertext_size)
|
||||
if (initial_out_len < glwe_ciphertext_size) {
|
||||
cuda_memset_async(glwe_array_out, 0,
|
||||
cuda_memset_async(glwe_array_out + initial_out_len, 0,
|
||||
(glwe_ciphertext_size - initial_out_len) * sizeof(Torus),
|
||||
stream, gpu_index);
|
||||
}
|
||||
@@ -368,7 +486,7 @@ host_integer_decompress(CudaStreams streams,
|
||||
/// gather data to GPU 0 we can copy back to the original indexing
|
||||
multi_gpu_scatter_lwe_async<Torus>(
|
||||
active_streams, lwe_array_in_vec, extracted_lwe, lut->lwe_indexes_in,
|
||||
lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec,
|
||||
lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec, lut->event_pool,
|
||||
lut->active_streams.count(), num_blocks_to_decompress,
|
||||
compression_params.small_lwe_dimension + 1);
|
||||
|
||||
@@ -388,7 +506,7 @@ host_integer_decompress(CudaStreams streams,
|
||||
multi_gpu_gather_lwe_async<Torus>(
|
||||
active_streams, (Torus *)d_lwe_array_out->ptr, lwe_after_pbs_vec,
|
||||
lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
|
||||
lut->lwe_aligned_vec, num_blocks_to_decompress,
|
||||
lut->lwe_aligned_vec, lut->event_pool, num_blocks_to_decompress,
|
||||
encryption_params.big_lwe_dimension + 1);
|
||||
|
||||
/// Synchronize all GPUs
|
||||
|
||||
@@ -14,7 +14,6 @@
|
||||
#include "utils/helper.cuh"
|
||||
#include "utils/helper_multi_gpu.cuh"
|
||||
#include "utils/helper_profile.cuh"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
#include <algorithm>
|
||||
#include <functional>
|
||||
|
||||
@@ -273,8 +272,7 @@ __global__ void device_radix_split_simulators_and_grouping_pgns(
|
||||
}
|
||||
}
|
||||
|
||||
if ((blockIdx.x / group_size + 1) <
|
||||
(blocks_count + group_size - 1) / group_size) {
|
||||
if ((blockIdx.x / group_size + 1) < CEIL_DIV(blocks_count, group_size)) {
|
||||
size_t src_offset = (blockIdx.x + group_size - 1) * lwe_size;
|
||||
size_t pgns_offset = (blockIdx.x / group_size) * lwe_size;
|
||||
for (int j = threadIdx.x; j < lwe_size; j += blockDim.x) {
|
||||
@@ -363,7 +361,7 @@ __host__ void host_radix_sum_in_groups(cudaStream_t stream, uint32_t gpu_index,
|
||||
num_radix_blocks > src1->num_radix_blocks)
|
||||
PANIC("Cuda error: input and output num radix blocks should have more "
|
||||
"blocks than the number used in sum in groups")
|
||||
auto num_groups = (num_radix_blocks + group_size - 1) / group_size;
|
||||
auto num_groups = CEIL_DIV(num_radix_blocks, group_size);
|
||||
if (src2->num_radix_blocks < num_groups)
|
||||
PANIC("Cuda error: second input in sum in groups should have at least "
|
||||
"num_groups blocks")
|
||||
@@ -570,8 +568,8 @@ __host__ void integer_radix_apply_univariate_lookup_table(
|
||||
multi_gpu_scatter_lwe_async<Torus>(
|
||||
active_streams, lwe_array_in_vec, (Torus *)lwe_array_in->ptr,
|
||||
lut->lwe_indexes_in, lut->using_trivial_lwe_indexes,
|
||||
lut->lwe_aligned_vec, lut->active_streams.count(), num_radix_blocks,
|
||||
big_lwe_dimension + 1);
|
||||
lut->lwe_aligned_vec, lut->event_pool, lut->active_streams.count(),
|
||||
num_radix_blocks, big_lwe_dimension + 1);
|
||||
POP_RANGE()
|
||||
/// Apply KS to go from a big LWE dimension to a small LWE dimension
|
||||
execute_keyswitch_async<Torus>(
|
||||
@@ -594,7 +592,8 @@ __host__ void integer_radix_apply_univariate_lookup_table(
|
||||
multi_gpu_gather_lwe_async<Torus>(
|
||||
active_streams, (Torus *)lwe_array_out->ptr, lwe_after_pbs_vec,
|
||||
lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
|
||||
lut->lwe_aligned_vec, num_radix_blocks, big_lwe_dimension + 1);
|
||||
lut->lwe_aligned_vec, lut->event_pool, num_radix_blocks,
|
||||
big_lwe_dimension + 1);
|
||||
POP_RANGE()
|
||||
lut->multi_gpu_gather_barrier.stream_0_wait_for_local_streams(
|
||||
active_streams);
|
||||
@@ -674,8 +673,8 @@ __host__ void integer_radix_apply_many_univariate_lookup_table(
|
||||
multi_gpu_scatter_lwe_async<Torus>(
|
||||
active_streams, lwe_array_in_vec, (Torus *)lwe_array_in->ptr,
|
||||
lut->lwe_indexes_in, lut->using_trivial_lwe_indexes,
|
||||
lut->lwe_aligned_vec, lut->active_streams.count(), num_radix_blocks,
|
||||
big_lwe_dimension + 1);
|
||||
lut->lwe_aligned_vec, lut->event_pool, lut->active_streams.count(),
|
||||
num_radix_blocks, big_lwe_dimension + 1);
|
||||
POP_RANGE()
|
||||
/// Apply KS to go from a big LWE dimension to a small LWE dimension
|
||||
execute_keyswitch_async<Torus>(
|
||||
@@ -791,8 +790,8 @@ __host__ void integer_radix_apply_bivariate_lookup_table(
|
||||
multi_gpu_scatter_lwe_async<Torus>(
|
||||
active_streams, lwe_array_in_vec, (Torus *)lwe_array_pbs_in->ptr,
|
||||
lut->lwe_indexes_in, lut->using_trivial_lwe_indexes,
|
||||
lut->lwe_aligned_vec, lut->active_streams.count(), num_radix_blocks,
|
||||
big_lwe_dimension + 1);
|
||||
lut->lwe_aligned_vec, lut->event_pool, lut->active_streams.count(),
|
||||
num_radix_blocks, big_lwe_dimension + 1);
|
||||
POP_RANGE()
|
||||
/// Apply KS to go from a big LWE dimension to a small LWE dimension
|
||||
execute_keyswitch_async<Torus>(
|
||||
@@ -815,7 +814,8 @@ __host__ void integer_radix_apply_bivariate_lookup_table(
|
||||
multi_gpu_gather_lwe_async<Torus>(
|
||||
active_streams, (Torus *)(lwe_array_out->ptr), lwe_after_pbs_vec,
|
||||
lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
|
||||
lut->lwe_aligned_vec, num_radix_blocks, big_lwe_dimension + 1);
|
||||
lut->lwe_aligned_vec, lut->event_pool, num_radix_blocks,
|
||||
big_lwe_dimension + 1);
|
||||
POP_RANGE()
|
||||
lut->multi_gpu_gather_barrier.stream_0_wait_for_local_streams(
|
||||
active_streams);
|
||||
@@ -1067,85 +1067,6 @@ void generate_device_accumulator_bivariate(
|
||||
POP_RANGE()
|
||||
}
|
||||
|
||||
template <typename Torus> struct int_lut_cache {
|
||||
int_lut_cache() {}
|
||||
|
||||
Torus *get_cached_univariate_lut(std::function<Torus(Torus)> &f, uint64_t *degree,
|
||||
uint64_t *max_degree, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t input_message_modulus,
|
||||
uint32_t input_carry_modulus,
|
||||
uint32_t output_message_modulus,
|
||||
uint32_t output_carry_modulus) {
|
||||
/*__int128_t f_hash = 0;
|
||||
uint32_t bits_per_lut_val = 5;
|
||||
uint32_t input_modulus_sup = input_message_modulus * input_carry_modulus;
|
||||
for (uint32_t i = 0; i < input_modulus_sup; ++i) {
|
||||
Torus f_eval = f(i);
|
||||
GPU_ASSERT(f_eval < (1 << bits_per_lut_val),
|
||||
"LUT value expected bitwidth overflow");
|
||||
f_hash |= f_eval;
|
||||
f_hash <<= bits_per_lut_val;
|
||||
}
|
||||
|
||||
std::lock_guard cache_lock(_mutex);
|
||||
if (_lut_cache.find(f_hash) != _lut_cache.end()) {
|
||||
lut_ptr &ptr = _lut_cache[f_hash];
|
||||
GPU_ASSERT(ptr.output_message_modulus == output_message_modulus,
|
||||
"Error modulus");
|
||||
GPU_ASSERT(ptr.input_message_modulus == input_message_modulus,
|
||||
"Error modulus");
|
||||
GPU_ASSERT(ptr.glwe_dimension == glwe_dimension, "Error modulus");
|
||||
*max_degree = ptr.max_degree;
|
||||
*degree = ptr.degree;
|
||||
return ptr.ptr;
|
||||
}*/
|
||||
|
||||
// host lut
|
||||
Torus *h_lut =
|
||||
(Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
|
||||
|
||||
*max_degree = input_message_modulus * input_carry_modulus - 1;
|
||||
*degree = generate_lookup_table_with_encoding<Torus>(
|
||||
h_lut, glwe_dimension, polynomial_size, input_message_modulus,
|
||||
input_carry_modulus, output_message_modulus, output_carry_modulus, f);
|
||||
|
||||
/*lut_ptr new_ptr = {h_lut,
|
||||
glwe_dimension,
|
||||
input_message_modulus,
|
||||
input_carry_modulus,
|
||||
output_message_modulus,
|
||||
output_carry_modulus,
|
||||
*max_degree,
|
||||
*degree};*/
|
||||
//_lut_cache[f_hash] = new_ptr;
|
||||
return h_lut;
|
||||
}
|
||||
|
||||
~int_lut_cache() {
|
||||
std::lock_guard cache_lock(_mutex);
|
||||
for (auto v : _lut_cache) {
|
||||
free(v.second.ptr);
|
||||
}
|
||||
_lut_cache.clear();
|
||||
}
|
||||
|
||||
private:
|
||||
struct lut_ptr {
|
||||
Torus *ptr;
|
||||
uint32_t glwe_dimension;
|
||||
uint32_t input_message_modulus;
|
||||
uint32_t input_carry_modulus;
|
||||
uint32_t output_message_modulus;
|
||||
uint32_t output_carry_modulus;
|
||||
uint64_t max_degree;
|
||||
uint64_t degree;
|
||||
};
|
||||
std::map<__int128_t, lut_ptr> _lut_cache;
|
||||
std::mutex _mutex;
|
||||
};
|
||||
static int_lut_cache<uint64_t> g_LutCache64;
|
||||
|
||||
/*
|
||||
* generate bivariate accumulator with factor scaling for device pointer
|
||||
* v_stream - cuda stream
|
||||
@@ -1177,8 +1098,8 @@ void generate_device_accumulator_bivariate_with_factor(
|
||||
(glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream, gpu_index,
|
||||
gpu_memory_allocated);
|
||||
|
||||
// cuda_synchronize_stream(stream, gpu_index);
|
||||
// free(h_lut);
|
||||
cuda_synchronize_stream(stream, gpu_index);
|
||||
free(h_lut);
|
||||
}
|
||||
/*
|
||||
* generate bivariate accumulator for device pointer
|
||||
@@ -1224,36 +1145,23 @@ void generate_device_accumulator_with_encoding(
|
||||
uint32_t output_message_modulus, uint32_t output_carry_modulus,
|
||||
std::function<Torus(Torus)> f, bool gpu_memory_allocated) {
|
||||
|
||||
static constexpr auto is_u64 = std::is_same_v<Torus, uint64_t>;
|
||||
Torus *h_lut = nullptr;
|
||||
// host lut
|
||||
if constexpr (is_u64) {
|
||||
h_lut = g_LutCache64.get_cached_univariate_lut(
|
||||
f, degree, max_degree, glwe_dimension, polynomial_size,
|
||||
input_message_modulus, input_carry_modulus, output_message_modulus,
|
||||
output_carry_modulus);
|
||||
} else {
|
||||
h_lut =
|
||||
(Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
|
||||
Torus *h_lut =
|
||||
(Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
|
||||
|
||||
*max_degree = input_message_modulus * input_carry_modulus - 1;
|
||||
// fill accumulator
|
||||
*degree = generate_lookup_table_with_encoding<Torus>(
|
||||
h_lut, glwe_dimension, polynomial_size, input_message_modulus,
|
||||
input_carry_modulus, output_message_modulus, output_carry_modulus, f);
|
||||
|
||||
*max_degree = input_message_modulus * input_carry_modulus - 1;
|
||||
// fill accumulator
|
||||
*degree = generate_lookup_table_with_encoding<Torus>(
|
||||
h_lut, glwe_dimension, polynomial_size, input_message_modulus,
|
||||
input_carry_modulus, output_message_modulus, output_carry_modulus, f);
|
||||
}
|
||||
/*
|
||||
// copy host lut and lut_indexes_vec to device
|
||||
cuda_memcpy_with_size_tracking_async_to_gpu(
|
||||
acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
|
||||
stream, gpu_index, gpu_memory_allocated);
|
||||
*/
|
||||
if (!std::is_same_v<Torus, uint64_t>) {
|
||||
cuda_synchronize_stream(stream, gpu_index);
|
||||
free(h_lut);
|
||||
}
|
||||
cuda_synchronize_stream(stream, gpu_index);
|
||||
free(h_lut);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
void generate_device_accumulator_with_encoding_with_cpu_prealloc(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus *acc, uint64_t *degree,
|
||||
@@ -1356,8 +1264,8 @@ void generate_many_lut_device_accumulator(
|
||||
acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
|
||||
stream, gpu_index, gpu_memory_allocated);
|
||||
|
||||
//cuda_synchronize_stream(stream, gpu_index);
|
||||
//free(h_lut);
|
||||
cuda_synchronize_stream(stream, gpu_index);
|
||||
free(h_lut);
|
||||
POP_RANGE()
|
||||
}
|
||||
|
||||
@@ -2434,7 +2342,7 @@ integer_radix_apply_noise_squashing(CudaStreams streams,
|
||||
|
||||
// Since the radix ciphertexts are packed, we have to use the num_radix_blocks
|
||||
// from the output ct
|
||||
auto active_streams = streams.active_gpu_subset(
|
||||
auto active_streams = streams.active_gpu_subset_u128(
|
||||
lwe_array_out->num_radix_blocks, params.pbs_type);
|
||||
if (active_streams.count() == 1) {
|
||||
execute_keyswitch_async<InputTorus>(
|
||||
@@ -2465,8 +2373,9 @@ integer_radix_apply_noise_squashing(CudaStreams streams,
|
||||
multi_gpu_scatter_lwe_async<InputTorus>(
|
||||
active_streams, lwe_array_in_vec, (InputTorus *)lwe_array_pbs_in->ptr,
|
||||
lut->lwe_indexes_in, lut->using_trivial_lwe_indexes,
|
||||
lut->lwe_aligned_scatter_vec, lut->active_streams.count(),
|
||||
lwe_array_out->num_radix_blocks, lut->input_big_lwe_dimension + 1);
|
||||
lut->lwe_aligned_scatter_vec, lut->event_pool,
|
||||
lut->active_streams.count(), lwe_array_out->num_radix_blocks,
|
||||
lut->input_big_lwe_dimension + 1);
|
||||
|
||||
execute_keyswitch_async<InputTorus>(
|
||||
active_streams, lwe_after_ks_vec, lwe_trivial_indexes_vec,
|
||||
@@ -2489,7 +2398,8 @@ integer_radix_apply_noise_squashing(CudaStreams streams,
|
||||
multi_gpu_gather_lwe_async<__uint128_t>(
|
||||
active_streams, (__uint128_t *)lwe_array_out->ptr, lwe_after_pbs_vec,
|
||||
nullptr, lut->using_trivial_lwe_indexes, lut->lwe_aligned_gather_vec,
|
||||
lwe_array_out->num_radix_blocks, big_lwe_dimension + 1);
|
||||
lut->event_pool, lwe_array_out->num_radix_blocks,
|
||||
big_lwe_dimension + 1);
|
||||
|
||||
/// Synchronize all GPUs
|
||||
streams.synchronize();
|
||||
|
||||
@@ -362,7 +362,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec(
|
||||
radix_columns current_columns(current_blocks->degrees, num_radix_blocks,
|
||||
num_radix_in_vec, chunk_size, needs_processing);
|
||||
int number_of_threads = std::min(256, (int)mem_ptr->params.polynomial_size);
|
||||
int part_count = (big_lwe_size + number_of_threads - 1) / number_of_threads;
|
||||
int part_count = CEIL_DIV(big_lwe_size, number_of_threads);
|
||||
const dim3 number_of_blocks_2d(num_radix_blocks, part_count, 1);
|
||||
|
||||
mem_ptr->setup_lookup_tables(streams, num_radix_in_vec,
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
#include "device.h"
|
||||
#include "integer/integer.cuh"
|
||||
#include "integer/integer_utilities.h"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
#include "utils/helper.cuh"
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
|
||||
@@ -54,7 +54,7 @@ void host_integer_grouped_oprf(CudaStreams streams,
|
||||
PUSH_RANGE("scatter")
|
||||
multi_gpu_scatter_lwe_async<Torus>(
|
||||
active_streams, lwe_array_in_vec, seeded_lwe_input, lut->lwe_indexes_in,
|
||||
lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec,
|
||||
lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec, lut->event_pool,
|
||||
active_streams.count(), num_blocks_to_process,
|
||||
mem_ptr->params.small_lwe_dimension + 1);
|
||||
POP_RANGE()
|
||||
@@ -72,7 +72,7 @@ void host_integer_grouped_oprf(CudaStreams streams,
|
||||
multi_gpu_gather_lwe_async<Torus>(
|
||||
active_streams, (Torus *)radix_lwe_out->ptr, lwe_after_pbs_vec,
|
||||
lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
|
||||
lut->lwe_aligned_vec, num_blocks_to_process,
|
||||
lut->lwe_aligned_vec, lut->event_pool, num_blocks_to_process,
|
||||
mem_ptr->params.big_lwe_dimension + 1);
|
||||
POP_RANGE()
|
||||
lut->multi_gpu_gather_barrier.stream_0_wait_for_local_streams(
|
||||
|
||||
@@ -4,8 +4,8 @@
|
||||
#include "device.h"
|
||||
#include "integer/integer.h"
|
||||
#include "integer/radix_ciphertext.h"
|
||||
#include "utils/helper.cuh"
|
||||
#include "utils/helper_profile.cuh"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
|
||||
inline CudaLweCiphertextListFFI
|
||||
to_lwe_ciphertext_list(CudaRadixCiphertextFFI *radix) {
|
||||
|
||||
@@ -5,8 +5,8 @@
|
||||
#include "integer/radix_ciphertext.h"
|
||||
#include "integer/rerand.h"
|
||||
#include "integer/rerand_utilities.h"
|
||||
#include "utils/helper.cuh"
|
||||
#include "utils/helper_profile.cuh"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
#include "zk/zk_utilities.h"
|
||||
|
||||
template <typename Torus, class params>
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
#include "device.h"
|
||||
#include "helper_multi_gpu.h"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
#include "utils/helper.cuh"
|
||||
#include <stdio.h>
|
||||
|
||||
template <typename Torus>
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
#include "integer/scalar_mul.h"
|
||||
#include "multiplication.cuh"
|
||||
#include "scalar_shifts.cuh"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
#include "utils/helper.cuh"
|
||||
#include <stdio.h>
|
||||
|
||||
template <typename T>
|
||||
|
||||
@@ -116,7 +116,7 @@ __host__ void host_integer_overflowing_sub(
|
||||
radix_params.message_modulus * radix_params.carry_modulus;
|
||||
uint32_t num_bits_in_block = log2_int(block_modulus);
|
||||
uint32_t grouping_size = num_bits_in_block;
|
||||
uint32_t num_groups = (num_blocks + grouping_size - 1) / grouping_size;
|
||||
uint32_t num_groups = CEIL_DIV(num_blocks, grouping_size);
|
||||
|
||||
host_unchecked_sub_with_correcting_term<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), output, input_left, input_right,
|
||||
|
||||
@@ -175,14 +175,17 @@ __host__ void host_aggregate_one_hot_vector(
|
||||
Torus *const *ksks) {
|
||||
|
||||
int_radix_params params = mem_ptr->params;
|
||||
if (params.message_modulus > 4 && params.carry_modulus > 4) {
|
||||
PANIC("Cuda error: aggregate one hot vector is only implemented for 1_1 "
|
||||
"and 2_2 params");
|
||||
}
|
||||
uint32_t chunk_size = mem_ptr->chunk_size;
|
||||
uint32_t num_streams = mem_ptr->num_streams;
|
||||
|
||||
mem_ptr->internal_cuda_streams.internal_streams_wait_for_main_stream_0(
|
||||
streams);
|
||||
|
||||
uint32_t inputs_per_stream =
|
||||
(num_input_ciphertexts + num_streams - 1) / num_streams;
|
||||
uint32_t inputs_per_stream = CEIL_DIV(num_input_ciphertexts, num_streams);
|
||||
|
||||
for (uint32_t s = 0; s < num_streams; s++) {
|
||||
|
||||
@@ -210,7 +213,7 @@ __host__ void host_aggregate_one_hot_vector(
|
||||
if (count_in_stream == 0)
|
||||
continue;
|
||||
|
||||
uint32_t num_chunks = (count_in_stream + chunk_size - 1) / chunk_size;
|
||||
uint32_t num_chunks = CEIL_DIV(count_in_stream, chunk_size);
|
||||
|
||||
//
|
||||
// Process chunks of input ciphertexts for the current stream
|
||||
@@ -255,7 +258,10 @@ __host__ void host_aggregate_one_hot_vector(
|
||||
|
||||
//
|
||||
// Aggregate partial results from all streams into the final aggregated vector
|
||||
// num_streams has to be less than the max noise level otherwise we accumulate
|
||||
// too much and the noise limit is exceeded
|
||||
//
|
||||
CHECK_NOISE_LEVEL(num_streams, params.message_modulus, params.carry_modulus);
|
||||
for (uint32_t s = 1; s < num_streams; s++) {
|
||||
uint32_t start_idx = s * inputs_per_stream;
|
||||
if (start_idx >= num_input_ciphertexts)
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
#include "helper_multi_gpu.h"
|
||||
#include "integer/integer.h"
|
||||
#include "integer/integer_utilities.h"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
#include "utils/helper.cuh"
|
||||
#include <stdio.h>
|
||||
|
||||
template <typename T>
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
#include <cuda_runtime.h>
|
||||
#endif
|
||||
|
||||
#include "../utils/kernel_dimensions.cuh"
|
||||
#include "../utils/helper.cuh"
|
||||
#include "device.h"
|
||||
#include "linear_algebra.h"
|
||||
#include <fstream>
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
#include <cuda_runtime.h>
|
||||
#endif
|
||||
|
||||
#include "../utils/kernel_dimensions.cuh"
|
||||
#include "../utils/helper.cuh"
|
||||
#include "device.h"
|
||||
#include "linear_algebra.h"
|
||||
|
||||
|
||||
@@ -211,6 +211,8 @@ __global__ void device_programmable_bootstrap_amortized(
|
||||
// For the mask it's more complicated
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator,
|
||||
glwe_dimension);
|
||||
|
||||
// No need to sync here, it is already synchronized after add_to_torus
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator,
|
||||
glwe_dimension);
|
||||
}
|
||||
@@ -303,7 +305,9 @@ __host__ void host_programmable_bootstrap_amortized(
|
||||
int8_t *pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count) {
|
||||
|
||||
PANIC_IF_FALSE(sizeof(Torus) == 8,
|
||||
"Error: Programmable bootstrap amortized only supports 64-bit "
|
||||
"Torus type.");
|
||||
uint64_t SM_FULL =
|
||||
get_buffer_size_full_sm_programmable_bootstrap_amortized<Torus>(
|
||||
polynomial_size, glwe_dimension);
|
||||
|
||||
@@ -173,6 +173,7 @@ __global__ void device_programmable_bootstrap_cg(
|
||||
}
|
||||
}
|
||||
} else if (blockIdx.y == glwe_dimension) {
|
||||
__syncthreads();
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
|
||||
if (num_many_lut > 1) {
|
||||
for (int i = 1; i < num_many_lut; i++) {
|
||||
@@ -184,7 +185,8 @@ __global__ void device_programmable_bootstrap_cg(
|
||||
&next_lwe_array_out[lwe_output_indexes[blockIdx.x] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
|
||||
// No need to sync, it is already synchronized before the first
|
||||
// sample_extract_body call
|
||||
sample_extract_body<Torus, params>(next_block_lwe_array_out,
|
||||
accumulator, 0, i * lut_stride);
|
||||
}
|
||||
@@ -247,7 +249,9 @@ __host__ void host_programmable_bootstrap_cg(
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t num_many_lut, uint32_t lut_stride) {
|
||||
|
||||
PANIC_IF_FALSE(sizeof(Torus) == 8,
|
||||
"Error: Programmable bootstrap cg only supports 64-bit "
|
||||
"Torus type.");
|
||||
// With SM each block corresponds to either the mask or body, no need to
|
||||
// duplicate data for each
|
||||
uint64_t full_sm =
|
||||
|
||||
@@ -151,9 +151,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
}
|
||||
|
||||
} else if (blockIdx.y == glwe_dimension) {
|
||||
|
||||
__syncthreads();
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
|
||||
|
||||
if (num_many_lut > 1) {
|
||||
for (int i = 1; i < num_many_lut; i++) {
|
||||
|
||||
@@ -165,6 +164,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
|
||||
// No need to sync, it is already synchronized before the first
|
||||
// sample_extract_body call
|
||||
sample_extract_body<Torus, params>(next_block_lwe_array_out,
|
||||
accumulator, 0, i * lut_stride);
|
||||
}
|
||||
@@ -302,7 +303,10 @@ __host__ void execute_cg_external_product_loop(
|
||||
uint32_t level_count, uint32_t lwe_offset, uint32_t num_many_lut,
|
||||
uint32_t lut_stride) {
|
||||
cuda_set_device(gpu_index);
|
||||
|
||||
PANIC_IF_FALSE(
|
||||
sizeof(Torus) == 8,
|
||||
"Error: Programmable bootstrap multi-bit cg only supports 64-bit "
|
||||
"Torus type.");
|
||||
uint64_t full_sm =
|
||||
get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<Torus>(
|
||||
polynomial_size);
|
||||
|
||||
@@ -239,6 +239,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
}
|
||||
}
|
||||
} else if (blockIdx.y == glwe_dimension) {
|
||||
__syncthreads();
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
|
||||
if (num_many_lut > 1) {
|
||||
for (int i = 1; i < num_many_lut; i++) {
|
||||
@@ -251,6 +252,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
|
||||
// No need to sync, it is already synchronized before the first
|
||||
// sample_extract_body call
|
||||
sample_extract_body<Torus, params>(next_block_lwe_array_out,
|
||||
accumulator, 0, i * lut_stride);
|
||||
}
|
||||
@@ -409,7 +412,9 @@ __host__ void execute_step_one(
|
||||
uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm,
|
||||
uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type) {
|
||||
|
||||
PANIC_IF_FALSE(sizeof(Torus) == 8,
|
||||
"Error: Programmable bootstrap step one only supports 64-bit "
|
||||
"Torus type.");
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
cuda_set_device(gpu_index);
|
||||
int thds = polynomial_size / params::opt;
|
||||
@@ -451,7 +456,9 @@ __host__ void execute_step_two(
|
||||
uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm,
|
||||
uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm,
|
||||
uint32_t num_many_lut, uint32_t lut_stride) {
|
||||
|
||||
PANIC_IF_FALSE(sizeof(Torus) == 8,
|
||||
"Error: Programmable bootstrap step two only supports 64-bit "
|
||||
"Torus type.");
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
cuda_set_device(gpu_index);
|
||||
int thds = polynomial_size / params::opt;
|
||||
|
||||
@@ -44,30 +44,31 @@ void executor_cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
host_programmable_bootstrap_128<InputTorus, AmortizedDegree<256>>(
|
||||
host_programmable_bootstrap_128<InputTorus, Degree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
|
||||
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
case 512:
|
||||
host_programmable_bootstrap_128<InputTorus, AmortizedDegree<512>>(
|
||||
host_programmable_bootstrap_128<InputTorus, Degree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
|
||||
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
case 1024:
|
||||
host_programmable_bootstrap_128<InputTorus, AmortizedDegree<1024>>(
|
||||
host_programmable_bootstrap_128<InputTorus, Degree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
|
||||
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
case 2048:
|
||||
host_programmable_bootstrap_128<InputTorus, AmortizedDegree<2048>>(
|
||||
host_programmable_bootstrap_128<InputTorus, Degree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
|
||||
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
case 4096:
|
||||
// We use AmortizedDegree for 4096 to avoid register exhaustion
|
||||
host_programmable_bootstrap_128<InputTorus, AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
|
||||
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
|
||||
@@ -91,30 +92,31 @@ void executor_cuda_programmable_bootstrap_cg_lwe_ciphertext_vector_128(
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
host_programmable_bootstrap_cg_128<InputTorus, AmortizedDegree<256>>(
|
||||
host_programmable_bootstrap_cg_128<InputTorus, Degree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
|
||||
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
case 512:
|
||||
host_programmable_bootstrap_cg_128<InputTorus, AmortizedDegree<512>>(
|
||||
host_programmable_bootstrap_cg_128<InputTorus, Degree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
|
||||
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
case 1024:
|
||||
host_programmable_bootstrap_cg_128<InputTorus, AmortizedDegree<1024>>(
|
||||
host_programmable_bootstrap_cg_128<InputTorus, Degree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
|
||||
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
case 2048:
|
||||
host_programmable_bootstrap_cg_128<InputTorus, AmortizedDegree<2048>>(
|
||||
host_programmable_bootstrap_cg_128<InputTorus, Degree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
|
||||
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
case 4096:
|
||||
// We use AmortizedDegree for 4096 to avoid register exhaustion
|
||||
host_programmable_bootstrap_cg_128<InputTorus, AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
|
||||
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
|
||||
|
||||
@@ -240,10 +240,13 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
// in case they're not synchronized
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
|
||||
} else if (blockIdx.y == glwe_dimension) {
|
||||
__syncthreads();
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
|
||||
}
|
||||
} else {
|
||||
// Persist the updated accumulator
|
||||
// We don't sync here because we use same indexes to read from `accumulator`
|
||||
// as it was used in `add_to_torus_128` to write inside it Persist the
|
||||
// updated accumulator
|
||||
tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
global_slice[tid] = accumulator[tid];
|
||||
@@ -395,6 +398,7 @@ __global__ void device_programmable_bootstrap_cg_128(
|
||||
accumulator);
|
||||
|
||||
} else if (blockIdx.y == glwe_dimension) {
|
||||
__syncthreads();
|
||||
sample_extract_body<__uint128_t, params>(block_lwe_array_out, accumulator,
|
||||
0);
|
||||
}
|
||||
@@ -571,38 +575,35 @@ uint64_t scratch_cuda_programmable_bootstrap_128_vector(
|
||||
input_lwe_ciphertext_count, max_shared_memory)) {
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
return scratch_programmable_bootstrap_cg_128<InputTorus,
|
||||
AmortizedDegree<256>>(
|
||||
return scratch_programmable_bootstrap_cg_128<InputTorus, Degree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory,
|
||||
noise_reduction_type);
|
||||
break;
|
||||
case 512:
|
||||
return scratch_programmable_bootstrap_cg_128<InputTorus,
|
||||
AmortizedDegree<512>>(
|
||||
return scratch_programmable_bootstrap_cg_128<InputTorus, Degree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory,
|
||||
noise_reduction_type);
|
||||
break;
|
||||
case 1024:
|
||||
return scratch_programmable_bootstrap_cg_128<InputTorus,
|
||||
AmortizedDegree<1024>>(
|
||||
return scratch_programmable_bootstrap_cg_128<InputTorus, Degree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory,
|
||||
noise_reduction_type);
|
||||
break;
|
||||
case 2048:
|
||||
return scratch_programmable_bootstrap_cg_128<InputTorus,
|
||||
AmortizedDegree<2048>>(
|
||||
return scratch_programmable_bootstrap_cg_128<InputTorus, Degree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory,
|
||||
noise_reduction_type);
|
||||
break;
|
||||
case 4096:
|
||||
// We use AmortizedDegree for 4096 to avoid register exhaustion
|
||||
return scratch_programmable_bootstrap_cg_128<InputTorus,
|
||||
AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
@@ -618,38 +619,35 @@ uint64_t scratch_cuda_programmable_bootstrap_128_vector(
|
||||
} else {
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
return scratch_programmable_bootstrap_128<InputTorus,
|
||||
AmortizedDegree<256>>(
|
||||
return scratch_programmable_bootstrap_128<InputTorus, Degree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory,
|
||||
noise_reduction_type);
|
||||
break;
|
||||
case 512:
|
||||
return scratch_programmable_bootstrap_128<InputTorus,
|
||||
AmortizedDegree<512>>(
|
||||
return scratch_programmable_bootstrap_128<InputTorus, Degree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory,
|
||||
noise_reduction_type);
|
||||
break;
|
||||
case 1024:
|
||||
return scratch_programmable_bootstrap_128<InputTorus,
|
||||
AmortizedDegree<1024>>(
|
||||
return scratch_programmable_bootstrap_128<InputTorus, Degree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory,
|
||||
noise_reduction_type);
|
||||
break;
|
||||
case 2048:
|
||||
return scratch_programmable_bootstrap_128<InputTorus,
|
||||
AmortizedDegree<2048>>(
|
||||
return scratch_programmable_bootstrap_128<InputTorus, Degree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory,
|
||||
noise_reduction_type);
|
||||
break;
|
||||
case 4096:
|
||||
// We use AmortizedDegree for 4096 to avoid register exhaustion
|
||||
return scratch_programmable_bootstrap_128<InputTorus,
|
||||
AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
@@ -956,22 +954,19 @@ __host__ bool supports_cooperative_groups_on_programmable_bootstrap_128(
|
||||
uint32_t max_shared_memory) {
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
return verify_cuda_programmable_bootstrap_128_cg_grid_size<
|
||||
AmortizedDegree<256>>(glwe_dimension, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
return verify_cuda_programmable_bootstrap_128_cg_grid_size<Degree<256>>(
|
||||
glwe_dimension, level_count, num_samples, max_shared_memory);
|
||||
case 512:
|
||||
return verify_cuda_programmable_bootstrap_128_cg_grid_size<
|
||||
AmortizedDegree<512>>(glwe_dimension, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
return verify_cuda_programmable_bootstrap_128_cg_grid_size<Degree<512>>(
|
||||
glwe_dimension, level_count, num_samples, max_shared_memory);
|
||||
case 1024:
|
||||
return verify_cuda_programmable_bootstrap_128_cg_grid_size<
|
||||
AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
return verify_cuda_programmable_bootstrap_128_cg_grid_size<Degree<1024>>(
|
||||
glwe_dimension, level_count, num_samples, max_shared_memory);
|
||||
case 2048:
|
||||
return verify_cuda_programmable_bootstrap_128_cg_grid_size<
|
||||
AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
return verify_cuda_programmable_bootstrap_128_cg_grid_size<Degree<2048>>(
|
||||
glwe_dimension, level_count, num_samples, max_shared_memory);
|
||||
case 4096:
|
||||
// We use AmortizedDegree for 4096 to avoid register exhaustion
|
||||
return verify_cuda_programmable_bootstrap_128_cg_grid_size<
|
||||
AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
|
||||
@@ -461,6 +461,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
}
|
||||
}
|
||||
} else if (blockIdx.y == glwe_dimension) {
|
||||
// No need to sync here, it is already synchronized after add_to_torus
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, global_slice, 0);
|
||||
if (num_many_lut > 1) {
|
||||
for (int i = 1; i < num_many_lut; i++) {
|
||||
@@ -473,6 +474,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
|
||||
// No need to sync here, it is already synchronized after add_to_torus
|
||||
sample_extract_body<Torus, params>(next_block_lwe_array_out,
|
||||
global_slice, 0, i * lut_stride);
|
||||
}
|
||||
@@ -663,7 +665,9 @@ __host__ void execute_compute_keybundle(
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t grouping_factor, uint32_t level_count, uint32_t lwe_offset) {
|
||||
cuda_set_device(gpu_index);
|
||||
|
||||
PANIC_IF_FALSE(sizeof(Torus) == 8,
|
||||
"Error: PBS keybundle only supports 64-bit "
|
||||
"Torus type.");
|
||||
auto lwe_chunk_size = buffer->lwe_chunk_size;
|
||||
uint64_t chunk_size = std::min(
|
||||
lwe_chunk_size, (uint64_t)(lwe_dimension / grouping_factor) - lwe_offset);
|
||||
@@ -735,7 +739,10 @@ __host__ void execute_step_one(
|
||||
uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count) {
|
||||
cuda_set_device(gpu_index);
|
||||
|
||||
PANIC_IF_FALSE(
|
||||
sizeof(Torus) == 8,
|
||||
"Error: Programmable bootstrap multi-bit step one only supports 64-bit "
|
||||
"Torus type.");
|
||||
uint64_t full_sm_accumulate_step_one =
|
||||
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one<Torus>(
|
||||
polynomial_size);
|
||||
@@ -789,7 +796,10 @@ execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
|
||||
uint32_t level_count, uint32_t j, uint32_t num_many_lut,
|
||||
uint32_t lut_stride) {
|
||||
cuda_set_device(gpu_index);
|
||||
|
||||
PANIC_IF_FALSE(
|
||||
sizeof(Torus) == 8,
|
||||
"Error: Programmable bootstrap multi-bit step two only supports 64-bit "
|
||||
"Torus type.");
|
||||
uint32_t lwe_chunk_size = (uint32_t)(buffer->lwe_chunk_size);
|
||||
uint64_t full_sm_accumulate_step_two =
|
||||
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two<Torus>(
|
||||
|
||||
@@ -11,29 +11,30 @@ uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128(
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
|
||||
AmortizedDegree<256>>(
|
||||
Degree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory);
|
||||
case 512:
|
||||
return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
|
||||
AmortizedDegree<512>>(
|
||||
Degree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory);
|
||||
case 1024:
|
||||
return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
|
||||
AmortizedDegree<1024>>(
|
||||
Degree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory);
|
||||
case 2048:
|
||||
return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
|
||||
AmortizedDegree<2048>>(
|
||||
Degree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory);
|
||||
case 4096:
|
||||
// We use AmortizedDegree for 4096 to avoid register exhaustion
|
||||
return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
|
||||
AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
@@ -55,30 +56,31 @@ uint64_t scratch_cuda_cg_multi_bit_programmable_bootstrap_128(
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
return scratch_cg_multi_bit_programmable_bootstrap_128<
|
||||
InputTorus, AmortizedDegree<256>>(
|
||||
return scratch_cg_multi_bit_programmable_bootstrap_128<InputTorus,
|
||||
Degree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory);
|
||||
case 512:
|
||||
return scratch_cg_multi_bit_programmable_bootstrap_128<
|
||||
InputTorus, AmortizedDegree<512>>(
|
||||
return scratch_cg_multi_bit_programmable_bootstrap_128<InputTorus,
|
||||
Degree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory);
|
||||
case 1024:
|
||||
return scratch_cg_multi_bit_programmable_bootstrap_128<
|
||||
InputTorus, AmortizedDegree<1024>>(
|
||||
return scratch_cg_multi_bit_programmable_bootstrap_128<InputTorus,
|
||||
Degree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory);
|
||||
case 2048:
|
||||
return scratch_cg_multi_bit_programmable_bootstrap_128<
|
||||
InputTorus, AmortizedDegree<2048>>(
|
||||
return scratch_cg_multi_bit_programmable_bootstrap_128<InputTorus,
|
||||
Degree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory);
|
||||
case 4096:
|
||||
// We use AmortizedDegree for 4096 to avoid register exhaustion
|
||||
return scratch_cg_multi_bit_programmable_bootstrap_128<
|
||||
InputTorus, AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
@@ -97,7 +99,7 @@ uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64(
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
|
||||
|
||||
bool supports_cg =
|
||||
supports_cooperative_groups_on_multibit_programmable_bootstrap<
|
||||
supports_cooperative_groups_on_multibit_programmable_bootstrap_128<
|
||||
__uint128_t>(glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count,
|
||||
cuda_get_max_shared_memory(gpu_index));
|
||||
@@ -129,7 +131,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
host_multi_bit_programmable_bootstrap_128<InputTorus, AmortizedDegree<256>>(
|
||||
host_multi_bit_programmable_bootstrap_128<InputTorus, Degree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
@@ -137,7 +139,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
num_many_lut, lut_stride);
|
||||
break;
|
||||
case 512:
|
||||
host_multi_bit_programmable_bootstrap_128<InputTorus, AmortizedDegree<512>>(
|
||||
host_multi_bit_programmable_bootstrap_128<InputTorus, Degree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
@@ -145,8 +147,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
num_many_lut, lut_stride);
|
||||
break;
|
||||
case 1024:
|
||||
host_multi_bit_programmable_bootstrap_128<InputTorus,
|
||||
AmortizedDegree<1024>>(
|
||||
host_multi_bit_programmable_bootstrap_128<InputTorus, Degree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
@@ -154,8 +155,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
num_many_lut, lut_stride);
|
||||
break;
|
||||
case 2048:
|
||||
host_multi_bit_programmable_bootstrap_128<InputTorus,
|
||||
AmortizedDegree<2048>>(
|
||||
host_multi_bit_programmable_bootstrap_128<InputTorus, Degree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
@@ -163,6 +163,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
num_many_lut, lut_stride);
|
||||
break;
|
||||
case 4096:
|
||||
// We use AmortizedDegree for 4096 to avoid register exhaustion
|
||||
host_multi_bit_programmable_bootstrap_128<InputTorus,
|
||||
AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
@@ -191,8 +192,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
|
||||
AmortizedDegree<256>>(
|
||||
host_cg_multi_bit_programmable_bootstrap_128<InputTorus, Degree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
@@ -200,8 +200,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
num_many_lut, lut_stride);
|
||||
break;
|
||||
case 512:
|
||||
host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
|
||||
AmortizedDegree<512>>(
|
||||
host_cg_multi_bit_programmable_bootstrap_128<InputTorus, Degree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
@@ -209,8 +208,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
num_many_lut, lut_stride);
|
||||
break;
|
||||
case 1024:
|
||||
host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
|
||||
AmortizedDegree<1024>>(
|
||||
host_cg_multi_bit_programmable_bootstrap_128<InputTorus, Degree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
@@ -218,8 +216,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
num_many_lut, lut_stride);
|
||||
break;
|
||||
case 2048:
|
||||
host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
|
||||
AmortizedDegree<2048>>(
|
||||
host_cg_multi_bit_programmable_bootstrap_128<InputTorus, Degree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
@@ -227,6 +224,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
num_many_lut, lut_stride);
|
||||
break;
|
||||
case 4096:
|
||||
// We use AmortizedDegree for 4096 to avoid register exhaustion
|
||||
host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
|
||||
AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
|
||||
@@ -333,6 +333,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
}
|
||||
}
|
||||
} else if (blockIdx.y == glwe_dimension) {
|
||||
__syncthreads();
|
||||
sample_extract_body<__uint128_t, params>(block_lwe_array_out,
|
||||
global_slice, 0);
|
||||
if (num_many_lut > 1) {
|
||||
@@ -346,6 +347,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
|
||||
// No need to sync, it is already synchronized before the first
|
||||
// sample_extract_body call
|
||||
sample_extract_body<__uint128_t, params>(
|
||||
next_block_lwe_array_out, global_slice, 0, i * lut_stride);
|
||||
}
|
||||
@@ -505,10 +508,9 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
}
|
||||
|
||||
} else if (blockIdx.y == glwe_dimension) {
|
||||
|
||||
__syncthreads();
|
||||
sample_extract_body<__uint128_t, params>(block_lwe_array_out,
|
||||
accumulator, 0);
|
||||
|
||||
if (num_many_lut > 1) {
|
||||
for (int i = 1; i < num_many_lut; i++) {
|
||||
|
||||
@@ -519,7 +521,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
&next_lwe_array_out[lwe_output_indexes[blockIdx.x] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
|
||||
// No need to sync, it is already synchronized before the first
|
||||
// sample_extract_body call
|
||||
sample_extract_body<__uint128_t, params>(
|
||||
next_block_lwe_array_out, accumulator, 0, i * lut_stride);
|
||||
}
|
||||
@@ -1090,4 +1093,109 @@ __host__ uint64_t scratch_cg_multi_bit_programmable_bootstrap_128(
|
||||
return size_tracker;
|
||||
}
|
||||
|
||||
// Verify if the grid size satisfies the cooperative group constraints.
//
// Template parameters:
//  - Torus:  torus type forwarded to the accumulate kernel and to the
//            shared-memory size helpers.
//  - params: polynomial parameters; params::degree / params::opt is used as
//            the number of threads per block.
//
// Arguments:
//  - glwe_dimension, level_count, num_samples: the grid that must fit is made
//    of level_count * (glwe_dimension + 1) * num_samples blocks.
//  - max_shared_memory: shared-memory budget used to select the NOSM,
//    PARTIALSM or FULLSM variant of the kernel.
//
// Returns true when the number of blocks does not exceed
// (max active blocks per SM) * (number of SMs), and false otherwise or when
// cooperative groups are not supported.
//
// Side effect: for the PARTIALSM and FULLSM variants the kernel's max dynamic
// shared memory attribute and cache configuration are set before querying the
// occupancy.
template <typename Torus, class params>
__host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size_128(
    int glwe_dimension, int level_count, int num_samples,
    uint32_t max_shared_memory) {

  // If Cooperative Groups is not supported, no need to check anything else
  if (!cuda_check_support_cooperative_groups())
    return false;

  // Calculate the dimension of the kernel
  uint64_t full_sm_cg_accumulate =
      get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<Torus>(
          params::degree);
  uint64_t partial_sm_cg_accumulate =
      get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap<Torus>(
          params::degree);

  int thds = params::degree / params::opt;

  // Get the maximum number of active blocks per streaming multiprocessors
  int number_of_blocks = level_count * (glwe_dimension + 1) * num_samples;
  // Zero-initialized so the value is never indeterminate, whatever the
  // occupancy query does.
  int max_active_blocks_per_sm = 0;

  if (max_shared_memory < partial_sm_cg_accumulate) {
    // Not enough shared memory even for the partial variant: the NOSM kernel
    // is queried with no dynamic shared memory.
    check_cuda_error(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &max_active_blocks_per_sm,
        (void *)device_multi_bit_programmable_bootstrap_cg_accumulate_128<
            Torus, params, NOSM>,
        thds, 0));
  } else if (max_shared_memory < full_sm_cg_accumulate) {
    check_cuda_error(cudaFuncSetAttribute(
        device_multi_bit_programmable_bootstrap_cg_accumulate_128<Torus, params,
                                                                  PARTIALSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm_cg_accumulate));
    check_cuda_error(cudaFuncSetCacheConfig(
        device_multi_bit_programmable_bootstrap_cg_accumulate_128<Torus, params,
                                                                  PARTIALSM>,
        cudaFuncCachePreferShared));
    check_cuda_error(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &max_active_blocks_per_sm,
        (void *)device_multi_bit_programmable_bootstrap_cg_accumulate_128<
            Torus, params, PARTIALSM>,
        thds, partial_sm_cg_accumulate));
    check_cuda_error(cudaGetLastError());
  } else {
    check_cuda_error(cudaFuncSetAttribute(
        device_multi_bit_programmable_bootstrap_cg_accumulate_128<Torus, params,
                                                                  FULLSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_cg_accumulate));
    check_cuda_error(cudaFuncSetCacheConfig(
        device_multi_bit_programmable_bootstrap_cg_accumulate_128<Torus, params,
                                                                  FULLSM>,
        cudaFuncCachePreferShared));
    check_cuda_error(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &max_active_blocks_per_sm,
        (void *)device_multi_bit_programmable_bootstrap_cg_accumulate_128<
            Torus, params, FULLSM>,
        thds, full_sm_cg_accumulate));
    check_cuda_error(cudaGetLastError());
  }

  // Get the number of streaming multiprocessors
  // NOTE(review): the attribute is read from device 0, which is not
  // necessarily the device the kernel will run on - confirm this is intended
  // for multi-GPU setups.
  int number_of_sm = 0;
  check_cuda_error(
      cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0));
  return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
}
|
||||
|
||||
// Tells whether the cooperative-groups multi-bit PBS-128 kernel can be used
// for the given problem size.
//
// The runtime polynomial_size picks the compile-time polynomial parameters,
// and the question is then delegated to
// verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size_128 with the
// remaining arguments forwarded unchanged. Unsupported polynomial sizes
// trigger a PANIC.
template <typename Torus>
__host__ bool
supports_cooperative_groups_on_multibit_programmable_bootstrap_128(
    int glwe_dimension, int polynomial_size, int level_count, int num_samples,
    uint32_t max_shared_memory) {
  using grid_check_fn = bool (*)(int, int, int, uint32_t);

  // Resolved below to the instantiation matching polynomial_size
  grid_check_fn grid_check = nullptr;

  switch (polynomial_size) {
  case 256:
    grid_check =
        &verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size_128<
            Torus, Degree<256>>;
    break;
  case 512:
    grid_check =
        &verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size_128<
            Torus, Degree<512>>;
    break;
  case 1024:
    grid_check =
        &verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size_128<
            Torus, Degree<1024>>;
    break;
  case 2048:
    grid_check =
        &verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size_128<
            Torus, Degree<2048>>;
    break;
  case 4096:
    // We use AmortizedDegree for 4096 to avoid register exhaustion
    grid_check =
        &verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size_128<
            Torus, AmortizedDegree<4096>>;
    break;
  default:
    PANIC(
        "Cuda error (multi-bit PBS128): unsupported polynomial size. Supported "
        "N's are powers of two"
        " in the interval [256..4096].")
  }

  return grid_check(glwe_dimension, level_count, num_samples,
                    max_shared_memory);
}
|
||||
|
||||
#endif // PROGRAMMABLE_BOOTSTRAP_MULTIBIT_128_CUH
|
||||
|
||||
@@ -179,6 +179,7 @@ __global__ void device_programmable_bootstrap_tbc(
|
||||
}
|
||||
}
|
||||
} else if (blockIdx.y == glwe_dimension) {
|
||||
__syncthreads();
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
|
||||
|
||||
if (num_many_lut > 1) {
|
||||
@@ -191,13 +192,17 @@ __global__ void device_programmable_bootstrap_tbc(
|
||||
&next_lwe_array_out[lwe_output_indexes[blockIdx.x] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
|
||||
// No need to sync, it is already synchronized before the first
|
||||
// sample_extract_body call
|
||||
sample_extract_body<Torus, params>(next_block_lwe_array_out,
|
||||
accumulator, 0, i * lut_stride);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Before exiting the kernel we need to sync the cluster to ensure that
|
||||
// other blocks can still access the dsm in the mul ggsw glwe
|
||||
cluster.sync();
|
||||
}
|
||||
|
||||
template <typename Torus, class params, sharedMemDegree SMD>
|
||||
@@ -207,8 +212,8 @@ __global__ void device_programmable_bootstrap_tbc_2_2_params(
|
||||
const Torus *__restrict__ lut_vector_indexes,
|
||||
const Torus *__restrict__ lwe_array_in,
|
||||
const Torus *__restrict__ lwe_input_indexes,
|
||||
const double2 *__restrict__ bootstrapping_key, double2 *join_buffer,
|
||||
uint32_t lwe_dimension, uint32_t num_many_lut, uint32_t lut_stride,
|
||||
const double2 *__restrict__ bootstrapping_key, uint32_t lwe_dimension,
|
||||
uint32_t num_many_lut, uint32_t lut_stride,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type) {
|
||||
|
||||
constexpr uint32_t level_count = 1;
|
||||
@@ -254,9 +259,6 @@ __global__ void device_programmable_bootstrap_tbc_2_2_params(
|
||||
&lut_vector[lut_vector_indexes[blockIdx.x] * params::degree *
|
||||
(glwe_dimension + 1)];
|
||||
|
||||
double2 *block_join_buffer =
|
||||
&join_buffer[blockIdx.x * level_count * (glwe_dimension + 1) *
|
||||
params::degree / 2];
|
||||
// Since the space is L1 cache is small, we use the same memory location for
|
||||
// the rotated accumulator and the fft accumulator, since we know that the
|
||||
// rotated array is not in use anymore by the time we perform the fft
|
||||
@@ -279,13 +281,14 @@ __global__ void device_programmable_bootstrap_tbc_2_2_params(
|
||||
Torus temp_a_hat = 0;
|
||||
for (int i = 0; i < lwe_dimension; i++) {
|
||||
|
||||
constexpr int WARP_SIZE = 32;
|
||||
// We calculate the modulus switch of a warp size of elements
|
||||
if (i % 32 == 0 && (i + threadIdx.x % 32) < lwe_dimension) {
|
||||
modulus_switch(block_lwe_array_in[i + threadIdx.x % 32], temp_a_hat,
|
||||
log_modulus);
|
||||
if (i % WARP_SIZE == 0 && (i + threadIdx.x % WARP_SIZE) < lwe_dimension) {
|
||||
modulus_switch(block_lwe_array_in[i + threadIdx.x % WARP_SIZE],
|
||||
temp_a_hat, log_modulus);
|
||||
}
|
||||
// each iteration we broadcast the corresponding ms previously calculated
|
||||
Torus a_hat = __shfl_sync(0xFFFFFFFF, temp_a_hat, i % 32);
|
||||
Torus a_hat = __shfl_sync(0xFFFFFFFF, temp_a_hat, i % WARP_SIZE);
|
||||
|
||||
__syncthreads();
|
||||
Torus reg_acc_rotated[params::opt];
|
||||
@@ -360,6 +363,7 @@ __global__ void device_programmable_bootstrap_tbc_2_2_params(
|
||||
}
|
||||
}
|
||||
} else if (blockIdx.y == glwe_dimension) {
|
||||
// No need to sync here, it is already synchronized after add_to_torus
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
|
||||
|
||||
if (num_many_lut > 1) {
|
||||
@@ -373,12 +377,16 @@ __global__ void device_programmable_bootstrap_tbc_2_2_params(
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
|
||||
// No need to sync here, it is already synchronized after add_to_torus
|
||||
sample_extract_body<Torus, params>(next_block_lwe_array_out,
|
||||
accumulator, 0, i * lut_stride);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Before exiting the kernel we need to sync the cluster to ensure that
|
||||
// other blocks can still access the dsm in the mul ggsw glwe or the
|
||||
// ping pong buffers
|
||||
cluster.sync();
|
||||
}
|
||||
|
||||
@@ -458,6 +466,9 @@ __host__ void host_programmable_bootstrap_tbc(
|
||||
uint32_t num_many_lut, uint32_t lut_stride) {
|
||||
cuda_set_device(gpu_index);
|
||||
|
||||
PANIC_IF_FALSE(sizeof(Torus) == 8,
|
||||
"Error: Programmable bootstrap tbc only supports 64-bit "
|
||||
"Torus type.");
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
auto supports_dsm =
|
||||
supports_distributed_shared_memory_on_classic_programmable_bootstrap<
|
||||
@@ -543,8 +554,8 @@ __host__ void host_programmable_bootstrap_tbc(
|
||||
&config,
|
||||
device_programmable_bootstrap_tbc_2_2_params<Torus, params, FULLSM>,
|
||||
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
|
||||
lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft,
|
||||
lwe_dimension, num_many_lut, lut_stride, noise_reduction_type));
|
||||
lwe_array_in, lwe_input_indexes, bootstrapping_key, lwe_dimension,
|
||||
num_many_lut, lut_stride, noise_reduction_type));
|
||||
} else {
|
||||
config.dynamicSmemBytes = full_sm + minimum_sm_tbc;
|
||||
|
||||
|
||||
@@ -156,6 +156,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
}
|
||||
}
|
||||
} else if (blockIdx.y == glwe_dimension) {
|
||||
__syncthreads();
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
|
||||
if (num_many_lut > 1) {
|
||||
for (int i = 1; i < num_many_lut; i++) {
|
||||
@@ -167,7 +168,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
&next_lwe_array_out[lwe_output_indexes[blockIdx.x] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
|
||||
// No need to sync, it is already synchronized before the first
|
||||
// sample_extract_body call
|
||||
sample_extract_body<Torus, params>(next_block_lwe_array_out,
|
||||
accumulator, 0, i * lut_stride);
|
||||
}
|
||||
@@ -179,6 +181,9 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
accumulator, global_accumulator_slice);
|
||||
}
|
||||
}
|
||||
// Before exiting the kernel we need to sync the cluster to ensure that
|
||||
// other blocks can still access the dsm in the mul ggsw glwe
|
||||
cluster.sync();
|
||||
}
|
||||
|
||||
// Specialized version for the multi-bit bootstrap using 2_2 params:
|
||||
@@ -356,6 +361,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
}
|
||||
}
|
||||
} else if (blockIdx.y == glwe_dimension) {
|
||||
// No need to sync here, it is already synchronized after add_to_torus
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
|
||||
if (num_many_lut > 1) {
|
||||
for (int i = 1; i < num_many_lut; i++) {
|
||||
@@ -367,7 +373,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
&next_lwe_array_out[lwe_output_indexes[blockIdx.x] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
|
||||
// No need to sync here, it is already synchronized after
|
||||
// add_to_torus
|
||||
sample_extract_body<Torus, params>(next_block_lwe_array_out,
|
||||
accumulator, 0, i * lut_stride);
|
||||
}
|
||||
@@ -379,7 +386,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
reg_acc_rotated, global_accumulator_slice);
|
||||
}
|
||||
// Before exiting the kernel we need to sync the cluster to ensure that
|
||||
// that other blocks can still access the dsm in the ping pong buffer
|
||||
// other blocks can still access the dsm in the ping pong buffer
|
||||
cluster.sync();
|
||||
}
|
||||
|
||||
@@ -523,6 +530,11 @@ __host__ void execute_tbc_external_product_loop(
|
||||
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t lwe_offset, uint32_t num_many_lut,
|
||||
uint32_t lut_stride) {
|
||||
|
||||
PANIC_IF_FALSE(
|
||||
sizeof(Torus) == 8,
|
||||
"Error: Programmable bootstrap multi-bit tbc only supports 64-bit "
|
||||
"Torus type.");
|
||||
cuda_set_device(gpu_index);
|
||||
|
||||
auto lwe_chunk_size = buffer->lwe_chunk_size;
|
||||
|
||||
@@ -3,8 +3,6 @@
|
||||
|
||||
#include "crypto/torus.cuh"
|
||||
|
||||
#define CEIL_DIV(M, N) ((M) + (N)-1) / (N)
|
||||
|
||||
#define CIRCULANT_BLOCKTILE 32
|
||||
// Make a circulant matrix that serves to multiply a polynomial
|
||||
// with another one. Each thread loads a part of the original
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user