mirror of
https://github.com/zama-ai/tfhe-rs.git
synced 2026-04-28 03:01:21 -04:00
Compare commits
7 Commits
tfhe-rs-1.
...
al/debug_l
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
592e2fcc89 | ||
|
|
611315255e | ||
|
|
44177c98f0 | ||
|
|
a1fde5bf18 | ||
|
|
c8e878fabe | ||
|
|
38d1842596 | ||
|
|
740d4697e7 |
1
.gitattributes
vendored
1
.gitattributes
vendored
@@ -1 +0,0 @@
|
||||
*.hpu filter=lfs diff=lfs merge=lfs -text
|
||||
@@ -67,7 +67,7 @@ jobs:
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
|
||||
6
.github/workflows/aws_tfhe_fast_tests.yml
vendored
6
.github/workflows/aws_tfhe_fast_tests.yml
vendored
@@ -174,7 +174,7 @@ jobs:
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -182,11 +182,9 @@ jobs:
|
||||
if: needs.should-run.outputs.csprng_test == 'true'
|
||||
run: |
|
||||
make test_tfhe_csprng
|
||||
make test_tfhe_csprng_big_endian
|
||||
|
||||
- name: Run tfhe-zk-pok tests
|
||||
# Always run it to catch non deterministic bugs earlier
|
||||
# if: needs.should-run.outputs.zk_pok_test == 'true'
|
||||
if: needs.should-run.outputs.zk_pok_test == 'true'
|
||||
run: |
|
||||
make test_zk_pok
|
||||
|
||||
|
||||
2
.github/workflows/aws_tfhe_integer_tests.yml
vendored
2
.github/workflows/aws_tfhe_integer_tests.yml
vendored
@@ -114,7 +114,7 @@ jobs:
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
|
||||
@@ -115,7 +115,7 @@ jobs:
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
|
||||
2
.github/workflows/aws_tfhe_tests.yml
vendored
2
.github/workflows/aws_tfhe_tests.yml
vendored
@@ -185,7 +185,7 @@ jobs:
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
|
||||
2
.github/workflows/aws_tfhe_wasm_tests.yml
vendored
2
.github/workflows/aws_tfhe_wasm_tests.yml
vendored
@@ -68,7 +68,7 @@ jobs:
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
|
||||
2
.github/workflows/benchmark_boolean.yml
vendored
2
.github/workflows/benchmark_boolean.yml
vendored
@@ -68,7 +68,7 @@ jobs:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
|
||||
2
.github/workflows/benchmark_core_crypto.yml
vendored
2
.github/workflows/benchmark_core_crypto.yml
vendored
@@ -68,7 +68,7 @@ jobs:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
|
||||
22
.github/workflows/benchmark_dex.yml
vendored
22
.github/workflows/benchmark_dex.yml
vendored
@@ -68,7 +68,7 @@ jobs:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -98,27 +98,15 @@ jobs:
|
||||
env:
|
||||
REF_NAME: ${{ github.ref_name }}
|
||||
|
||||
- name: Parse swap request update PBS counts
|
||||
- name: Parse swap request PBS counts
|
||||
run: |
|
||||
python3 ./ci/benchmark_parser.py tfhe-benchmark/dex_swap_request_update_dex_balance_pbs_count.csv "${RESULTS_FILENAME}" \
|
||||
python3 ./ci/benchmark_parser.py tfhe-benchmark/dex_swap_request_pbs_count.csv "${RESULTS_FILENAME}" \
|
||||
--object-sizes \
|
||||
--append-results
|
||||
|
||||
- name: Parse swap request finalize PBS counts
|
||||
- name: Parse swap claim PBS counts
|
||||
run: |
|
||||
python3 ./ci/benchmark_parser.py tfhe-benchmark/dex_swap_request_finalize_pbs_count.csv "${RESULTS_FILENAME}" \
|
||||
--object-sizes \
|
||||
--append-results
|
||||
|
||||
- name: Parse swap claim prepare PBS counts
|
||||
run: |
|
||||
python3 ./ci/benchmark_parser.py tfhe-benchmark/dex_swap_claim_prepare_pbs_count.csv "${RESULTS_FILENAME}" \
|
||||
--object-sizes \
|
||||
--append-results
|
||||
|
||||
- name: Parse swap claim update PBS counts
|
||||
run: |
|
||||
python3 ./ci/benchmark_parser.py tfhe-benchmark/dex_swap_claim_update_dex_balance_pbs_count.csv "${RESULTS_FILENAME}" \
|
||||
python3 ./ci/benchmark_parser.py tfhe-benchmark/dex_swap_claim_pbs_count.csv "${RESULTS_FILENAME}" \
|
||||
--object-sizes \
|
||||
--append-results
|
||||
|
||||
|
||||
2
.github/workflows/benchmark_erc20.yml
vendored
2
.github/workflows/benchmark_erc20.yml
vendored
@@ -69,7 +69,7 @@ jobs:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
|
||||
7
.github/workflows/benchmark_gpu_4090.yml
vendored
7
.github/workflows/benchmark_gpu_4090.yml
vendored
@@ -57,7 +57,7 @@ jobs:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -140,7 +140,7 @@ jobs:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -168,8 +168,7 @@ jobs:
|
||||
--commit-date "${COMMIT_DATE}" \
|
||||
--bench-date "${BENCH_DATE}" \
|
||||
--walk-subdirs \
|
||||
env:
|
||||
REF_NAME: ${{ github.ref_name }}
|
||||
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
|
||||
|
||||
26
.github/workflows/benchmark_gpu_common.yml
vendored
26
.github/workflows/benchmark_gpu_common.yml
vendored
@@ -122,23 +122,31 @@ jobs:
|
||||
|
||||
- name: Set command output
|
||||
id: set_command
|
||||
run: | # zizmor: ignore[template-injection] this env variable is safe
|
||||
echo "command=${{ toJSON(env.COMMAND) }}" >> "${GITHUB_OUTPUT}"
|
||||
run: |
|
||||
echo "command=${COMMAND_OUTPUT}" >> "${GITHUB_OUTPUT}"
|
||||
env:
|
||||
COMMAND_OUTPUT: ${{ toJSON(env.COMMAND) }}
|
||||
|
||||
- name: Set operation flavor output
|
||||
id: set_op_flavor
|
||||
run: | # zizmor: ignore[template-injection] this env variable is safe
|
||||
echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"
|
||||
run: |
|
||||
echo "op_flavor=${OP_FLAVOR_OUTPUT}" >> "${GITHUB_OUTPUT}"
|
||||
env:
|
||||
OP_FLAVOR_OUTPUT: ${{ toJSON(env.OP_FLAVOR) }}
|
||||
|
||||
- name: Set benchmark types output
|
||||
id: set_bench_type
|
||||
run: | # zizmor: ignore[template-injection] this env variable is safe
|
||||
echo "bench_type=${{ toJSON(env.BENCH_TYPE) }}" >> "${GITHUB_OUTPUT}"
|
||||
run: |
|
||||
echo "bench_type=${BENCH_TYPE_OUTPUT}" >> "${GITHUB_OUTPUT}"
|
||||
env:
|
||||
BENCH_TYPE_OUTPUT: ${{ toJSON(env.BENCH_TYPE) }}
|
||||
|
||||
- name: Set parameters types output
|
||||
id: set_params_type
|
||||
run: | # zizmor: ignore[template-injection] this env variable is safe
|
||||
echo "params_type=${{ toJSON(env.PARAMS_TYPE) }}" >> "${GITHUB_OUTPUT}"
|
||||
run: |
|
||||
echo "params_type=${PARAMS_TYPE_OUTPUT}" >> "${GITHUB_OUTPUT}"
|
||||
env:
|
||||
PARAMS_TYPE_OUTPUT: ${{ toJSON(env.PARAMS_TYPE) }}
|
||||
|
||||
setup-instance:
|
||||
name: Setup instance (cuda-${{ inputs.profile }}-benchmarks)
|
||||
@@ -269,7 +277,7 @@ jobs:
|
||||
GCC_VERSION: ${{ matrix.gcc }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
|
||||
@@ -129,7 +129,7 @@ jobs:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
|
||||
@@ -130,7 +130,7 @@ jobs:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
|
||||
4
.github/workflows/benchmark_hpu_integer.yml
vendored
4
.github/workflows/benchmark_hpu_integer.yml
vendored
@@ -33,7 +33,6 @@ jobs:
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
lfs: true
|
||||
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Get benchmark details
|
||||
@@ -48,7 +47,7 @@ jobs:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -62,7 +61,6 @@ jobs:
|
||||
|
||||
- name: Run benchmarks
|
||||
run: |
|
||||
git lfs pull --include="*" --exclude=""
|
||||
make bench_integer_hpu
|
||||
make bench_hlapi_erc20_hpu
|
||||
|
||||
|
||||
14
.github/workflows/benchmark_integer.yml
vendored
14
.github/workflows/benchmark_integer.yml
vendored
@@ -78,13 +78,17 @@ jobs:
|
||||
|
||||
- name: Set operation flavor output
|
||||
id: set_op_flavor
|
||||
run: | # zizmor: ignore[template-injection] this env variable is safe
|
||||
echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"
|
||||
run: |
|
||||
echo "op_flavor=${OP_FLAVOR_OUTPUT}" >> "${GITHUB_OUTPUT}"
|
||||
env:
|
||||
OP_FLAVOR_OUTPUT: ${{ toJSON(env.OP_FLAVOR) }}
|
||||
|
||||
- name: Set benchmark types output
|
||||
id: set_bench_type
|
||||
run: | # zizmor: ignore[template-injection] this env variable is safe
|
||||
echo "bench_type=${{ toJSON(env.BENCH_TYPE) }}" >> "${GITHUB_OUTPUT}"
|
||||
run: |
|
||||
echo "bench_type=${BENCH_TYPE_OUTPUT}" >> "${GITHUB_OUTPUT}"
|
||||
env:
|
||||
BENCH_TYPE_OUTPUT: ${{ toJSON(env.BENCH_TYPE) }}
|
||||
|
||||
setup-instance:
|
||||
name: Setup instance (integer-benchmarks)
|
||||
@@ -138,7 +142,7 @@ jobs:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
|
||||
8
.github/workflows/benchmark_shortint.yml
vendored
8
.github/workflows/benchmark_shortint.yml
vendored
@@ -47,8 +47,10 @@ jobs:
|
||||
|
||||
- name: Set operation flavor output
|
||||
id: set_op_flavor
|
||||
run: | # zizmor: ignore[template-injection] this env variable is safe
|
||||
echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"
|
||||
run: |
|
||||
echo "op_flavor=${OP_FLAVOR_OUTPUT}" >> "${GITHUB_OUTPUT}"
|
||||
env:
|
||||
OP_FLAVOR_OUTPUT: ${{ toJSON(env.OP_FLAVOR) }}
|
||||
|
||||
setup-instance:
|
||||
name: Setup instance (shortint-benchmarks)
|
||||
@@ -99,7 +101,7 @@ jobs:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
|
||||
14
.github/workflows/benchmark_signed_integer.yml
vendored
14
.github/workflows/benchmark_signed_integer.yml
vendored
@@ -78,13 +78,17 @@ jobs:
|
||||
|
||||
- name: Set operation flavor output
|
||||
id: set_op_flavor
|
||||
run: | # zizmor: ignore[template-injection] this env variable is safe
|
||||
echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"
|
||||
run: |
|
||||
echo "op_flavor=${OP_FLAVOR_OUTPUT}" >> "${GITHUB_OUTPUT}"
|
||||
env:
|
||||
OP_FLAVOR_OUTPUT: ${{ toJSON(env.OP_FLAVOR) }}
|
||||
|
||||
- name: Set benchmark types output
|
||||
id: set_bench_type
|
||||
run: | # zizmor: ignore[template-injection] this env variable is safe
|
||||
echo "bench_type=${{ toJSON(env.BENCH_TYPE) }}" >> "${GITHUB_OUTPUT}"
|
||||
run: |
|
||||
echo "bench_type=${BENCH_TYPE_OUTPUT}" >> "${GITHUB_OUTPUT}"
|
||||
env:
|
||||
BENCH_TYPE_OUTPUT: ${{ toJSON(env.BENCH_TYPE) }}
|
||||
|
||||
setup-instance:
|
||||
name: Setup instance (signed-integer-benchmarks)
|
||||
@@ -138,7 +142,7 @@ jobs:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
|
||||
4
.github/workflows/benchmark_tfhe_zk_pok.yml
vendored
4
.github/workflows/benchmark_tfhe_zk_pok.yml
vendored
@@ -108,7 +108,7 @@ jobs:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -144,7 +144,7 @@ jobs:
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
|
||||
with:
|
||||
name: ${{ github.sha }}_tfhe_zk_pok_${{ env.BENCH_TYPE }}
|
||||
name: ${{ github.sha }}_tfhe_zk_pok
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
|
||||
2
.github/workflows/benchmark_wasm_client.yml
vendored
2
.github/workflows/benchmark_wasm_client.yml
vendored
@@ -106,7 +106,7 @@ jobs:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
|
||||
10
.github/workflows/benchmark_zk_pke.yml
vendored
10
.github/workflows/benchmark_zk_pke.yml
vendored
@@ -92,8 +92,10 @@ jobs:
|
||||
|
||||
- name: Set benchmark types output
|
||||
id: set_bench_type
|
||||
run: | # zizmor: ignore[template-injection] this env variable is safe
|
||||
echo "bench_type=${{ toJSON(env.BENCH_TYPE) }}" >> "${GITHUB_OUTPUT}"
|
||||
run: |
|
||||
echo "bench_type=${BENCH_TYPE_OUTPUT}" >> "${GITHUB_OUTPUT}"
|
||||
env:
|
||||
BENCH_TYPE_OUTPUT: ${{ toJSON(env.BENCH_TYPE) }}
|
||||
|
||||
setup-instance:
|
||||
name: Setup instance (pke-zk-benchmarks)
|
||||
@@ -150,7 +152,7 @@ jobs:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -194,7 +196,7 @@ jobs:
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
|
||||
with:
|
||||
name: ${{ github.sha }}_integer_zk_${{ matrix.bench_type }}
|
||||
name: ${{ github.sha }}_integer_zk
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
|
||||
10
.github/workflows/cargo_build.yml
vendored
10
.github/workflows/cargo_build.yml
vendored
@@ -35,7 +35,7 @@ jobs:
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -49,14 +49,6 @@ jobs:
|
||||
mv linelint-linux-amd64 /usr/local/bin/linelint
|
||||
make check_newline
|
||||
|
||||
# This is needed for the ws tests clippy checks
|
||||
- name: Use specific data branch
|
||||
if: ${{ contains(github.event.pull_request.labels.*.name, 'data_PR') }}
|
||||
env:
|
||||
PR_BRANCH: ${{ github.head_ref || github.ref_name }}
|
||||
run: |
|
||||
echo "BACKWARD_COMPAT_DATA_BRANCH=${PR_BRANCH}" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Run pcc checks
|
||||
if: ${{ contains(matrix.os, 'ubuntu') }}
|
||||
run: |
|
||||
|
||||
7
.github/workflows/check_commit.yml
vendored
7
.github/workflows/check_commit.yml
vendored
@@ -3,15 +3,14 @@ name: Check commit and PR compliance
|
||||
on:
|
||||
pull_request:
|
||||
|
||||
permissions: {}
|
||||
permissions:
|
||||
contents: read
|
||||
pull-requests: read # Permission needed to scan commits in a pull-request
|
||||
|
||||
jobs:
|
||||
check-commit-pr:
|
||||
name: Check commit and PR
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: read
|
||||
pull-requests: write # Permission needed to scan commits in a pull-request and write issue comment
|
||||
steps:
|
||||
- name: Check first line
|
||||
uses: gsactions/commit-message-checker@16fa2d5de096ae0d35626443bcd24f1e756cafee
|
||||
|
||||
2
.github/workflows/code_coverage.yml
vendored
2
.github/workflows/code_coverage.yml
vendored
@@ -54,7 +54,7 @@ jobs:
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
|
||||
@@ -66,7 +66,7 @@ jobs:
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
|
||||
115
.github/workflows/data_pr_close.yml
vendored
115
.github/workflows/data_pr_close.yml
vendored
@@ -3,7 +3,7 @@ name: Close or Merge corresponding PR on the data repo
|
||||
# When a PR with the data_PR tag is closed or merged, this will close the corresponding PR in the data repo.
|
||||
|
||||
env:
|
||||
DATA_REPO: zama-ai/tfhe-backward-compat-data
|
||||
TARGET_REPO_API_URL: ${{ github.api_url }}/repos/zama-ai/tfhe-backward-compat-data
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
@@ -16,43 +16,122 @@ on:
|
||||
pull_request:
|
||||
types: [ closed ]
|
||||
|
||||
# The same pattern is used for jobs that use the github api:
|
||||
# - save the result of the API call in the env var "GH_API_RES". Since the var is multiline
|
||||
# we use this trick: https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#example-of-a-multiline-string
|
||||
# - "set +e" will make sure we reach the last "echo EOF" even in case of error
|
||||
# - "set -o" pipefail makes one line piped command return the error of the first failure
|
||||
# - 'RES="$?"' and 'exit $RES' are used to return the error code if a command failed. Without it, with "set +e"
|
||||
# the script will always return 0 because of the "echo EOF".
|
||||
|
||||
|
||||
|
||||
permissions: {}
|
||||
|
||||
jobs:
|
||||
auto_close_job:
|
||||
if: ${{ contains(github.event.pull_request.labels.*.name, 'data_PR') && github.repository == 'zama-ai/tfhe-rs' }}
|
||||
if: ${{ contains(github.event.pull_request.labels.*.name, 'data_PR') }}
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.FHE_ACTIONS_TOKEN }} # Needed for gh CLI commands
|
||||
steps:
|
||||
- name: Fetch PR number
|
||||
- name: Find corresponding Pull Request in the data repo
|
||||
run: |
|
||||
PR_NUMBER=$(gh pr view "${PR_BRANCH}" --repo "${DATA_REPO}" --json number | jq '.number')
|
||||
echo "DATA_REPO_PR_NUMBER=${PR_NUMBER}" >> "${GITHUB_ENV}"
|
||||
{
|
||||
set +e
|
||||
set -o pipefail
|
||||
echo 'TARGET_REPO_PR<<EOF'
|
||||
curl --fail-with-body --no-progress-meter -L -X GET \
|
||||
-H "Accept: application/vnd.github+json" \
|
||||
-H "X-GitHub-Api-Version: 2022-11-28" \
|
||||
"${TARGET_REPO_API_URL}"/pulls\?head="${REPO_OWNER}":"${PR_BRANCH}" | jq -e '.[0]' | sed 's/null/{ "message": "corresponding PR not found" }/'
|
||||
RES="$?"
|
||||
echo EOF
|
||||
} >> "${GITHUB_ENV}"
|
||||
exit $RES
|
||||
env:
|
||||
REPO_OWNER: ${{ github.repository_owner }}
|
||||
|
||||
- name: Comment on the PR to indicate the reason of the close
|
||||
run: |
|
||||
gh pr comment "${PR_BRANCH}" \
|
||||
--repo "${DATA_REPO}" \
|
||||
--body "PR ${CLOSE_TYPE}d because the corresponding PR in main repo was ${CLOSE_TYPE}d: ${REPO}#${EVENT_NUMBER}"
|
||||
BODY="'{ \"body\": \"PR ${CLOSE_TYPE}d because the corresponding PR in main repo was ${CLOSE_TYPE}d: ${REPO}#${EVENT_NUMBER}\" }'"
|
||||
{
|
||||
set +e
|
||||
set -o pipefail
|
||||
echo 'GH_API_RES<<EOF'
|
||||
curl --fail-with-body --no-progress-meter -L -X POST \
|
||||
-H "Accept: application/vnd.github+json" \
|
||||
-H "Authorization: Bearer ${TOKEN}" \
|
||||
-H "X-GitHub-Api-Version: 2022-11-28" \
|
||||
"${COMMENTS_URL}" \
|
||||
-d "${BODY}"
|
||||
RES="$?"
|
||||
echo EOF
|
||||
} >> "${GITHUB_ENV}"
|
||||
exit $RES
|
||||
env:
|
||||
REPO: ${{ github.repository }}
|
||||
EVENT_NUMBER: ${{ github.event.number }}
|
||||
COMMENTS_URL: ${{ fromJson(env.TARGET_REPO_PR).comments_url }}
|
||||
TOKEN: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Merge the Pull Request in the data repo
|
||||
if: ${{ github.event.pull_request.merged }}
|
||||
run: |
|
||||
gh pr merge "${PR_BRANCH}" \
|
||||
--repo "${DATA_REPO}" \
|
||||
--rebase \
|
||||
--delete-branch
|
||||
{
|
||||
set +e
|
||||
set -o pipefail
|
||||
echo 'GH_API_RES<<EOF'
|
||||
curl --fail-with-body --no-progress-meter -L -X PUT \
|
||||
-H "Accept: application/vnd.github+json" \
|
||||
-H "Authorization: Bearer ${TOKEN}" \
|
||||
-H "X-GitHub-Api-Version: 2022-11-28" \
|
||||
"${TARGET_REPO_PR_URL}"/merge \
|
||||
-d '{ "merge_method": "rebase" }'
|
||||
RES="$?"
|
||||
echo EOF
|
||||
} >> "${GITHUB_ENV}"
|
||||
exit $RES
|
||||
env:
|
||||
TARGET_REPO_PR_URL: ${{ fromJson(env.TARGET_REPO_PR).url }}
|
||||
TOKEN: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Close the Pull Request in the data repo
|
||||
if: ${{ !github.event.pull_request.merged }}
|
||||
run: |
|
||||
gh pr close "${PR_BRANCH}" \
|
||||
--repo "${DATA_REPO}" \
|
||||
--delete-branch
|
||||
{
|
||||
set +e
|
||||
set -o pipefail
|
||||
echo 'GH_API_RES<<EOF'
|
||||
curl --fail-with-body --no-progress-meter -L -X PATCH \
|
||||
-H "Accept: application/vnd.github+json" \
|
||||
-H "Authorization: Bearer ${TOKEN}" \
|
||||
-H "X-GitHub-Api-Version: 2022-11-28" \
|
||||
"${TARGET_REPO_PR_URL}" \
|
||||
-d '{ "state": "closed" }'
|
||||
RES="$?"
|
||||
echo EOF
|
||||
} >> "${GITHUB_ENV}"
|
||||
exit $RES
|
||||
env:
|
||||
TARGET_REPO_PR_URL: ${{ fromJson(env.TARGET_REPO_PR).url }}
|
||||
TOKEN: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Delete the associated branch in the data repo
|
||||
run: |
|
||||
{
|
||||
set +e
|
||||
set -o pipefail
|
||||
echo 'GH_API_RES<<EOF'
|
||||
curl --fail-with-body --no-progress-meter -L -X DELETE \
|
||||
-H "Accept: application/vnd.github+json" \
|
||||
-H "Authorization: Bearer ${TOKEN}" \
|
||||
-H "X-GitHub-Api-Version: 2022-11-28" \
|
||||
"${TARGET_REPO_API_URL}"/git/refs/heads/"${PR_BRANCH}"
|
||||
RES="$?"
|
||||
echo EOF
|
||||
} >> "${GITHUB_ENV}"
|
||||
exit $RES
|
||||
env:
|
||||
TOKEN: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ always() && job.status == 'failure' }}
|
||||
@@ -60,4 +139,4 @@ jobs:
|
||||
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "Failed to auto-${{ env.CLOSE_TYPE }} PR on data repo: https://github.com/${{ env.DATA_REPO }}/pull/${{ env.DATA_REPO_PR_NUMBER }}"
|
||||
SLACK_MESSAGE: "Failed to auto-${{ env.CLOSE_TYPE }} PR on data repo: ${{ fromJson(env.GH_API_RES || env.TARGET_REPO_PR).message }}"
|
||||
|
||||
2
.github/workflows/gpu_4090_tests.yml
vendored
2
.github/workflows/gpu_4090_tests.yml
vendored
@@ -45,7 +45,7 @@ jobs:
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
|
||||
2
.github/workflows/gpu_fast_h100_tests.yml
vendored
2
.github/workflows/gpu_fast_h100_tests.yml
vendored
@@ -140,7 +140,7 @@ jobs:
|
||||
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
|
||||
2
.github/workflows/gpu_fast_tests.yml
vendored
2
.github/workflows/gpu_fast_tests.yml
vendored
@@ -124,7 +124,7 @@ jobs:
|
||||
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
|
||||
2
.github/workflows/gpu_full_h100_tests.yml
vendored
2
.github/workflows/gpu_full_h100_tests.yml
vendored
@@ -79,7 +79,7 @@ jobs:
|
||||
gcc-version: ${{ matrix.gcc }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
|
||||
@@ -126,7 +126,7 @@ jobs:
|
||||
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
|
||||
@@ -72,7 +72,7 @@ jobs:
|
||||
gcc-version: ${{ matrix.gcc }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
|
||||
2
.github/workflows/gpu_pcc.yml
vendored
2
.github/workflows/gpu_pcc.yml
vendored
@@ -94,7 +94,7 @@ jobs:
|
||||
CUDA_VERSION: ${{ matrix.cuda }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
|
||||
@@ -126,7 +126,7 @@ jobs:
|
||||
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
|
||||
@@ -140,7 +140,7 @@ jobs:
|
||||
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
|
||||
@@ -130,7 +130,7 @@ jobs:
|
||||
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
|
||||
@@ -126,7 +126,7 @@ jobs:
|
||||
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
|
||||
@@ -140,7 +140,7 @@ jobs:
|
||||
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
|
||||
@@ -130,7 +130,7 @@ jobs:
|
||||
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
|
||||
2
.github/workflows/integer_long_run_tests.yml
vendored
2
.github/workflows/integer_long_run_tests.yml
vendored
@@ -57,7 +57,7 @@ jobs:
|
||||
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
|
||||
2
.github/workflows/m1_tests.yml
vendored
2
.github/workflows/m1_tests.yml
vendored
@@ -46,7 +46,7 @@ jobs:
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
|
||||
7
.github/workflows/make_release_cuda.yml
vendored
7
.github/workflows/make_release_cuda.yml
vendored
@@ -67,7 +67,7 @@ jobs:
|
||||
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -93,9 +93,6 @@ jobs:
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${GCC_VERSION}";
|
||||
echo "HOME=/home/ubuntu";
|
||||
} >> "${GITHUB_ENV}"
|
||||
env:
|
||||
GCC_VERSION: ${{ matrix.gcc }}
|
||||
|
||||
- name: Prepare package
|
||||
run: |
|
||||
cargo package -p tfhe-cuda-backend
|
||||
@@ -134,7 +131,7 @@ jobs:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
steps:
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
|
||||
3
.gitignore
vendored
3
.gitignore
vendored
@@ -40,6 +40,3 @@ __pycache__
|
||||
# First directive is to ignore symlinks
|
||||
tests/tfhe-backward-compat-data
|
||||
ci/
|
||||
|
||||
# In case someone clones the lattice-estimator locally to verify security
|
||||
/lattice-estimator
|
||||
|
||||
14
CODEOWNERS
14
CODEOWNERS
@@ -1,28 +1,18 @@
|
||||
# Specifying a path without code owners means that path won't have owners and is akin to a negation
|
||||
# i.e. the `core_crypto` dir is owned and needs owner approval/review, but not the `gpu` sub dir
|
||||
# See https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners#example-of-a-codeowners-file
|
||||
|
||||
/backends/tfhe-cuda-backend/ @agnesLeroy
|
||||
/backends/tfhe-hpu-backend/ @zama-ai/hardware
|
||||
|
||||
/tfhe/examples/hpu @zama-ai/hardware
|
||||
|
||||
/tfhe/src/core_crypto/ @IceTDrinker
|
||||
/tfhe/src/core_crypto/gpu @agnesLeroy
|
||||
/tfhe/src/core_crypto/hpu @zama-ai/hardware
|
||||
/tfhe/src/core_crypto/gpu
|
||||
|
||||
/tfhe/src/shortint/ @mayeul-zama
|
||||
|
||||
/tfhe/src/integer/ @tmontaigu
|
||||
/tfhe/src/integer/gpu @agnesLeroy
|
||||
/tfhe/src/integer/hpu @zama-ai/hardware
|
||||
/tfhe/src/integer/gpu
|
||||
|
||||
/tfhe/src/high_level_api/ @tmontaigu
|
||||
|
||||
/Makefile @IceTDrinker @soonum
|
||||
|
||||
/mockups/tfhe-hpu-mockup @zama-ai/hardware
|
||||
|
||||
/.github/ @soonum
|
||||
|
||||
/CODEOWNERS @IceTDrinker
|
||||
|
||||
51
Makefile
51
Makefile
@@ -23,20 +23,9 @@ BENCH_PARAM_TYPE?=classical
|
||||
BENCH_PARAMS_SET?=default
|
||||
NODE_VERSION=22.6
|
||||
BACKWARD_COMPAT_DATA_URL=https://github.com/zama-ai/tfhe-backward-compat-data.git
|
||||
BACKWARD_COMPAT_DATA_DEFAULT_BRANCH:=$(shell ./scripts/backward_compat_data_version.py)
|
||||
BACKWARD_COMPAT_DATA_BRANCH?=$(BACKWARD_COMPAT_DATA_DEFAULT_BRANCH)
|
||||
BACKWARD_COMPAT_DATA_BRANCH?=$(shell ./scripts/backward_compat_data_version.py)
|
||||
BACKWARD_COMPAT_DATA_PROJECT=tfhe-backward-compat-data
|
||||
BACKWARD_COMPAT_DATA_DIR=$(BACKWARD_COMPAT_DATA_PROJECT)
|
||||
ifeq ($(BACKWARD_COMPAT_DATA_DEFAULT_BRANCH), $(BACKWARD_COMPAT_DATA_BRANCH))
|
||||
BACKWARD_COMPAT_CLIPPY_PATCH=
|
||||
else
|
||||
# We need to override the url for cargo patch accept it, see: https://github.com/rust-lang/cargo/issues/5478
|
||||
BACKWARD_COMPAT_PATCHED_URL=https://www.github.com/zama-ai/tfhe-backward-compat-data.git
|
||||
BACKWARD_COMPAT_CLIPPY_PATCH=\
|
||||
--config "patch.'$(BACKWARD_COMPAT_DATA_URL)'.$(BACKWARD_COMPAT_DATA_PROJECT).branch=\"$(BACKWARD_COMPAT_DATA_BRANCH)\"" \
|
||||
--config "patch.'$(BACKWARD_COMPAT_DATA_URL)'.$(BACKWARD_COMPAT_DATA_PROJECT).git=\"$(BACKWARD_COMPAT_PATCHED_URL)\""
|
||||
endif
|
||||
|
||||
TFHE_SPEC:=tfhe
|
||||
WASM_PACK_VERSION="0.13.1"
|
||||
# We are kind of hacking the cut here, the version cannot contain a quote '"'
|
||||
@@ -181,13 +170,9 @@ install_typos_checker: install_rs_build_toolchain
|
||||
.PHONY: install_zizmor # Install zizmor workflow security checker
|
||||
install_zizmor: install_rs_build_toolchain
|
||||
@zizmor --version > /dev/null 2>&1 || \
|
||||
cargo $(CARGO_RS_BUILD_TOOLCHAIN) install zizmor --version ~1.9 || \
|
||||
cargo $(CARGO_RS_BUILD_TOOLCHAIN) install zizmor || \
|
||||
( echo "Unable to install zizmor, unknown error." && exit 1 )
|
||||
|
||||
.PHONY: install_cargo_cross # Install custom tfhe-rs lints
|
||||
install_cargo_cross: install_rs_build_toolchain
|
||||
cargo $(CARGO_RS_BUILD_TOOLCHAIN) install cross
|
||||
|
||||
.PHONY: setup_venv # Setup Python virtualenv for wasm tests
|
||||
setup_venv:
|
||||
python3 -m venv venv
|
||||
@@ -453,7 +438,6 @@ clippy_trivium: install_rs_check_toolchain
|
||||
.PHONY: clippy_ws_tests # Run clippy on the workspace level tests
|
||||
clippy_ws_tests: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --tests \
|
||||
$(BACKWARD_COMPAT_CLIPPY_PATCH) \
|
||||
-p tests --features=shortint,integer,zk-pok -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_all_targets # Run clippy lints on all targets (benches, examples, etc.)
|
||||
@@ -474,8 +458,6 @@ clippy_tfhe_csprng: install_rs_check_toolchain
|
||||
clippy_zk_pok: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
|
||||
-p tfhe-zk-pok -- --no-deps -D warnings
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
|
||||
-p tfhe-zk-pok --features=experimental -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_versionable # Run clippy lints on tfhe-versionable
|
||||
clippy_versionable: install_rs_check_toolchain
|
||||
@@ -514,12 +496,6 @@ clippy_hpu_backend: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
|
||||
-p tfhe-hpu-backend -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_hpu_mockup # Run clippy lints on tfhe-hpu-mockup
|
||||
clippy_hpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--all-targets \
|
||||
-p tfhe-hpu-backend -- --no-deps -D warnings
|
||||
|
||||
.PHONY: check_rust_bindings_did_not_change # Check rust bindings are up to date for tfhe-cuda-backend
|
||||
check_rust_bindings_did_not_change:
|
||||
cargo build -p tfhe-cuda-backend && "$(MAKE)" fmt_gpu && \
|
||||
@@ -532,9 +508,6 @@ check_rust_bindings_did_not_change:
|
||||
tfhe_lints: install_cargo_dylint
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo dylint --all -p tfhe --no-deps -- \
|
||||
--features=boolean,shortint,integer,strings,zk-pok
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo dylint --all -p tfhe-zk-pok --no-deps -- \
|
||||
--features=experimental
|
||||
|
||||
|
||||
.PHONY: build_core # Build core_crypto without experimental features
|
||||
build_core: install_rs_build_toolchain install_rs_check_toolchain
|
||||
@@ -676,14 +649,6 @@ test_integer_gpu: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
|
||||
--features=integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::
|
||||
|
||||
.PHONY: test_integer_gpu_debug # Run the tests of the integer module with Debug flags for CUDA
|
||||
test_integer_gpu_debug: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile release_lto_off \
|
||||
--features=integer,gpu-debug -vv -p $(TFHE_SPEC) -- integer::gpu::server_key:: --test-threads=1 --nocapture
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile release_lto_off \
|
||||
--features=integer,gpu-debug -p $(TFHE_SPEC) -- integer::gpu::server_key::
|
||||
|
||||
|
||||
.PHONY: test_integer_long_run_gpu # Run the long run integer tests on the gpu backend
|
||||
test_integer_long_run_gpu: install_rs_check_toolchain install_cargo_nextest
|
||||
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
|
||||
@@ -1015,16 +980,10 @@ test_tfhe_csprng: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
-p tfhe-csprng
|
||||
|
||||
.PHONY: test_tfhe_csprng_big_endian # Run tfhe-csprng tests on an emulated big endian system
|
||||
test_tfhe_csprng_big_endian: install_rs_build_toolchain install_cargo_cross
|
||||
RUSTFLAGS="" cross $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
-p tfhe-csprng --target=powerpc64-unknown-linux-gnu
|
||||
|
||||
|
||||
.PHONY: test_zk_pok # Run tfhe-zk-pok tests
|
||||
test_zk_pok: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
-p tfhe-zk-pok --features experimental
|
||||
-p tfhe-zk-pok
|
||||
|
||||
.PHONY: test_zk_wasm_x86_compat_ci
|
||||
test_zk_wasm_x86_compat_ci: check_nvm_installed
|
||||
@@ -1520,7 +1479,7 @@ parse_wasm_benchmarks: install_rs_check_toolchain
|
||||
.PHONY: write_params_to_file # Gather all crypto parameters into a file with a Sage readable format.
|
||||
write_params_to_file: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) run \
|
||||
--example write_params_to_file --features=boolean,shortint,hpu,internal-keycache
|
||||
--example write_params_to_file --features=boolean,shortint,internal-keycache
|
||||
|
||||
.PHONY: clone_backward_compat_data # Clone the data repo needed for backward compatibility tests
|
||||
clone_backward_compat_data:
|
||||
@@ -1560,7 +1519,7 @@ pcc_gpu: check_rust_bindings_did_not_change clippy_rustdoc_gpu \
|
||||
clippy_gpu clippy_cuda_backend clippy_bench_gpu check_compile_tests_benches_gpu
|
||||
|
||||
.PHONY: pcc_hpu # pcc stands for pre commit checks for HPU compilation
|
||||
pcc_hpu: clippy_hpu clippy_hpu_backend clippy_hpu_mockup test_integer_hpu_mockup_ci_fast
|
||||
pcc_hpu: clippy_hpu clippy_hpu_backend test_integer_hpu_mockup_ci_fast
|
||||
|
||||
.PHONY: fpcc # pcc stands for pre commit checks, the f stands for fast
|
||||
fpcc: no_tfhe_typo no_dbg_log check_parameter_export_ok check_fmt check_typos lint_doc \
|
||||
|
||||
@@ -18,7 +18,6 @@
|
||||
<a href="https://github.com/zama-ai/tfhe-rs/releases"><img src="https://img.shields.io/github/v/release/zama-ai/tfhe-rs?style=flat-square"></a>
|
||||
<a href="LICENSE"><img src="https://img.shields.io/badge/License-BSD--3--Clause--Clear-%23ffb243?style=flat-square"></a>
|
||||
<a href="https://github.com/zama-ai/bounty-program"><img src="https://img.shields.io/badge/Contribute-Zama%20Bounty%20Program-%23ffd208?style=flat-square"></a>
|
||||
<a href="https://slsa.dev"><img alt="SLSA 3" src="https://slsa.dev/images/gh-badge-level3.svg" /></a>
|
||||
</p>
|
||||
|
||||
## About
|
||||
|
||||
@@ -129,7 +129,7 @@ Other sizes than 64 bit are expected to be available in the future.
|
||||
|
||||
# FHE shortint Trivium implementation
|
||||
|
||||
The same implementation is also available for generic Ciphertexts representing bits (meant to be used with parameters `V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128`).
|
||||
The same implementation is also available for generic Ciphertexts representing bits (meant to be used with parameters `V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128`).
|
||||
It uses a lower level API of tfhe-rs, so the syntax is a little bit different. It also implements the `TransCiphering` trait. For optimization purposes, it does not internally run
|
||||
on the same cryptographic parameters as the high level API of tfhe-rs. As such, it requires the usage of a casting key, to switch from one parameter space to another, which makes
|
||||
its setup a little more intricate.
|
||||
@@ -137,10 +137,10 @@ its setup a little more intricate.
|
||||
Example code:
|
||||
```rust
|
||||
use tfhe::shortint::prelude::*;
|
||||
use tfhe::shortint::parameters::current_params::{
|
||||
V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
|
||||
V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
|
||||
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
|
||||
use tfhe::shortint::parameters::v1_2::{
|
||||
V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
|
||||
V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
|
||||
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
|
||||
};
|
||||
use tfhe::{ConfigBuilder, generate_keys, FheUint64};
|
||||
use tfhe::prelude::*;
|
||||
@@ -148,17 +148,17 @@ use tfhe_trivium::TriviumStreamShortint;
|
||||
|
||||
fn test_shortint() {
|
||||
let config = ConfigBuilder::default()
|
||||
.use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
|
||||
.use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
|
||||
.build();
|
||||
let (hl_client_key, hl_server_key) = generate_keys(config);
|
||||
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
|
||||
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
|
||||
|
||||
let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
|
||||
let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
|
||||
let ksk = KeySwitchingKey::new(
|
||||
(&client_key, Some(&server_key)),
|
||||
(&underlying_ck, &underlying_sk),
|
||||
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128_2M128,
|
||||
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128_2M128,
|
||||
);
|
||||
|
||||
let key_string = "0053A6F94C9FF24598EB".to_string();
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
use criterion::Criterion;
|
||||
use tfhe::prelude::*;
|
||||
use tfhe::shortint::parameters::current_params::{
|
||||
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
|
||||
V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
|
||||
V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
|
||||
use tfhe::shortint::parameters::v1_2::{
|
||||
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
|
||||
V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
|
||||
V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
|
||||
};
|
||||
use tfhe::shortint::prelude::*;
|
||||
use tfhe::{generate_keys, ConfigBuilder, FheUint64};
|
||||
@@ -11,19 +11,19 @@ use tfhe_trivium::{KreyviumStreamShortint, TransCiphering};
|
||||
|
||||
pub fn kreyvium_shortint_warmup(c: &mut Criterion) {
|
||||
let config = ConfigBuilder::default()
|
||||
.use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
|
||||
.use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
|
||||
.build();
|
||||
let (hl_client_key, hl_server_key) = generate_keys(config);
|
||||
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
|
||||
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
|
||||
|
||||
let (client_key, server_key): (ClientKey, ServerKey) =
|
||||
gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
|
||||
gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
|
||||
|
||||
let ksk = KeySwitchingKey::new(
|
||||
(&client_key, Some(&server_key)),
|
||||
(&underlying_ck, &underlying_sk),
|
||||
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
|
||||
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
|
||||
);
|
||||
|
||||
let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
|
||||
@@ -64,19 +64,19 @@ pub fn kreyvium_shortint_warmup(c: &mut Criterion) {
|
||||
|
||||
pub fn kreyvium_shortint_gen(c: &mut Criterion) {
|
||||
let config = ConfigBuilder::default()
|
||||
.use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
|
||||
.use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
|
||||
.build();
|
||||
let (hl_client_key, hl_server_key) = generate_keys(config);
|
||||
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
|
||||
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
|
||||
|
||||
let (client_key, server_key): (ClientKey, ServerKey) =
|
||||
gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
|
||||
gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
|
||||
|
||||
let ksk = KeySwitchingKey::new(
|
||||
(&client_key, Some(&server_key)),
|
||||
(&underlying_ck, &underlying_sk),
|
||||
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
|
||||
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
|
||||
);
|
||||
|
||||
let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
|
||||
@@ -112,19 +112,19 @@ pub fn kreyvium_shortint_gen(c: &mut Criterion) {
|
||||
|
||||
pub fn kreyvium_shortint_trans(c: &mut Criterion) {
|
||||
let config = ConfigBuilder::default()
|
||||
.use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
|
||||
.use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
|
||||
.build();
|
||||
let (hl_client_key, hl_server_key) = generate_keys(config);
|
||||
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
|
||||
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
|
||||
|
||||
let (client_key, server_key): (ClientKey, ServerKey) =
|
||||
gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
|
||||
gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
|
||||
|
||||
let ksk = KeySwitchingKey::new(
|
||||
(&client_key, Some(&server_key)),
|
||||
(&underlying_ck, &underlying_sk),
|
||||
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
|
||||
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
|
||||
);
|
||||
|
||||
let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
use criterion::Criterion;
|
||||
use tfhe::prelude::*;
|
||||
use tfhe::shortint::parameters::current_params::{
|
||||
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
|
||||
V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
|
||||
V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
|
||||
use tfhe::shortint::parameters::v1_2::{
|
||||
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
|
||||
V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
|
||||
V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
|
||||
};
|
||||
use tfhe::shortint::prelude::*;
|
||||
use tfhe::{generate_keys, ConfigBuilder, FheUint64};
|
||||
@@ -11,19 +11,19 @@ use tfhe_trivium::{TransCiphering, TriviumStreamShortint};
|
||||
|
||||
pub fn trivium_shortint_warmup(c: &mut Criterion) {
|
||||
let config = ConfigBuilder::default()
|
||||
.use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
|
||||
.use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
|
||||
.build();
|
||||
let (hl_client_key, hl_server_key) = generate_keys(config);
|
||||
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
|
||||
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
|
||||
|
||||
let (client_key, server_key): (ClientKey, ServerKey) =
|
||||
gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
|
||||
gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
|
||||
|
||||
let ksk = KeySwitchingKey::new(
|
||||
(&client_key, Some(&server_key)),
|
||||
(&underlying_ck, &underlying_sk),
|
||||
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
|
||||
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
|
||||
);
|
||||
|
||||
let key_string = "0053A6F94C9FF24598EB".to_string();
|
||||
@@ -64,19 +64,19 @@ pub fn trivium_shortint_warmup(c: &mut Criterion) {
|
||||
|
||||
pub fn trivium_shortint_gen(c: &mut Criterion) {
|
||||
let config = ConfigBuilder::default()
|
||||
.use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
|
||||
.use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
|
||||
.build();
|
||||
let (hl_client_key, hl_server_key) = generate_keys(config);
|
||||
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
|
||||
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
|
||||
|
||||
let (client_key, server_key): (ClientKey, ServerKey) =
|
||||
gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
|
||||
gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
|
||||
|
||||
let ksk = KeySwitchingKey::new(
|
||||
(&client_key, Some(&server_key)),
|
||||
(&underlying_ck, &underlying_sk),
|
||||
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
|
||||
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
|
||||
);
|
||||
|
||||
let key_string = "0053A6F94C9FF24598EB".to_string();
|
||||
@@ -112,19 +112,19 @@ pub fn trivium_shortint_gen(c: &mut Criterion) {
|
||||
|
||||
pub fn trivium_shortint_trans(c: &mut Criterion) {
|
||||
let config = ConfigBuilder::default()
|
||||
.use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
|
||||
.use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
|
||||
.build();
|
||||
let (hl_client_key, hl_server_key) = generate_keys(config);
|
||||
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
|
||||
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
|
||||
|
||||
let (client_key, server_key): (ClientKey, ServerKey) =
|
||||
gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
|
||||
gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
|
||||
|
||||
let ksk = KeySwitchingKey::new(
|
||||
(&client_key, Some(&server_key)),
|
||||
(&underlying_ck, &underlying_sk),
|
||||
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
|
||||
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
|
||||
);
|
||||
|
||||
let key_string = "0053A6F94C9FF24598EB".to_string();
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
use crate::{KreyviumStream, KreyviumStreamByte, KreyviumStreamShortint, TransCiphering};
|
||||
use tfhe::prelude::*;
|
||||
use tfhe::shortint::parameters::current_params::{
|
||||
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
|
||||
V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
|
||||
V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
|
||||
use tfhe::shortint::parameters::v1_2::{
|
||||
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
|
||||
V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
|
||||
V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
|
||||
};
|
||||
use tfhe::{generate_keys, ConfigBuilder, FheBool, FheUint64, FheUint8};
|
||||
// Values for these tests come from the github repo renaud1239/Kreyvium,
|
||||
@@ -221,19 +221,19 @@ use tfhe::shortint::prelude::*;
|
||||
#[test]
|
||||
fn kreyvium_test_shortint_long() {
|
||||
let config = ConfigBuilder::default()
|
||||
.use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
|
||||
.use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
|
||||
.build();
|
||||
let (hl_client_key, hl_server_key) = generate_keys(config);
|
||||
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
|
||||
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
|
||||
|
||||
let (client_key, server_key): (ClientKey, ServerKey) =
|
||||
gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
|
||||
gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
|
||||
|
||||
let ksk = KeySwitchingKey::new(
|
||||
(&client_key, Some(&server_key)),
|
||||
(&underlying_ck, &underlying_sk),
|
||||
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
|
||||
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
|
||||
);
|
||||
|
||||
let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
use crate::{TransCiphering, TriviumStream, TriviumStreamByte, TriviumStreamShortint};
|
||||
use tfhe::prelude::*;
|
||||
use tfhe::shortint::parameters::current_params::{
|
||||
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
|
||||
V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
|
||||
V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
|
||||
use tfhe::shortint::parameters::v1_2::{
|
||||
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
|
||||
V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
|
||||
V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
|
||||
};
|
||||
use tfhe::{generate_keys, ConfigBuilder, FheBool, FheUint64, FheUint8};
|
||||
// Values for these tests come from the github repo cantora/avr-crypto-lib, commit 2a5b018,
|
||||
@@ -357,19 +357,19 @@ use tfhe::shortint::prelude::*;
|
||||
#[test]
|
||||
fn trivium_test_shortint_long() {
|
||||
let config = ConfigBuilder::default()
|
||||
.use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
|
||||
.use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
|
||||
.build();
|
||||
let (hl_client_key, hl_server_key) = generate_keys(config);
|
||||
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
|
||||
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
|
||||
|
||||
let (client_key, server_key): (ClientKey, ServerKey) =
|
||||
gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
|
||||
gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
|
||||
|
||||
let ksk = KeySwitchingKey::new(
|
||||
(&client_key, Some(&server_key)),
|
||||
(&underlying_ck, &underlying_sk),
|
||||
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
|
||||
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
|
||||
);
|
||||
|
||||
let key_string = "0053A6F94C9FF24598EB".to_string();
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "tfhe-cuda-backend"
|
||||
version = "0.11.0"
|
||||
version = "0.10.0"
|
||||
edition = "2021"
|
||||
authors = ["Zama team"]
|
||||
license = "BSD-3-Clause-Clear"
|
||||
@@ -19,4 +19,3 @@ bindgen = "0.71"
|
||||
[features]
|
||||
experimental-multi-arch = []
|
||||
profile = []
|
||||
debug = []
|
||||
|
||||
@@ -53,11 +53,6 @@ fn main() {
|
||||
cmake_config.define("USE_NVTOOLS", "OFF");
|
||||
}
|
||||
|
||||
if cfg!(feature = "debug") {
|
||||
cmake_config.define("CMAKE_BUILD_TYPE", "DEBUG");
|
||||
cmake_config.define("CMAKE_CXX_FLAGS", "-Wuninitialized -O0");
|
||||
}
|
||||
|
||||
// Build the CMake project
|
||||
let dest = cmake_config.build();
|
||||
println!("cargo:rustc-link-search=native={}", dest.display());
|
||||
|
||||
@@ -52,8 +52,6 @@ endif()
|
||||
|
||||
if(NOT CMAKE_BUILD_TYPE)
|
||||
set(CMAKE_BUILD_TYPE Release)
|
||||
else()
|
||||
message("Building CUDA backend in ${CMAKE_BUILD_TYPE}")
|
||||
endif()
|
||||
|
||||
# Add OpenMP support
|
||||
|
||||
@@ -55,7 +55,6 @@ void *cuda_malloc_with_size_tracking_async(uint64_t size, cudaStream_t stream,
|
||||
void *cuda_malloc_async(uint64_t size, cudaStream_t stream, uint32_t gpu_index);
|
||||
|
||||
bool cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
|
||||
uint64_t cuda_device_total_memory(uint32_t gpu_index);
|
||||
|
||||
void cuda_memcpy_with_size_tracking_async_to_gpu(void *dest, const void *src,
|
||||
uint64_t size,
|
||||
|
||||
@@ -24,15 +24,7 @@ using LweArrayVariant = std::variant<std::vector<Torus *>, Torus *>;
|
||||
return std::get<Torus *>(variant); \
|
||||
} \
|
||||
}()
|
||||
// Macro to define the visitor logic using std::holds_alternative for vectors
|
||||
#define GET_VARIANT_ELEMENT_64BIT(variant, index) \
|
||||
[&] { \
|
||||
if (std::holds_alternative<std::vector<uint64_t *>>(variant)) { \
|
||||
return std::get<std::vector<uint64_t *>>(variant)[index]; \
|
||||
} else { \
|
||||
return std::get<uint64_t *>(variant); \
|
||||
} \
|
||||
}()
|
||||
|
||||
int get_active_gpu_count(int num_inputs, int gpu_count);
|
||||
|
||||
int get_num_inputs_on_gpu(int total_num_inputs, int gpu_index, int gpu_count);
|
||||
|
||||
@@ -395,8 +395,7 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool reduce_degrees_for_single_carry_propagation, bool allocate_gpu_memory,
|
||||
bool allocate_ms_array);
|
||||
bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
|
||||
void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -415,8 +414,7 @@ uint64_t scratch_cuda_integer_scalar_mul_kb_64(
|
||||
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
PBS_TYPE pbs_type, uint32_t num_scalar_bits, bool allocate_gpu_memory,
|
||||
bool allocate_ms_array);
|
||||
PBS_TYPE pbs_type, bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
|
||||
void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -543,100 +541,5 @@ void cleanup_cuda_integer_is_at_least_one_comparisons_block_true(
|
||||
void extend_radix_with_trivial_zero_blocks_msb_64(
|
||||
CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
|
||||
void *const *streams, uint32_t const *gpu_indexes);
|
||||
|
||||
void trim_radix_blocks_lsb_64(CudaRadixCiphertextFFI *output,
|
||||
CudaRadixCiphertextFFI const *input,
|
||||
void *const *streams,
|
||||
uint32_t const *gpu_indexes);
|
||||
|
||||
uint64_t scratch_cuda_apply_noise_squashing_kb(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
int8_t **mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t input_glwe_dimension,
|
||||
uint32_t input_polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_radix_blocks, uint32_t num_original_blocks,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
|
||||
void cuda_apply_noise_squashing_kb(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *output_radix_lwe,
|
||||
CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
|
||||
void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
void *const *bsks);
|
||||
|
||||
void cleanup_cuda_apply_noise_squashing_kb(void *const *streams,
|
||||
uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_cuda_sub_and_propagate_single_carry_kb_64_inplace(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
|
||||
bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
|
||||
void cuda_sub_and_propagate_single_carry_kb_64_inplace(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *lhs_array, const CudaRadixCiphertextFFI *rhs_array,
|
||||
CudaRadixCiphertextFFI *carry_out, const CudaRadixCiphertextFFI *carry_in,
|
||||
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t requested_flag, uint32_t uses_carry);
|
||||
|
||||
void cleanup_cuda_sub_and_propagate_single_carry(void *const *streams,
|
||||
uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_cuda_integer_unsigned_scalar_div_radix_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
PBS_TYPE pbs_type, bool allocate_gpu_memory, bool is_divisor_power_of_two,
|
||||
bool log2_divisor_exceeds_threshold, bool multiplier_exceeds_threshold,
|
||||
uint32_t num_scalar_bits, uint32_t ilog2_divisor, bool allocate_ms_array);
|
||||
|
||||
void cuda_integer_unsigned_scalar_div_radix_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *numerator_ct, int8_t *mem_ptr, void *const *ksks,
|
||||
uint64_t const *decomposed_scalar, uint64_t const *has_at_least_one_set,
|
||||
const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
|
||||
void *const *bsks, uint32_t num_scalars, bool multiplier_exceeds_threshold,
|
||||
bool is_divisor_power_of_two, bool log2_divisor_exceeds_threshold,
|
||||
uint32_t ilog2_divisor, uint64_t shift_pre, uint32_t shift_post,
|
||||
uint64_t rhs);
|
||||
|
||||
void cleanup_cuda_integer_unsigned_scalar_div_radix_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_cuda_extend_radix_with_sign_msb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_blocks, uint32_t num_additional_blocks,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
|
||||
void cuda_extend_radix_with_sign_msb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
|
||||
int8_t *mem_ptr, uint32_t num_additional_blocks, void *const *bsks,
|
||||
void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
|
||||
|
||||
void cleanup_cuda_extend_radix_with_sign_msb_64(void *const *streams,
|
||||
uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count,
|
||||
int8_t **mem_ptr_void);
|
||||
} // extern C
|
||||
#endif // CUDA_INTEGER_H
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,13 +0,0 @@
|
||||
#ifndef CUDA_BOOTSTRAP_128_H
|
||||
#define CUDA_BOOTSTRAP_128_H
|
||||
|
||||
#include "pbs_enums.h"
|
||||
#include <stdint.h>
|
||||
|
||||
uint64_t scratch_cuda_programmable_bootstrap_128_vector_64(
|
||||
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
|
||||
#endif // CUDA_BOOTSTRAP_128_H
|
||||
@@ -240,12 +240,14 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
|
||||
}
|
||||
};
|
||||
|
||||
template <typename InputTorus, PBS_TYPE pbs_type> struct pbs_buffer_128 {
|
||||
template <PBS_TYPE pbs_type> struct pbs_buffer_128;
|
||||
|
||||
template <> struct pbs_buffer_128<PBS_TYPE::CLASSICAL> {
|
||||
int8_t *d_mem;
|
||||
|
||||
__uint128_t *global_accumulator;
|
||||
double *global_join_buffer;
|
||||
InputTorus *temp_lwe_array_in;
|
||||
__uint128_t *temp_lwe_array_in;
|
||||
uint64_t *trivial_indexes;
|
||||
|
||||
PBS_VARIANT pbs_variant;
|
||||
@@ -263,9 +265,11 @@ template <typename InputTorus, PBS_TYPE pbs_type> struct pbs_buffer_128 {
|
||||
this->pbs_variant = pbs_variant;
|
||||
this->uses_noise_reduction = allocate_ms_array;
|
||||
if (allocate_ms_array) {
|
||||
this->temp_lwe_array_in = (InputTorus *)cuda_malloc_async(
|
||||
(lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(InputTorus),
|
||||
stream, gpu_index);
|
||||
this->temp_lwe_array_in =
|
||||
(__uint128_t *)cuda_malloc_with_size_tracking_async(
|
||||
(lwe_dimension + 1) * input_lwe_ciphertext_count *
|
||||
sizeof(__uint128_t),
|
||||
stream, gpu_index, size_tracker, allocate_ms_array);
|
||||
this->trivial_indexes = (uint64_t *)cuda_malloc_with_size_tracking_async(
|
||||
input_lwe_ciphertext_count * sizeof(uint64_t), stream, gpu_index,
|
||||
size_tracker, allocate_ms_array);
|
||||
@@ -521,10 +525,6 @@ bool has_support_to_cuda_programmable_bootstrap_tbc(uint32_t num_samples,
|
||||
uint32_t level_count,
|
||||
uint32_t max_shared_memory);
|
||||
|
||||
bool has_support_to_cuda_programmable_bootstrap_128_cg(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t max_shared_memory);
|
||||
|
||||
#ifdef __CUDACC__
|
||||
__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
|
||||
int glwe_dimension,
|
||||
|
||||
@@ -100,7 +100,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
void const *lut_vector, void const *lwe_array_in,
|
||||
void const *bootstrapping_key,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
void const *ms_noise_reduction_ptr, int8_t *buffer, uint32_t lwe_dimension,
|
||||
void *ms_noise_reduction_ptr, int8_t *buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples);
|
||||
|
||||
|
||||
@@ -112,15 +112,15 @@ template <typename Torus> struct zk_expand_mem {
|
||||
|
||||
// Hint for future readers: if message_modulus == 4 then
|
||||
// packed_messages_per_lwe becomes 2
|
||||
auto num_packed_msgs = log2_int(params.message_modulus);
|
||||
auto packed_messages_per_lwe = log2_int(params.message_modulus);
|
||||
|
||||
// Adjust indexes to permute the output and access the correct LUT
|
||||
auto h_indexes_in = static_cast<Torus *>(
|
||||
malloc(num_packed_msgs * num_lwes * sizeof(Torus)));
|
||||
malloc(packed_messages_per_lwe * num_lwes * sizeof(Torus)));
|
||||
auto h_indexes_out = static_cast<Torus *>(
|
||||
malloc(num_packed_msgs * num_lwes * sizeof(Torus)));
|
||||
malloc(packed_messages_per_lwe * num_lwes * sizeof(Torus)));
|
||||
auto h_lut_indexes = static_cast<Torus *>(
|
||||
malloc(num_packed_msgs * num_lwes * sizeof(Torus)));
|
||||
malloc(packed_messages_per_lwe * num_lwes * sizeof(Torus)));
|
||||
auto h_body_id_per_compact_list =
|
||||
static_cast<uint32_t *>(malloc(num_lwes * sizeof(uint32_t)));
|
||||
auto h_lwe_compact_input_indexes =
|
||||
@@ -138,10 +138,6 @@ template <typename Torus> struct zk_expand_mem {
|
||||
auto compact_list_id = 0;
|
||||
auto idx = 0;
|
||||
auto count = 0;
|
||||
// During flatenning, all num_lwes LWEs from all compact lists are stored
|
||||
// sequentially on a Torus array. h_lwe_compact_input_indexes stores the
|
||||
// index of the first LWE related to the compact list that contains the i-th
|
||||
// LWE
|
||||
for (int i = 0; i < num_lwes; i++) {
|
||||
h_lwe_compact_input_indexes[i] = idx;
|
||||
count++;
|
||||
@@ -152,8 +148,6 @@ template <typename Torus> struct zk_expand_mem {
|
||||
}
|
||||
}
|
||||
|
||||
// Stores the index of the i-th LWE (within each compact list) related to
|
||||
// the k-th compact list.
|
||||
auto offset = 0;
|
||||
for (int k = 0; k < num_compact_lists; k++) {
|
||||
auto num_lwes_in_kth_compact_list = num_lwes_per_compact_list[k];
|
||||
@@ -165,75 +159,46 @@ template <typename Torus> struct zk_expand_mem {
|
||||
offset += num_lwes_in_kth_compact_list;
|
||||
}
|
||||
|
||||
/*
|
||||
* Each LWE contains encrypted data in both carry and message spaces
|
||||
* that needs to be extracted.
|
||||
*
|
||||
* The loop processes each compact list (k) and for each LWE within that
|
||||
* list:
|
||||
* 1. Sets input indexes to read each LWE twice (for carry and message
|
||||
* extraction)
|
||||
* 2. Creates output indexes to properly reorder the results
|
||||
* 3. Selects appropriate LUT index based on whether boolean sanitization is
|
||||
* needed
|
||||
*
|
||||
* We want the output to have always first the content of the message part
|
||||
* and then the content of the carry part of each LWE.
|
||||
*
|
||||
* i.e. msg_extract(LWE_0), carry_extract(LWE_0), msg_extract(LWE_1),
|
||||
* carry_extract(LWE_1), ...
|
||||
*
|
||||
* Aiming that behavior, with 4 LWEs we would have:
|
||||
*
|
||||
* // Each LWE is processed twice
|
||||
* h_indexes_in = {0, 1, 2, 3, 0, 1, 2, 3}
|
||||
*
|
||||
* // First 4 use message LUT, last 4 use carry LUT
|
||||
* h_lut_indexes = {0, 0, 0, 0, 1, 1, 1, 1}
|
||||
*
|
||||
* // Reorders output so message and carry for each LWE appear together
|
||||
* h_indexes_out = {0, 2, 4, 6, 1, 3, 5, 7}
|
||||
*
|
||||
* If an LWE contains a boolean value, its LUT index is shifted by
|
||||
* num_packed_msgs to use the sanitization LUT (which ensures output is
|
||||
* exactly 0 or 1).
|
||||
*/
|
||||
offset = 0;
|
||||
for (int k = 0; k < num_compact_lists; k++) {
|
||||
auto num_lwes_in_kth = num_lwes_per_compact_list[k];
|
||||
for (int i = 0; i < num_packed_msgs * num_lwes_in_kth; i++) {
|
||||
auto lwe_index = i + num_packed_msgs * offset;
|
||||
auto lwe_index_in_list = i % num_lwes_in_kth;
|
||||
h_indexes_in[lwe_index] = lwe_index_in_list + offset;
|
||||
h_indexes_out[lwe_index] =
|
||||
num_packed_msgs * h_indexes_in[lwe_index] + i / num_lwes_in_kth;
|
||||
auto num_lwes_in_kth_compact_list = num_lwes_per_compact_list[k];
|
||||
for (int i = 0;
|
||||
i < packed_messages_per_lwe * num_lwes_in_kth_compact_list; i++) {
|
||||
Torus j = i % num_lwes_in_kth_compact_list;
|
||||
h_indexes_in[i + packed_messages_per_lwe * offset] = j + offset;
|
||||
h_indexes_out[i + packed_messages_per_lwe * offset] =
|
||||
packed_messages_per_lwe * (j + offset) +
|
||||
(i / num_lwes_in_kth_compact_list);
|
||||
// If the input relates to a boolean, shift the LUT so the correct one
|
||||
// with sanitization is used
|
||||
auto boolean_offset =
|
||||
is_boolean_array[h_indexes_out[lwe_index]] ? num_packed_msgs : 0;
|
||||
h_lut_indexes[lwe_index] = i / num_lwes_in_kth + boolean_offset;
|
||||
h_lut_indexes[i + packed_messages_per_lwe * offset] =
|
||||
(is_boolean_array[h_indexes_out[i +
|
||||
packed_messages_per_lwe * offset]]
|
||||
? packed_messages_per_lwe
|
||||
: 0) +
|
||||
i / num_lwes_in_kth_compact_list;
|
||||
}
|
||||
offset += num_lwes_in_kth;
|
||||
offset += num_lwes_in_kth_compact_list;
|
||||
}
|
||||
|
||||
message_and_carry_extract_luts->set_lwe_indexes(
|
||||
streams[0], gpu_indexes[0], h_indexes_in, h_indexes_out);
|
||||
auto lut_indexes = message_and_carry_extract_luts->get_lut_indexes(0, 0);
|
||||
message_and_carry_extract_luts->broadcast_lut(streams, gpu_indexes, 0);
|
||||
|
||||
cuda_memcpy_with_size_tracking_async_to_gpu(
|
||||
d_lwe_compact_input_indexes, h_lwe_compact_input_indexes,
|
||||
num_lwes * sizeof(uint32_t), streams[0], gpu_indexes[0],
|
||||
allocate_gpu_memory);
|
||||
cuda_memcpy_with_size_tracking_async_to_gpu(
|
||||
lut_indexes, h_lut_indexes, num_packed_msgs * num_lwes * sizeof(Torus),
|
||||
streams[0], gpu_indexes[0], allocate_gpu_memory);
|
||||
lut_indexes, h_lut_indexes,
|
||||
packed_messages_per_lwe * num_lwes * sizeof(Torus), streams[0],
|
||||
gpu_indexes[0], allocate_gpu_memory);
|
||||
cuda_memcpy_with_size_tracking_async_to_gpu(
|
||||
d_body_id_per_compact_list, h_body_id_per_compact_list,
|
||||
num_lwes * sizeof(uint32_t), streams[0], gpu_indexes[0],
|
||||
allocate_gpu_memory);
|
||||
|
||||
message_and_carry_extract_luts->broadcast_lut(streams, gpu_indexes, 0);
|
||||
|
||||
// The expanded LWEs will always be on the casting key format
|
||||
tmp_expanded_lwes = (Torus *)cuda_malloc_with_size_tracking_async(
|
||||
num_lwes * (casting_params.big_lwe_dimension + 1) * sizeof(Torus),
|
||||
|
||||
@@ -84,8 +84,6 @@ void cuda_modulus_switch_inplace_64(void *stream, uint32_t gpu_index,
|
||||
static_cast<uint64_t *>(lwe_array_out), size, log_modulus);
|
||||
}
|
||||
|
||||
// This end point is used only for testing purposes
|
||||
// its output always follows trivial ordering
|
||||
void cuda_improve_noise_modulus_switch_64(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
void const *lwe_array_in, void const *lwe_array_indexes,
|
||||
|
||||
@@ -38,16 +38,6 @@ __device__ Torus *get_ith_block(Torus *ksk, int i, int level,
|
||||
// Each thread in x are used to calculate one output.
|
||||
// threads in y are used to paralelize the lwe_dimension_in loop.
|
||||
// shared memory is used to store intermediate results of the reduction.
|
||||
// Note: To reduce register pressure we have slightly changed the algorithm,
|
||||
// the idea consists in calculating the negate value of the output. So, instead
|
||||
// of accumulating subtractions using -=, we accumulate additions using += in
|
||||
// the local_lwe_out. This seems to work better cause profits madd ops and save
|
||||
// some regs. For this to work, we need to negate the input
|
||||
// lwe_array_in[lwe_dimension_in], and negate back the output at the end to get
|
||||
// the correct results. Additionally, we split the calculation of the ksk offset
|
||||
// in two parts, a constant part is calculated before the loop, and a variable
|
||||
// part is calculated inside the loop. This seems to help with the register
|
||||
// pressure as well.
|
||||
template <typename Torus>
|
||||
__global__ void
|
||||
keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
|
||||
@@ -70,7 +60,7 @@ keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
|
||||
lwe_array_in, lwe_input_indexes[blockIdx.x], lwe_dimension_in + 1);
|
||||
|
||||
if (tid == lwe_dimension_out && threadIdx.y == 0) {
|
||||
local_lwe_out = -block_lwe_array_in[lwe_dimension_in];
|
||||
local_lwe_out = block_lwe_array_in[lwe_dimension_in];
|
||||
}
|
||||
const Torus mask_mod_b = (1ll << base_log) - 1ll;
|
||||
|
||||
@@ -83,12 +73,12 @@ keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
|
||||
for (int i = start_i; i < end_i; i++) {
|
||||
Torus state =
|
||||
init_decomposer_state(block_lwe_array_in[i], base_log, level_count);
|
||||
uint32_t offset = i * level_count * (lwe_dimension_out + 1);
|
||||
for (int j = 0; j < level_count; j++) {
|
||||
|
||||
for (int j = 0; j < level_count; j++) {
|
||||
auto ksk_block =
|
||||
get_ith_block(ksk, i, j, lwe_dimension_out, level_count);
|
||||
Torus decomposed = decompose_one<Torus>(state, mask_mod_b, base_log);
|
||||
local_lwe_out +=
|
||||
(Torus)ksk[tid + j * (lwe_dimension_out + 1) + offset] * decomposed;
|
||||
local_lwe_out -= (Torus)ksk_block[tid] * decomposed;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -103,7 +93,7 @@ keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
|
||||
lwe_acc_out[shmem_index + offset * blockDim.x];
|
||||
}
|
||||
if (threadIdx.y == 0)
|
||||
block_lwe_array_out[tid] = -lwe_acc_out[shmem_index];
|
||||
block_lwe_array_out[tid] = lwe_acc_out[shmem_index];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -182,14 +172,14 @@ __host__ uint64_t scratch_packing_keyswitch_lwe_list_to_glwe(
|
||||
|
||||
// allocate at least LWE-mask times two: to keep both decomposition state and
|
||||
// decomposed intermediate value
|
||||
uint64_t memory_unit = glwe_accumulator_size > lwe_dimension * 2
|
||||
? glwe_accumulator_size
|
||||
: lwe_dimension * 2;
|
||||
int memory_unit = glwe_accumulator_size > lwe_dimension * 2
|
||||
? glwe_accumulator_size
|
||||
: lwe_dimension * 2;
|
||||
|
||||
uint64_t size_tracker = 0;
|
||||
uint64_t buffer_size = 2 * num_lwes * memory_unit * sizeof(Torus);
|
||||
uint64_t size_tracker;
|
||||
*fp_ks_buffer = (int8_t *)cuda_malloc_with_size_tracking_async(
|
||||
buffer_size, stream, gpu_index, &size_tracker, allocate_gpu_memory);
|
||||
2 * num_lwes * memory_unit * sizeof(Torus), stream, gpu_index,
|
||||
&size_tracker, allocate_gpu_memory);
|
||||
return size_tracker;
|
||||
}
|
||||
|
||||
|
||||
@@ -178,12 +178,10 @@ __device__ __forceinline__ double measure_modulus_switch_noise(
|
||||
|
||||
// Each thread processes two elements of the lwe array
|
||||
template <typename Torus>
|
||||
__global__ void __launch_bounds__(512)
|
||||
improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
|
||||
const uint64_t *indexes, const Torus *zeros,
|
||||
int lwe_size, int num_zeros,
|
||||
double input_variance, double r_sigma,
|
||||
double bound, uint32_t log_modulus) {
|
||||
__global__ void improve_noise_modulus_switch(
|
||||
Torus *array_out, const Torus *array_in, const uint64_t *indexes,
|
||||
const Torus *zeros, int lwe_size, int num_zeros, double input_variance,
|
||||
double r_sigma, double bound, uint32_t log_modulus) {
|
||||
|
||||
// First we will assume size is less than the number of threads per block
|
||||
// I should switch this to dynamic shared memory
|
||||
@@ -200,8 +198,7 @@ __global__ void __launch_bounds__(512)
|
||||
sum_mask_errors[threadIdx.x] = 0.f;
|
||||
sum_squared_mask_errors[threadIdx.x] = 0.f;
|
||||
auto this_block_lwe_in = array_in + indexes[blockIdx.x] * lwe_size;
|
||||
// We use modulus switch to gather the output in trivial order
|
||||
auto this_block_lwe_out = array_out + blockIdx.x * lwe_size;
|
||||
auto this_block_lwe_out = array_out + indexes[blockIdx.x] * lwe_size;
|
||||
Torus input_element1 = this_block_lwe_in[threadIdx.x];
|
||||
|
||||
Torus input_element2 = threadIdx.x + blockDim.x < lwe_size
|
||||
|
||||
@@ -122,13 +122,6 @@ bool cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index) {
|
||||
}
|
||||
}
|
||||
|
||||
uint64_t cuda_device_total_memory(uint32_t gpu_index) {
|
||||
cuda_set_device(gpu_index);
|
||||
size_t total_mem = 0, free_mem = 0;
|
||||
check_cuda_error(cudaMemGetInfo(&free_mem, &total_mem));
|
||||
return total_mem;
|
||||
}
|
||||
|
||||
/// Returns
|
||||
/// false if Cooperative Groups is not supported.
|
||||
/// true otherwise
|
||||
|
||||
@@ -62,8 +62,8 @@ void update_degrees_after_bitor(uint64_t *output_degrees,
|
||||
auto min = std::min(lwe_array_1_degrees[i], lwe_array_2_degrees[i]);
|
||||
auto result = max;
|
||||
|
||||
for (uint j = 0; j < min + 1; j++) {
|
||||
if ((max | j) > result) {
|
||||
for (uint64_t j = 0; j < min + 1; j++) {
|
||||
if (max | j > result) {
|
||||
result = max | j;
|
||||
}
|
||||
}
|
||||
@@ -79,13 +79,16 @@ void update_degrees_after_bitxor(uint64_t *output_degrees,
|
||||
auto max = std::max(lwe_array_1_degrees[i], lwe_array_2_degrees[i]);
|
||||
auto min = std::min(lwe_array_1_degrees[i], lwe_array_2_degrees[i]);
|
||||
auto result = max;
|
||||
printf("max %lu, min %lu, result %d\n", max, min, result);
|
||||
|
||||
// Try every possibility to find the worst case
|
||||
for (uint j = 0; j < min + 1; j++) {
|
||||
for (uint64_t j = 0; j < min + 1; j++) {
|
||||
printf("j %lu, max ^ j %lu \n", j, max ^ j);
|
||||
if ((max ^ j) > result) {
|
||||
result = max ^ j;
|
||||
}
|
||||
}
|
||||
output_degrees[i] = result;
|
||||
printf("output degree %lu\n", result);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,60 +3,7 @@
|
||||
void extend_radix_with_trivial_zero_blocks_msb_64(
|
||||
CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
|
||||
void *const *streams, uint32_t const *gpu_indexes) {
|
||||
|
||||
host_extend_radix_with_trivial_zero_blocks_msb<uint64_t>(
|
||||
output, input, (cudaStream_t *)streams, gpu_indexes);
|
||||
}
|
||||
|
||||
void trim_radix_blocks_lsb_64(CudaRadixCiphertextFFI *output,
|
||||
CudaRadixCiphertextFFI const *input,
|
||||
void *const *streams,
|
||||
uint32_t const *gpu_indexes) {
|
||||
|
||||
host_trim_radix_blocks_lsb<uint64_t>(output, input, (cudaStream_t *)streams,
|
||||
gpu_indexes);
|
||||
}
|
||||
|
||||
uint64_t scratch_cuda_extend_radix_with_sign_msb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_blocks, uint32_t num_additional_blocks,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory, bool allocate_ms_array) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
glwe_dimension * polynomial_size, lwe_dimension,
|
||||
ks_level, ks_base_log, pbs_level, pbs_base_log,
|
||||
grouping_factor, message_modulus, carry_modulus,
|
||||
allocate_ms_array);
|
||||
|
||||
return scratch_extend_radix_with_sign_msb<uint64_t>(
|
||||
(cudaStream_t *)streams, gpu_indexes, gpu_count,
|
||||
(int_extend_radix_with_sign_msb_buffer<uint64_t> **)mem_ptr, params,
|
||||
num_blocks, num_additional_blocks, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cuda_extend_radix_with_sign_msb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
|
||||
int8_t *mem_ptr, uint32_t num_additional_blocks, void *const *bsks,
|
||||
void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
|
||||
|
||||
host_extend_radix_with_sign_msb<uint64_t>(
|
||||
(cudaStream_t *)streams, gpu_indexes, gpu_count, output, input,
|
||||
(int_extend_radix_with_sign_msb_buffer<uint64_t> *)mem_ptr,
|
||||
num_additional_blocks, bsks, (uint64_t **)ksks, ms_noise_reduction_key);
|
||||
}
|
||||
|
||||
void cleanup_cuda_extend_radix_with_sign_msb_64(void *const *streams,
|
||||
uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count,
|
||||
int8_t **mem_ptr_void) {
|
||||
|
||||
int_extend_radix_with_sign_msb_buffer<uint64_t> *mem_ptr =
|
||||
(int_extend_radix_with_sign_msb_buffer<uint64_t> *)(*mem_ptr_void);
|
||||
|
||||
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
|
||||
}
|
||||
|
||||
@@ -9,86 +9,10 @@ template <typename Torus>
|
||||
__host__ void host_extend_radix_with_trivial_zero_blocks_msb(
|
||||
CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes) {
|
||||
|
||||
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], output,
|
||||
0, input->num_radix_blocks, input, 0,
|
||||
input->num_radix_blocks);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_trim_radix_blocks_lsb(CudaRadixCiphertextFFI *output,
|
||||
CudaRadixCiphertextFFI const *input,
|
||||
cudaStream_t const *streams,
|
||||
uint32_t const *gpu_indexes) {
|
||||
|
||||
const uint32_t input_start_lwe_index =
|
||||
input->num_radix_blocks - output->num_radix_blocks;
|
||||
|
||||
if (input->num_radix_blocks <= output->num_radix_blocks) {
|
||||
PANIC("Cuda error: input num blocks should be greater than output num "
|
||||
"blocks");
|
||||
}
|
||||
|
||||
copy_radix_ciphertext_slice_async<Torus>(
|
||||
streams[0], gpu_indexes[0], output, 0, output->num_radix_blocks, input,
|
||||
input_start_lwe_index, input->num_radix_blocks);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ uint64_t scratch_extend_radix_with_sign_msb(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, int_extend_radix_with_sign_msb_buffer<Torus> **mem_ptr,
|
||||
const int_radix_params params, uint32_t num_radix_blocks,
|
||||
uint32_t num_additional_blocks, const bool allocate_gpu_memory) {
|
||||
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
*mem_ptr = new int_extend_radix_with_sign_msb_buffer<Torus>(
|
||||
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
|
||||
num_additional_blocks, allocate_gpu_memory, &size_tracker);
|
||||
|
||||
return size_tracker;
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_extend_radix_with_sign_msb(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *output,
|
||||
CudaRadixCiphertextFFI const *input,
|
||||
int_extend_radix_with_sign_msb_buffer<Torus> *mem_ptr,
|
||||
uint32_t num_additional_blocks, void *const *bsks, Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
|
||||
|
||||
if (num_additional_blocks == 0) {
|
||||
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], output,
|
||||
input);
|
||||
return;
|
||||
}
|
||||
|
||||
const uint32_t input_blocks = input->num_radix_blocks;
|
||||
|
||||
if (input_blocks == 0) {
|
||||
PANIC("Cuda error: input blocks cannot be zero");
|
||||
}
|
||||
|
||||
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], output,
|
||||
0, input_blocks, input, 0,
|
||||
input_blocks);
|
||||
|
||||
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
|
||||
mem_ptr->last_block, 0, 1, input,
|
||||
input_blocks - 1, input_blocks);
|
||||
|
||||
host_apply_univariate_lut_kb(
|
||||
streams, gpu_indexes, gpu_count, mem_ptr->padding_block,
|
||||
mem_ptr->last_block, mem_ptr->lut, ksks, ms_noise_reduction_key, bsks);
|
||||
|
||||
for (uint32_t i = 0; i < num_additional_blocks; ++i) {
|
||||
uint32_t dst_block_idx = input_blocks + i;
|
||||
|
||||
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], output,
|
||||
dst_block_idx, dst_block_idx + 1,
|
||||
mem_ptr->padding_block, 0, 1);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -386,69 +386,3 @@ void reverseArray(uint64_t arr[], size_t n) {
|
||||
end--;
|
||||
}
|
||||
}
|
||||
|
||||
uint64_t scratch_cuda_apply_noise_squashing_mem(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
int_radix_params params, int_noise_squashing_lut<uint64_t> **mem_ptr,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t num_radix_blocks, uint32_t original_num_blocks,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
uint64_t size_tracker = 0;
|
||||
*mem_ptr = new int_noise_squashing_lut<uint64_t>(
|
||||
(cudaStream_t *)streams, gpu_indexes, gpu_count, params, glwe_dimension,
|
||||
polynomial_size, num_radix_blocks, original_num_blocks,
|
||||
allocate_gpu_memory, &size_tracker);
|
||||
return size_tracker;
|
||||
}
|
||||
|
||||
uint64_t scratch_cuda_apply_noise_squashing_kb(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
int8_t **mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t input_glwe_dimension,
|
||||
uint32_t input_polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_radix_blocks, uint32_t original_num_blocks,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory, bool allocate_ms_array) {
|
||||
PUSH_RANGE("scratch noise squashing")
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
glwe_dimension * polynomial_size, lwe_dimension,
|
||||
ks_level, ks_base_log, pbs_level, pbs_base_log,
|
||||
grouping_factor, message_modulus, carry_modulus,
|
||||
allocate_ms_array);
|
||||
|
||||
return scratch_cuda_apply_noise_squashing_mem(
|
||||
streams, gpu_indexes, gpu_count, params,
|
||||
(int_noise_squashing_lut<uint64_t> **)mem_ptr, input_glwe_dimension,
|
||||
input_polynomial_size, num_radix_blocks, original_num_blocks,
|
||||
allocate_gpu_memory);
|
||||
POP_RANGE()
|
||||
}
|
||||
|
||||
void cuda_apply_noise_squashing_kb(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *output_radix_lwe,
|
||||
CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
|
||||
void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
void *const *bsks) {
|
||||
|
||||
PUSH_RANGE("apply noise squashing")
|
||||
integer_radix_apply_noise_squashing_kb<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, output_radix_lwe,
|
||||
input_radix_lwe, (int_noise_squashing_lut<uint64_t> *)mem_ptr, bsks,
|
||||
(uint64_t **)ksks, ms_noise_reduction_key);
|
||||
POP_RANGE()
|
||||
}
|
||||
|
||||
void cleanup_cuda_apply_noise_squashing_kb(void *const *streams,
|
||||
uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count,
|
||||
int8_t **mem_ptr_void) {
|
||||
PUSH_RANGE("cleanup noise squashing")
|
||||
int_noise_squashing_lut<uint64_t> *mem_ptr =
|
||||
(int_noise_squashing_lut<uint64_t> *)(*mem_ptr_void);
|
||||
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
|
||||
POP_RANGE()
|
||||
}
|
||||
|
||||
@@ -9,7 +9,6 @@
|
||||
#include "linear_algebra.h"
|
||||
#include "linearalgebra/addition.cuh"
|
||||
#include "linearalgebra/negation.cuh"
|
||||
#include "pbs/pbs_128_utilities.h"
|
||||
#include "pbs/programmable_bootstrap.h"
|
||||
#include "polynomial/functions.cuh"
|
||||
#include "utils/helper.cuh"
|
||||
@@ -521,7 +520,8 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
|
||||
if (num_radix_blocks > lut->num_blocks)
|
||||
PANIC("Cuda error: num radix blocks on which lut is applied should be "
|
||||
"smaller or equal to the number of lut radix blocks")
|
||||
if (num_radix_blocks > lwe_array_out->num_radix_blocks)
|
||||
if (num_radix_blocks > lwe_array_out->num_radix_blocks ||
|
||||
num_radix_blocks > lwe_array_in->num_radix_blocks)
|
||||
PANIC("Cuda error: num radix blocks on which lut is applied should be "
|
||||
"smaller or equal to the number of input & output radix blocks")
|
||||
|
||||
@@ -866,7 +866,7 @@ uint64_t generate_lookup_table_with_encoding(
|
||||
memset(acc, 0, glwe_dimension * polynomial_size * sizeof(Torus));
|
||||
|
||||
auto body = &acc[glwe_dimension * polynomial_size];
|
||||
Torus degree = 0;
|
||||
uint64_t degree = 0;
|
||||
|
||||
// This accumulator extracts the carry bits
|
||||
for (int i = 0; i < input_modulus_sup; i++) {
|
||||
@@ -886,7 +886,7 @@ uint64_t generate_lookup_table_with_encoding(
|
||||
}
|
||||
|
||||
rotate_left<Torus>(body, half_box_size, polynomial_size);
|
||||
return (uint64_t)degree;
|
||||
return degree;
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
@@ -1834,6 +1834,9 @@ void host_propagate_single_carry(
|
||||
PUSH_RANGE("propagate sc")
|
||||
auto num_radix_blocks = lwe_array->num_radix_blocks;
|
||||
auto params = mem->params;
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
auto polynomial_size = params.polynomial_size;
|
||||
uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1;
|
||||
auto lut_stride = mem->lut_stride;
|
||||
auto num_many_lut = mem->num_many_lut;
|
||||
CudaRadixCiphertextFFI output_flag;
|
||||
@@ -1849,7 +1852,6 @@ void host_propagate_single_carry(
|
||||
host_addition<Torus>(streams[0], gpu_indexes[0], lwe_array, lwe_array,
|
||||
input_carries, 1);
|
||||
}
|
||||
|
||||
// Step 1
|
||||
host_compute_shifted_blocks_and_states<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array, mem->shifted_blocks_state_mem,
|
||||
@@ -1935,26 +1937,15 @@ void host_add_and_propagate_single_carry(
|
||||
PUSH_RANGE("add & propagate sc")
|
||||
if (lhs_array->num_radix_blocks != rhs_array->num_radix_blocks)
|
||||
PANIC("Cuda error: input and output num radix blocks must be the same")
|
||||
|
||||
// Check input carries if used
|
||||
if (uses_carry == 1) {
|
||||
if (input_carries == nullptr)
|
||||
PANIC("Cuda error: if uses_carry is enabled, input_carries cannot be a "
|
||||
"null pointer");
|
||||
if (lhs_array->lwe_dimension != input_carries->lwe_dimension)
|
||||
PANIC(
|
||||
"Cuda error: input and input_carries lwe dimension must be the same");
|
||||
}
|
||||
|
||||
// Allow nullptr for carry_out if FLAG_NONE is requested
|
||||
if (lhs_array->lwe_dimension != rhs_array->lwe_dimension ||
|
||||
lhs_array->lwe_dimension != input_carries->lwe_dimension ||
|
||||
lhs_array->lwe_dimension != carry_out->lwe_dimension)
|
||||
PANIC("Cuda error: input and output lwe dimension must be the same")
|
||||
if ((requested_flag == outputFlag::FLAG_OVERFLOW ||
|
||||
requested_flag == outputFlag::FLAG_CARRY)) {
|
||||
if (carry_out == nullptr)
|
||||
PANIC("Cuda error: when requesting FLAG_CARRY or FLAG_OVERFLOW, "
|
||||
"carry_out must be a valid pointer")
|
||||
if (lhs_array->lwe_dimension != carry_out->lwe_dimension)
|
||||
PANIC("Cuda error: input and carry_out lwe dimension must be the same")
|
||||
}
|
||||
requested_flag == outputFlag::FLAG_CARRY) &&
|
||||
carry_out == nullptr)
|
||||
PANIC("Cuda error: when requesting FLAG_CARRY, carry_out must be a valid "
|
||||
"pointer")
|
||||
|
||||
auto num_radix_blocks = lhs_array->num_radix_blocks;
|
||||
auto params = mem->params;
|
||||
@@ -2058,7 +2049,6 @@ void host_add_and_propagate_single_carry(
|
||||
copy_radix_ciphertext_slice_async<Torus>(
|
||||
streams[0], gpu_indexes[0], lhs_array, 0, num_radix_blocks,
|
||||
mem->output_flag, 0, num_radix_blocks);
|
||||
|
||||
copy_radix_ciphertext_slice_async<Torus>(
|
||||
streams[0], gpu_indexes[0], carry_out, 0, 1, mem->output_flag,
|
||||
num_radix_blocks, num_radix_blocks + 1);
|
||||
@@ -2210,110 +2200,4 @@ void host_single_borrow_propagate(
|
||||
}
|
||||
}
|
||||
|
||||
/// num_radix_blocks corresponds to the number of blocks on which to apply the
|
||||
/// LUT In scalar bitops we use a number of blocks that may be lower or equal to
|
||||
/// the input and output numbers of blocks
|
||||
template <typename InputTorus>
|
||||
__host__ void integer_radix_apply_noise_squashing_kb(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in,
|
||||
int_noise_squashing_lut<InputTorus> *lut, void *const *bsks,
|
||||
InputTorus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
|
||||
|
||||
PUSH_RANGE("apply noise squashing")
|
||||
auto params = lut->params;
|
||||
auto pbs_type = params.pbs_type;
|
||||
auto big_lwe_dimension = params.big_lwe_dimension;
|
||||
auto small_lwe_dimension = params.small_lwe_dimension;
|
||||
auto ks_level = params.ks_level;
|
||||
auto ks_base_log = params.ks_base_log;
|
||||
auto pbs_level = params.pbs_level;
|
||||
auto pbs_base_log = params.pbs_base_log;
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
auto polynomial_size = params.polynomial_size;
|
||||
auto grouping_factor = params.grouping_factor;
|
||||
|
||||
if (lwe_array_out->num_radix_blocks !=
|
||||
(lwe_array_in->num_radix_blocks + 1) / 2)
|
||||
PANIC("Cuda error: num output radix blocks should be "
|
||||
"half ceil the number input radix blocks")
|
||||
|
||||
/// For multi GPU execution we create vectors of pointers for inputs and
|
||||
/// outputs
|
||||
auto lwe_array_pbs_in = lut->tmp_lwe_before_ks;
|
||||
std::vector<InputTorus *> lwe_array_in_vec = lut->lwe_array_in_vec;
|
||||
std::vector<InputTorus *> lwe_after_ks_vec = lut->lwe_after_ks_vec;
|
||||
std::vector<__uint128_t *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
|
||||
std::vector<InputTorus *> lwe_trivial_indexes_vec =
|
||||
lut->lwe_trivial_indexes_vec;
|
||||
|
||||
// We know carry is empty so we can pack two blocks in one
|
||||
pack_blocks<InputTorus>(streams[0], gpu_indexes[0], lwe_array_pbs_in,
|
||||
lwe_array_in, lwe_array_in->num_radix_blocks,
|
||||
params.message_modulus);
|
||||
|
||||
// Since the radix ciphertexts are packed, we have to use the num_radix_blocks
|
||||
// from the output ct
|
||||
auto active_gpu_count =
|
||||
get_active_gpu_count(lwe_array_out->num_radix_blocks, gpu_count);
|
||||
if (active_gpu_count == 1) {
|
||||
execute_keyswitch_async<InputTorus>(
|
||||
streams, gpu_indexes, 1, lwe_after_ks_vec[0],
|
||||
lwe_trivial_indexes_vec[0], (InputTorus *)lwe_array_pbs_in->ptr,
|
||||
lut->lwe_indexes_in, ksks, lut->input_big_lwe_dimension,
|
||||
small_lwe_dimension, ks_base_log, ks_level,
|
||||
lwe_array_out->num_radix_blocks);
|
||||
|
||||
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
|
||||
/// dimension to a big LWE dimension
|
||||
execute_pbs_128_async<__uint128_t>(
|
||||
streams, gpu_indexes, 1, (__uint128_t *)lwe_array_out->ptr,
|
||||
lut->lut_vec, lwe_after_ks_vec[0], bsks, ms_noise_reduction_key,
|
||||
lut->pbs_buffer, small_lwe_dimension, glwe_dimension, polynomial_size,
|
||||
pbs_base_log, pbs_level, lwe_array_out->num_radix_blocks);
|
||||
} else {
|
||||
/// Make sure all data that should be on GPU 0 is indeed there
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
|
||||
/// With multiple GPUs we push to the vectors on each GPU then when we
|
||||
/// gather data to GPU 0 we can copy back to the original indexing
|
||||
multi_gpu_scatter_lwe_async<InputTorus>(
|
||||
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
|
||||
(InputTorus *)lwe_array_pbs_in->ptr, lut->h_lwe_indexes_in,
|
||||
lut->using_trivial_lwe_indexes, lwe_array_out->num_radix_blocks,
|
||||
lut->input_big_lwe_dimension + 1);
|
||||
|
||||
execute_keyswitch_async<InputTorus>(
|
||||
streams, gpu_indexes, active_gpu_count, lwe_after_ks_vec,
|
||||
lwe_trivial_indexes_vec, lwe_array_in_vec, lwe_trivial_indexes_vec,
|
||||
ksks, lut->input_big_lwe_dimension, small_lwe_dimension, ks_base_log,
|
||||
ks_level, lwe_array_out->num_radix_blocks);
|
||||
|
||||
execute_pbs_128_async<__uint128_t>(
|
||||
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec, lut->lut_vec,
|
||||
lwe_after_ks_vec, bsks, ms_noise_reduction_key, lut->pbs_buffer,
|
||||
small_lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
|
||||
pbs_level, lwe_array_out->num_radix_blocks);
|
||||
|
||||
/// Copy data back to GPU 0 and release vecs
|
||||
multi_gpu_gather_lwe_async<__uint128_t>(
|
||||
streams, gpu_indexes, active_gpu_count,
|
||||
(__uint128_t *)lwe_array_out->ptr, lwe_after_pbs_vec,
|
||||
(__uint128_t *)lut->h_lwe_indexes_out, lut->using_trivial_lwe_indexes,
|
||||
lwe_array_out->num_radix_blocks, big_lwe_dimension + 1);
|
||||
|
||||
/// Synchronize all GPUs
|
||||
for (uint i = 0; i < active_gpu_count; i++) {
|
||||
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
|
||||
}
|
||||
}
|
||||
for (uint i = 0; i < lut->num_blocks; i++) {
|
||||
lwe_array_out->degrees[i] = lut->degrees[0];
|
||||
lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
|
||||
}
|
||||
POP_RANGE()
|
||||
}
|
||||
|
||||
#endif // TFHE_RS_INTERNAL_INTEGER_CUH
|
||||
|
||||
@@ -210,8 +210,7 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool reduce_degrees_for_single_carry_propagation, bool allocate_gpu_memory,
|
||||
bool allocate_ms_array) {
|
||||
bool allocate_gpu_memory, bool allocate_ms_array) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
glwe_dimension * polynomial_size, lwe_dimension,
|
||||
@@ -221,8 +220,7 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
return scratch_cuda_integer_partial_sum_ciphertexts_vec_kb<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
(int_sum_ciphertexts_vec_memory<uint64_t> **)mem_ptr, num_blocks_in_radix,
|
||||
max_num_radix_in_vec, reduce_degrees_for_single_carry_propagation, params,
|
||||
allocate_gpu_memory);
|
||||
max_num_radix_in_vec, params, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
@@ -236,13 +234,19 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
if (radix_lwe_vec->num_radix_blocks % radix_lwe_out->num_radix_blocks != 0)
|
||||
PANIC("Cuda error: input vector length should be a multiple of the "
|
||||
"output's number of radix blocks")
|
||||
// FIXME: this should not be necessary, we should make sure sum_ctxt works in
|
||||
// the general case
|
||||
for (int i = 0; i < radix_lwe_vec->num_radix_blocks; i++) {
|
||||
radix_lwe_vec->degrees[i] = mem->params.message_modulus - 1;
|
||||
}
|
||||
switch (mem->params.polynomial_size) {
|
||||
case 512:
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<512>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
|
||||
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
|
||||
radix_lwe_out->num_radix_blocks,
|
||||
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
|
||||
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
|
||||
nullptr);
|
||||
break;
|
||||
case 1024:
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
|
||||
@@ -250,7 +254,8 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
|
||||
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
|
||||
radix_lwe_out->num_radix_blocks,
|
||||
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
|
||||
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
|
||||
nullptr);
|
||||
break;
|
||||
case 2048:
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
|
||||
@@ -258,7 +263,8 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
|
||||
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
|
||||
radix_lwe_out->num_radix_blocks,
|
||||
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
|
||||
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
|
||||
nullptr);
|
||||
break;
|
||||
case 4096:
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
|
||||
@@ -266,7 +272,8 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
|
||||
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
|
||||
radix_lwe_out->num_radix_blocks,
|
||||
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
|
||||
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
|
||||
nullptr);
|
||||
break;
|
||||
case 8192:
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
|
||||
@@ -274,7 +281,8 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
|
||||
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
|
||||
radix_lwe_out->num_radix_blocks,
|
||||
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
|
||||
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
|
||||
nullptr);
|
||||
break;
|
||||
case 16384:
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
|
||||
@@ -282,7 +290,8 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
|
||||
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
|
||||
radix_lwe_out->num_radix_blocks,
|
||||
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
|
||||
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
|
||||
nullptr);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
|
||||
|
||||
@@ -24,6 +24,24 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
template <typename Torus>
|
||||
__global__ void smart_copy(Torus *dst, Torus *src, int32_t *id_out,
|
||||
int32_t *id_in, size_t lwe_size) {
|
||||
size_t tid = threadIdx.x;
|
||||
size_t b_id = blockIdx.x;
|
||||
size_t stride = blockDim.x;
|
||||
|
||||
auto input_id = id_in[b_id];
|
||||
auto output_id = id_out[b_id];
|
||||
|
||||
auto cur_src = (input_id >= 0) ? &src[input_id * lwe_size] : nullptr;
|
||||
auto cur_dst = &dst[output_id * lwe_size];
|
||||
|
||||
for (int i = tid; i < lwe_size; i += stride) {
|
||||
cur_dst[i] = (input_id >= 0) ? cur_src[i] : 0;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
__global__ void
|
||||
all_shifted_lhs_rhs(Torus const *radix_lwe_left, Torus *lsb_ciphertext,
|
||||
@@ -76,155 +94,33 @@ all_shifted_lhs_rhs(Torus const *radix_lwe_left, Torus *lsb_ciphertext,
|
||||
}
|
||||
}
|
||||
|
||||
__global__ inline void radix_vec_to_columns(uint32_t *const *const columns,
|
||||
uint32_t *const columns_counter,
|
||||
const uint64_t *const degrees,
|
||||
const uint32_t num_radix_blocks,
|
||||
const uint32_t num_radix_in_vec) {
|
||||
template <typename Torus>
|
||||
__global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
|
||||
uint32_t chunk_size, uint32_t block_size,
|
||||
uint32_t num_blocks) {
|
||||
|
||||
const uint32_t idx = threadIdx.x;
|
||||
size_t cnt = 0;
|
||||
for (int i = 0; i < num_radix_in_vec; i++) {
|
||||
size_t ct_id = i * num_radix_blocks + idx;
|
||||
if (degrees[ct_id] != 0) {
|
||||
columns[idx][cnt] = ct_id;
|
||||
++cnt;
|
||||
size_t stride = blockDim.x;
|
||||
size_t chunk_id = blockIdx.x;
|
||||
size_t chunk_elem_size = chunk_size * num_blocks * block_size;
|
||||
size_t radix_elem_size = num_blocks * block_size;
|
||||
auto src_chunk = &input_blocks[chunk_id * chunk_elem_size];
|
||||
auto dst_radix = &result_blocks[chunk_id * radix_elem_size];
|
||||
size_t block_stride = blockIdx.y * block_size;
|
||||
auto result = &dst_radix[block_stride];
|
||||
|
||||
// init shared mem with first radix of chunk
|
||||
size_t tid = threadIdx.x;
|
||||
for (int i = tid; i < block_size; i += stride) {
|
||||
result[i] = src_chunk[block_stride + i];
|
||||
}
|
||||
|
||||
// accumulate rest of the radixes
|
||||
for (int r_id = 1; r_id < chunk_size; r_id++) {
|
||||
auto cur_src_radix = &src_chunk[r_id * radix_elem_size];
|
||||
for (int i = tid; i < block_size; i += stride) {
|
||||
result[i] += cur_src_radix[block_stride + i];
|
||||
}
|
||||
}
|
||||
columns_counter[idx] = cnt;
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__global__ inline void prepare_new_columns_and_pbs_indexes(
|
||||
uint32_t *const *const new_columns, uint32_t *const new_columns_counter,
|
||||
Torus *const pbs_indexes_in, Torus *const pbs_indexes_out,
|
||||
Torus *const lut_indexes, const uint32_t *const *const columns,
|
||||
const uint32_t *const columns_counter, const uint32_t chunk_size) {
|
||||
__shared__ uint32_t counter;
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
counter = 0;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
const uint32_t base_id = threadIdx.x;
|
||||
const uint32_t column_len = columns_counter[base_id];
|
||||
|
||||
uint32_t ct_count = 0;
|
||||
for (uint32_t i = 0; i + chunk_size <= column_len; i += chunk_size) {
|
||||
// those indexes are for message ciphertexts
|
||||
// for message ciphertexts in and out index should be same
|
||||
const uint32_t in_index = columns[base_id][i];
|
||||
new_columns[base_id][ct_count] = in_index;
|
||||
const uint32_t pbs_index = atomicAdd(&counter, 1);
|
||||
pbs_indexes_in[pbs_index] = in_index;
|
||||
pbs_indexes_out[pbs_index] = in_index;
|
||||
lut_indexes[pbs_index] = 0;
|
||||
++ct_count;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if (base_id > 0) {
|
||||
const uint32_t prev_base_id = base_id - 1;
|
||||
const uint32_t prev_column_len = columns_counter[prev_base_id];
|
||||
|
||||
for (uint32_t i = 0; i + chunk_size <= prev_column_len; i += chunk_size) {
|
||||
// those indexes are for carry ciphertexts
|
||||
// for carry ciphertexts input is same as for message
|
||||
// output will be placed to next block in the column
|
||||
const uint32_t in_index = columns[prev_base_id][i];
|
||||
const uint32_t out_index = columns[prev_base_id][i + 1];
|
||||
new_columns[base_id][ct_count] = out_index;
|
||||
const uint32_t pbs_index = atomicAdd(&counter, 1);
|
||||
pbs_indexes_in[pbs_index] = in_index;
|
||||
pbs_indexes_out[pbs_index] = out_index;
|
||||
lut_indexes[pbs_index] = 1;
|
||||
++ct_count;
|
||||
}
|
||||
}
|
||||
|
||||
const uint32_t start_index = column_len - column_len % chunk_size;
|
||||
for (uint32_t i = start_index; i < column_len; ++i) {
|
||||
new_columns[base_id][ct_count] = columns[base_id][i];
|
||||
++ct_count;
|
||||
}
|
||||
|
||||
new_columns_counter[base_id] = ct_count;
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__global__ inline void prepare_final_pbs_indexes(
|
||||
Torus *const pbs_indexes_in, Torus *const pbs_indexes_out,
|
||||
Torus *const lut_indexes, const uint32_t num_radix_blocks) {
|
||||
int idx = threadIdx.x;
|
||||
pbs_indexes_in[idx] = idx % num_radix_blocks;
|
||||
pbs_indexes_out[idx] = idx + idx / num_radix_blocks;
|
||||
lut_indexes[idx] = idx / num_radix_blocks;
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__global__ void calculate_chunks(Torus *const input_blocks,
|
||||
const uint32_t *const *const columns,
|
||||
const uint32_t *const columns_counter,
|
||||
const uint32_t chunk_size,
|
||||
const uint32_t block_size) {
|
||||
|
||||
const uint32_t part_size = blockDim.x;
|
||||
const uint32_t base_id = blockIdx.x;
|
||||
const uint32_t part_id = blockIdx.y;
|
||||
const uint32_t coef_id = part_id * part_size + threadIdx.x;
|
||||
|
||||
if (coef_id >= block_size)
|
||||
return;
|
||||
|
||||
const uint32_t column_len = columns_counter[base_id];
|
||||
|
||||
if (column_len >= chunk_size) {
|
||||
const uint32_t num_chunks = column_len / chunk_size;
|
||||
Torus result = 0;
|
||||
|
||||
for (uint32_t chunk_id = 0; chunk_id < num_chunks; ++chunk_id) {
|
||||
const uint32_t first_ct_id = columns[base_id][chunk_id * chunk_size];
|
||||
result = input_blocks[first_ct_id * block_size + coef_id];
|
||||
|
||||
for (uint32_t ct_id = 1; ct_id < chunk_size; ++ct_id) {
|
||||
const uint32_t cur_ct_id =
|
||||
columns[base_id][chunk_id * chunk_size + ct_id];
|
||||
result += input_blocks[cur_ct_id * block_size + coef_id];
|
||||
}
|
||||
|
||||
input_blocks[first_ct_id * block_size + coef_id] = result;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__global__ void calculate_final_chunk_into_radix(
|
||||
Torus *const out_radix, const Torus *const input_blocks,
|
||||
const uint32_t *const *const columns, const uint32_t *const columns_counter,
|
||||
const uint32_t chunk_size, const uint32_t block_size) {
|
||||
|
||||
const uint32_t part_size = blockDim.x;
|
||||
const uint32_t base_id = blockIdx.x;
|
||||
const uint32_t part_id = blockIdx.y;
|
||||
const uint32_t coef_id = part_id * part_size + threadIdx.x;
|
||||
|
||||
if (coef_id >= block_size)
|
||||
return;
|
||||
|
||||
const uint32_t column_len = columns_counter[base_id];
|
||||
|
||||
Torus result = 0;
|
||||
if (column_len) {
|
||||
const uint32_t first_ct_id = columns[base_id][0];
|
||||
result = input_blocks[first_ct_id * block_size + coef_id];
|
||||
|
||||
for (uint32_t i = 1; i < column_len; ++i) {
|
||||
const uint32_t cur_ct_it = columns[base_id][i];
|
||||
result += input_blocks[cur_ct_it * block_size + coef_id];
|
||||
}
|
||||
}
|
||||
out_radix[base_id * block_size + coef_id] = result;
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
@@ -271,20 +167,17 @@ __global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
|
||||
(process_msb) ? cur_msb_ct[params::degree] : 0;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ uint64_t scratch_cuda_integer_partial_sum_ciphertexts_vec_kb(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, int_sum_ciphertexts_vec_memory<Torus> **mem_ptr,
|
||||
uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
|
||||
bool reduce_degrees_for_single_carry_propagation, int_radix_params params,
|
||||
bool allocate_gpu_memory) {
|
||||
int_radix_params params, bool allocate_gpu_memory) {
|
||||
|
||||
uint64_t size_tracker = 0;
|
||||
*mem_ptr = new int_sum_ciphertexts_vec_memory<Torus>(
|
||||
streams, gpu_indexes, gpu_count, params, num_blocks_in_radix,
|
||||
max_num_radix_in_vec, reduce_degrees_for_single_carry_propagation,
|
||||
allocate_gpu_memory, &size_tracker);
|
||||
max_num_radix_in_vec, allocate_gpu_memory, &size_tracker);
|
||||
return size_tracker;
|
||||
}
|
||||
|
||||
@@ -295,9 +188,8 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
|
||||
CudaRadixCiphertextFFI *terms, void *const *bsks, uint64_t *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr,
|
||||
uint32_t num_radix_blocks, uint32_t num_radix_in_vec) {
|
||||
auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension;
|
||||
auto big_lwe_size = big_lwe_dimension + 1;
|
||||
uint32_t num_radix_blocks, uint32_t num_radix_in_vec,
|
||||
int_radix_lut<Torus> *reused_lut) {
|
||||
|
||||
if (terms->lwe_dimension != radix_lwe_out->lwe_dimension)
|
||||
PANIC("Cuda error: output and input radix ciphertexts should have the same "
|
||||
@@ -307,232 +199,273 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
|
||||
PANIC("Cuda error: input vector does not have enough blocks")
|
||||
if (num_radix_blocks > radix_lwe_out->num_radix_blocks)
|
||||
PANIC("Cuda error: output does not have enough blocks")
|
||||
if (num_radix_in_vec == 0)
|
||||
return;
|
||||
|
||||
auto current_blocks = mem_ptr->current_blocks;
|
||||
auto new_blocks = mem_ptr->new_blocks;
|
||||
auto new_blocks_copy = mem_ptr->new_blocks_copy;
|
||||
auto old_blocks = mem_ptr->old_blocks;
|
||||
auto small_lwe_vector = mem_ptr->small_lwe_vector;
|
||||
auto d_degrees = mem_ptr->d_degrees;
|
||||
auto d_columns = mem_ptr->d_columns;
|
||||
auto d_columns_counter = mem_ptr->d_columns_counter;
|
||||
auto d_new_columns = mem_ptr->d_new_columns;
|
||||
auto d_new_columns_counter = mem_ptr->d_new_columns_counter;
|
||||
|
||||
auto d_smart_copy_in = mem_ptr->d_smart_copy_in;
|
||||
auto d_smart_copy_out = mem_ptr->d_smart_copy_out;
|
||||
|
||||
auto message_modulus = mem_ptr->params.message_modulus;
|
||||
auto carry_modulus = mem_ptr->params.carry_modulus;
|
||||
auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension;
|
||||
auto big_lwe_size = big_lwe_dimension + 1;
|
||||
auto glwe_dimension = mem_ptr->params.glwe_dimension;
|
||||
auto polynomial_size = mem_ptr->params.polynomial_size;
|
||||
auto small_lwe_dimension = mem_ptr->params.small_lwe_dimension;
|
||||
auto chunk_size =
|
||||
(mem_ptr->params.message_modulus * mem_ptr->params.carry_modulus - 1) /
|
||||
(mem_ptr->params.message_modulus - 1);
|
||||
|
||||
size_t total_blocks_in_vec = num_radix_blocks * num_radix_in_vec;
|
||||
auto small_lwe_size = small_lwe_dimension + 1;
|
||||
|
||||
// In the case of extracting a single LWE this parameters are dummy
|
||||
uint32_t num_many_lut = 1;
|
||||
uint32_t lut_stride = 0;
|
||||
|
||||
if (terms->num_radix_blocks == 0) {
|
||||
if (terms->num_radix_blocks == 0)
|
||||
return;
|
||||
}
|
||||
if (num_radix_in_vec == 1) {
|
||||
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
|
||||
radix_lwe_out, 0, num_radix_blocks,
|
||||
terms, 0, num_radix_blocks);
|
||||
return;
|
||||
}
|
||||
|
||||
if (old_blocks != terms) {
|
||||
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], old_blocks,
|
||||
terms);
|
||||
}
|
||||
if (num_radix_in_vec == 2) {
|
||||
CudaRadixCiphertextFFI terms_slice;
|
||||
as_radix_ciphertext_slice<Torus>(&terms_slice, terms, num_radix_blocks,
|
||||
2 * num_radix_blocks);
|
||||
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, terms,
|
||||
&terms_slice, num_radix_blocks);
|
||||
CudaRadixCiphertextFFI old_blocks_slice;
|
||||
as_radix_ciphertext_slice<Torus>(&old_blocks_slice, old_blocks,
|
||||
num_radix_blocks, 2 * num_radix_blocks);
|
||||
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
|
||||
&old_blocks_slice, num_radix_blocks);
|
||||
return;
|
||||
}
|
||||
|
||||
if (current_blocks != terms) {
|
||||
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
|
||||
current_blocks, terms);
|
||||
size_t r = num_radix_in_vec;
|
||||
size_t total_modulus = message_modulus * carry_modulus;
|
||||
size_t message_max = message_modulus - 1;
|
||||
size_t chunk_size = (total_modulus - 1) / message_max;
|
||||
|
||||
size_t h_lwe_idx_in[terms->num_radix_blocks];
|
||||
size_t h_lwe_idx_out[terms->num_radix_blocks];
|
||||
int32_t h_smart_copy_in[terms->num_radix_blocks];
|
||||
int32_t h_smart_copy_out[terms->num_radix_blocks];
|
||||
|
||||
/// Here it is important to query the default max shared memory on device 0
|
||||
/// instead of cuda_get_max_shared_memory,
|
||||
/// to avoid bugs with tree_add_chunks trying to use too much shared memory
|
||||
auto max_shared_memory = 0;
|
||||
check_cuda_error(cudaDeviceGetAttribute(
|
||||
&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock, 0));
|
||||
|
||||
// create lut object for message and carry
|
||||
// we allocate luts_message_carry in the host function (instead of scratch)
|
||||
// to reduce average memory consumption
|
||||
int_radix_lut<Torus> *luts_message_carry;
|
||||
size_t ch_amount = r / chunk_size;
|
||||
if (!ch_amount)
|
||||
ch_amount++;
|
||||
if (reused_lut == nullptr) {
|
||||
luts_message_carry = new int_radix_lut<Torus>(
|
||||
streams, gpu_indexes, gpu_count, mem_ptr->params, 2,
|
||||
2 * ch_amount * num_radix_blocks, true, nullptr);
|
||||
} else {
|
||||
luts_message_carry = new int_radix_lut<Torus>(
|
||||
streams, gpu_indexes, gpu_count, mem_ptr->params, 2,
|
||||
2 * ch_amount * num_radix_blocks, reused_lut, true, nullptr);
|
||||
}
|
||||
auto message_acc = luts_message_carry->get_lut(0, 0);
|
||||
auto carry_acc = luts_message_carry->get_lut(0, 1);
|
||||
|
||||
cuda_memcpy_async_to_gpu(d_degrees, current_blocks->degrees,
|
||||
total_blocks_in_vec * sizeof(uint64_t), streams[0],
|
||||
gpu_indexes[0]);
|
||||
// define functions for each accumulator
|
||||
auto lut_f_message = [message_modulus](Torus x) -> Torus {
|
||||
return x % message_modulus;
|
||||
};
|
||||
auto lut_f_carry = [message_modulus](Torus x) -> Torus {
|
||||
return x / message_modulus;
|
||||
};
|
||||
|
||||
cuda_set_device(gpu_indexes[0]);
|
||||
radix_vec_to_columns<<<1, num_radix_blocks, 0, streams[0]>>>(
|
||||
d_columns, d_columns_counter, d_degrees, num_radix_blocks,
|
||||
num_radix_in_vec);
|
||||
// generate accumulators
|
||||
generate_device_accumulator<Torus>(
|
||||
streams[0], gpu_indexes[0], message_acc,
|
||||
luts_message_carry->get_degree(0), luts_message_carry->get_max_degree(0),
|
||||
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
|
||||
lut_f_message, true);
|
||||
generate_device_accumulator<Torus>(
|
||||
streams[0], gpu_indexes[0], carry_acc, luts_message_carry->get_degree(1),
|
||||
luts_message_carry->get_max_degree(1), glwe_dimension, polynomial_size,
|
||||
message_modulus, carry_modulus, lut_f_carry, true);
|
||||
luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
|
||||
|
||||
bool needs_processing = false;
|
||||
radix_columns current_columns(current_blocks->degrees, num_radix_blocks,
|
||||
num_radix_in_vec, chunk_size, needs_processing);
|
||||
int number_of_threads = std::min(256, params::degree);
|
||||
int part_count = (big_lwe_size + number_of_threads - 1) / number_of_threads;
|
||||
const dim3 number_of_blocks_2d(num_radix_blocks, part_count, 1);
|
||||
while (r > 2) {
|
||||
size_t cur_total_blocks = r * num_radix_blocks;
|
||||
size_t ch_amount = r / chunk_size;
|
||||
if (!ch_amount)
|
||||
ch_amount++;
|
||||
dim3 add_grid(ch_amount, num_radix_blocks, 1);
|
||||
|
||||
mem_ptr->setup_lookup_tables(streams, gpu_indexes, gpu_count,
|
||||
num_radix_in_vec, current_blocks->degrees);
|
||||
cuda_set_device(gpu_indexes[0]);
|
||||
tree_add_chunks<Torus><<<add_grid, 512, 0, streams[0]>>>(
|
||||
(Torus *)new_blocks->ptr, (Torus *)old_blocks->ptr,
|
||||
std::min(r, chunk_size), big_lwe_size, num_radix_blocks);
|
||||
|
||||
while (needs_processing) {
|
||||
auto luts_message_carry = mem_ptr->luts_message_carry;
|
||||
auto d_pbs_indexes_in = mem_ptr->luts_message_carry->lwe_indexes_in;
|
||||
auto d_pbs_indexes_out = mem_ptr->luts_message_carry->lwe_indexes_out;
|
||||
calculate_chunks<Torus>
|
||||
<<<number_of_blocks_2d, number_of_threads, 0, streams[0]>>>(
|
||||
(Torus *)(current_blocks->ptr), d_columns, d_columns_counter,
|
||||
chunk_size, big_lwe_size);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
prepare_new_columns_and_pbs_indexes<<<1, num_radix_blocks, 0, streams[0]>>>(
|
||||
d_new_columns, d_new_columns_counter, d_pbs_indexes_in,
|
||||
d_pbs_indexes_out, luts_message_carry->get_lut_indexes(0, 0), d_columns,
|
||||
d_columns_counter, chunk_size);
|
||||
size_t total_count = 0;
|
||||
size_t message_count = 0;
|
||||
size_t carry_count = 0;
|
||||
size_t sm_copy_count = 0;
|
||||
|
||||
uint32_t total_ciphertexts;
|
||||
uint32_t total_messages;
|
||||
current_columns.next_accumulation(total_ciphertexts, total_messages,
|
||||
needs_processing);
|
||||
generate_ids_update_degrees(
|
||||
terms->degrees, h_lwe_idx_in, h_lwe_idx_out, h_smart_copy_in,
|
||||
h_smart_copy_out, ch_amount, r, num_radix_blocks, chunk_size,
|
||||
message_max, total_count, message_count, carry_count, sm_copy_count);
|
||||
auto lwe_indexes_in = luts_message_carry->lwe_indexes_in;
|
||||
auto lwe_indexes_out = luts_message_carry->lwe_indexes_out;
|
||||
luts_message_carry->set_lwe_indexes(streams[0], gpu_indexes[0],
|
||||
h_lwe_idx_in, h_lwe_idx_out);
|
||||
|
||||
auto active_gpu_count = get_active_gpu_count(total_ciphertexts, gpu_count);
|
||||
size_t copy_size = sm_copy_count * sizeof(int32_t);
|
||||
cuda_memcpy_async_to_gpu(d_smart_copy_in, h_smart_copy_in, copy_size,
|
||||
streams[0], gpu_indexes[0]);
|
||||
cuda_memcpy_async_to_gpu(d_smart_copy_out, h_smart_copy_out, copy_size,
|
||||
streams[0], gpu_indexes[0]);
|
||||
|
||||
// inside d_smart_copy_in there are only -1 values
|
||||
// it's fine to call smart_copy with same pointer
|
||||
// as source and destination
|
||||
copy_radix_ciphertext_slice_async<Torus>(
|
||||
streams[0], gpu_indexes[0], new_blocks_copy, 0, r * num_radix_blocks,
|
||||
new_blocks, 0, r * num_radix_blocks);
|
||||
smart_copy<Torus><<<sm_copy_count, 1024, 0, streams[0]>>>(
|
||||
(Torus *)new_blocks->ptr, (Torus *)new_blocks_copy->ptr,
|
||||
d_smart_copy_out, d_smart_copy_in, big_lwe_size);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
if (carry_count > 0)
|
||||
cuda_set_value_async<Torus>(
|
||||
streams[0], gpu_indexes[0],
|
||||
luts_message_carry->get_lut_indexes(0, message_count), 1,
|
||||
carry_count);
|
||||
|
||||
luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
|
||||
|
||||
/// For multi GPU execution we create vectors of pointers for inputs and
|
||||
/// outputs
|
||||
std::vector<Torus *> new_blocks_vec = luts_message_carry->lwe_array_in_vec;
|
||||
std::vector<Torus *> small_lwe_vector_vec =
|
||||
luts_message_carry->lwe_after_ks_vec;
|
||||
std::vector<Torus *> lwe_after_pbs_vec =
|
||||
luts_message_carry->lwe_after_pbs_vec;
|
||||
std::vector<Torus *> lwe_trivial_indexes_vec =
|
||||
luts_message_carry->lwe_trivial_indexes_vec;
|
||||
|
||||
auto active_gpu_count = get_active_gpu_count(total_count, gpu_count);
|
||||
if (active_gpu_count == 1) {
|
||||
/// Apply KS to go from a big LWE dimension to a small LWE dimension
|
||||
/// After this keyswitch execution, we need to synchronize the streams
|
||||
/// because the keyswitch and PBS do not operate on the same number of
|
||||
/// inputs
|
||||
execute_keyswitch_async<Torus>(
|
||||
streams, gpu_indexes, 1, (Torus *)small_lwe_vector->ptr,
|
||||
d_pbs_indexes_in, (Torus *)current_blocks->ptr, d_pbs_indexes_in,
|
||||
lwe_indexes_in, (Torus *)new_blocks->ptr, lwe_indexes_in, ksks,
|
||||
polynomial_size * glwe_dimension, small_lwe_dimension,
|
||||
mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, message_count);
|
||||
|
||||
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
|
||||
/// dimension to a big LWE dimension
|
||||
execute_pbs_async<Torus>(
|
||||
streams, gpu_indexes, 1, (Torus *)new_blocks->ptr, lwe_indexes_out,
|
||||
luts_message_carry->lut_vec, luts_message_carry->lut_indexes_vec,
|
||||
(Torus *)small_lwe_vector->ptr, lwe_indexes_in, bsks,
|
||||
ms_noise_reduction_key, luts_message_carry->buffer, glwe_dimension,
|
||||
small_lwe_dimension, polynomial_size, mem_ptr->params.pbs_base_log,
|
||||
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
|
||||
total_count, mem_ptr->params.pbs_type, num_many_lut, lut_stride);
|
||||
} else {
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
|
||||
multi_gpu_scatter_lwe_async<Torus>(
|
||||
streams, gpu_indexes, active_gpu_count, new_blocks_vec,
|
||||
(Torus *)new_blocks->ptr, luts_message_carry->h_lwe_indexes_in,
|
||||
luts_message_carry->using_trivial_lwe_indexes, message_count,
|
||||
big_lwe_size);
|
||||
|
||||
/// Apply KS to go from a big LWE dimension to a small LWE dimension
|
||||
/// After this keyswitch execution, we need to synchronize the streams
|
||||
/// because the keyswitch and PBS do not operate on the same number of
|
||||
/// inputs
|
||||
execute_keyswitch_async<Torus>(
|
||||
streams, gpu_indexes, active_gpu_count, small_lwe_vector_vec,
|
||||
lwe_trivial_indexes_vec, new_blocks_vec, lwe_trivial_indexes_vec,
|
||||
ksks, big_lwe_dimension, small_lwe_dimension,
|
||||
mem_ptr->params.ks_base_log, mem_ptr->params.ks_level,
|
||||
total_messages);
|
||||
mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, total_count);
|
||||
|
||||
/// Copy data back to GPU 0, rebuild the lwe array, and scatter again on a
|
||||
/// different configuration
|
||||
multi_gpu_gather_lwe_async<Torus>(
|
||||
streams, gpu_indexes, gpu_count, (Torus *)small_lwe_vector->ptr,
|
||||
small_lwe_vector_vec, luts_message_carry->h_lwe_indexes_in,
|
||||
luts_message_carry->using_trivial_lwe_indexes, message_count,
|
||||
small_lwe_size);
|
||||
/// Synchronize all GPUs
|
||||
for (uint i = 0; i < active_gpu_count; i++) {
|
||||
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
|
||||
}
|
||||
|
||||
multi_gpu_scatter_lwe_async<Torus>(
|
||||
streams, gpu_indexes, gpu_count, small_lwe_vector_vec,
|
||||
(Torus *)small_lwe_vector->ptr, luts_message_carry->h_lwe_indexes_in,
|
||||
luts_message_carry->using_trivial_lwe_indexes, total_count,
|
||||
small_lwe_size);
|
||||
|
||||
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
|
||||
/// dimension to a big LWE dimension
|
||||
execute_pbs_async<Torus>(
|
||||
streams, gpu_indexes, 1, (Torus *)current_blocks->ptr,
|
||||
d_pbs_indexes_out, luts_message_carry->lut_vec,
|
||||
luts_message_carry->lut_indexes_vec, (Torus *)small_lwe_vector->ptr,
|
||||
d_pbs_indexes_in, bsks, ms_noise_reduction_key,
|
||||
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
|
||||
lwe_trivial_indexes_vec, luts_message_carry->lut_vec,
|
||||
luts_message_carry->lut_indexes_vec, small_lwe_vector_vec,
|
||||
lwe_trivial_indexes_vec, bsks, ms_noise_reduction_key,
|
||||
luts_message_carry->buffer, glwe_dimension, small_lwe_dimension,
|
||||
polynomial_size, mem_ptr->params.pbs_base_log,
|
||||
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
|
||||
total_ciphertexts, mem_ptr->params.pbs_type, num_many_lut,
|
||||
lut_stride);
|
||||
} else {
|
||||
Torus *h_lwe_indexes_in_pinned;
|
||||
Torus *h_lwe_indexes_out_pinned;
|
||||
cudaMallocHost((void **)&h_lwe_indexes_in_pinned,
|
||||
total_ciphertexts * sizeof(Torus));
|
||||
cudaMallocHost((void **)&h_lwe_indexes_out_pinned,
|
||||
total_ciphertexts * sizeof(Torus));
|
||||
for (uint32_t i = 0; i < total_ciphertexts; i++) {
|
||||
h_lwe_indexes_in_pinned[i] = luts_message_carry->h_lwe_indexes_in[i];
|
||||
h_lwe_indexes_out_pinned[i] = luts_message_carry->h_lwe_indexes_out[i];
|
||||
}
|
||||
cuda_memcpy_async_to_cpu(
|
||||
h_lwe_indexes_in_pinned, luts_message_carry->lwe_indexes_in,
|
||||
total_ciphertexts * sizeof(Torus), streams[0], gpu_indexes[0]);
|
||||
cuda_memcpy_async_to_cpu(
|
||||
h_lwe_indexes_out_pinned, luts_message_carry->lwe_indexes_out,
|
||||
total_ciphertexts * sizeof(Torus), streams[0], gpu_indexes[0]);
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
for (uint32_t i = 0; i < total_ciphertexts; i++) {
|
||||
luts_message_carry->h_lwe_indexes_in[i] = h_lwe_indexes_in_pinned[i];
|
||||
luts_message_carry->h_lwe_indexes_out[i] = h_lwe_indexes_out_pinned[i];
|
||||
}
|
||||
cudaFreeHost(h_lwe_indexes_in_pinned);
|
||||
cudaFreeHost(h_lwe_indexes_out_pinned);
|
||||
total_count, mem_ptr->params.pbs_type, num_many_lut, lut_stride);
|
||||
|
||||
luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
|
||||
luts_message_carry->using_trivial_lwe_indexes = false;
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, current_blocks, current_blocks, bsks,
|
||||
ksks, ms_noise_reduction_key, luts_message_carry, total_ciphertexts);
|
||||
multi_gpu_gather_lwe_async<Torus>(
|
||||
streams, gpu_indexes, active_gpu_count, (Torus *)new_blocks->ptr,
|
||||
lwe_after_pbs_vec, luts_message_carry->h_lwe_indexes_out,
|
||||
luts_message_carry->using_trivial_lwe_indexes, total_count,
|
||||
big_lwe_size);
|
||||
/// Synchronize all GPUs
|
||||
for (uint i = 0; i < active_gpu_count; i++) {
|
||||
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
|
||||
}
|
||||
}
|
||||
cuda_set_device(gpu_indexes[0]);
|
||||
std::swap(d_columns, d_new_columns);
|
||||
std::swap(d_columns_counter, d_new_columns_counter);
|
||||
}
|
||||
|
||||
calculate_final_chunk_into_radix<Torus>
|
||||
<<<number_of_blocks_2d, number_of_threads, 0, streams[0]>>>(
|
||||
(Torus *)(radix_lwe_out->ptr), (Torus *)(current_blocks->ptr),
|
||||
d_columns, d_columns_counter, chunk_size, big_lwe_size);
|
||||
|
||||
if (mem_ptr->reduce_degrees_for_single_carry_propagation) {
|
||||
auto luts_message_carry = mem_ptr->luts_message_carry;
|
||||
auto d_pbs_indexes_in = mem_ptr->luts_message_carry->lwe_indexes_in;
|
||||
auto d_pbs_indexes_out = mem_ptr->luts_message_carry->lwe_indexes_out;
|
||||
prepare_final_pbs_indexes<Torus>
|
||||
<<<1, 2 * num_radix_blocks, 0, streams[0]>>>(
|
||||
d_pbs_indexes_in, d_pbs_indexes_out,
|
||||
luts_message_carry->get_lut_indexes(0, 0), num_radix_blocks);
|
||||
|
||||
cuda_memset_async(
|
||||
(Torus *)(current_blocks->ptr) + big_lwe_size * num_radix_blocks, 0,
|
||||
big_lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]);
|
||||
|
||||
auto active_gpu_count =
|
||||
get_active_gpu_count(2 * num_radix_blocks, gpu_count);
|
||||
|
||||
if (active_gpu_count == 1) {
|
||||
execute_keyswitch_async<Torus>(
|
||||
streams, gpu_indexes, 1, (Torus *)small_lwe_vector->ptr,
|
||||
d_pbs_indexes_in, (Torus *)radix_lwe_out->ptr, d_pbs_indexes_in, ksks,
|
||||
big_lwe_dimension, small_lwe_dimension, mem_ptr->params.ks_base_log,
|
||||
mem_ptr->params.ks_level, num_radix_blocks);
|
||||
|
||||
execute_pbs_async<Torus>(
|
||||
streams, gpu_indexes, 1, (Torus *)current_blocks->ptr,
|
||||
d_pbs_indexes_out, luts_message_carry->lut_vec,
|
||||
luts_message_carry->lut_indexes_vec, (Torus *)small_lwe_vector->ptr,
|
||||
d_pbs_indexes_in, bsks, ms_noise_reduction_key,
|
||||
luts_message_carry->buffer, glwe_dimension, small_lwe_dimension,
|
||||
polynomial_size, mem_ptr->params.pbs_base_log,
|
||||
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
|
||||
2 * num_radix_blocks, mem_ptr->params.pbs_type, num_many_lut,
|
||||
lut_stride);
|
||||
} else {
|
||||
uint32_t num_blocks_in_apply_lut = 2 * num_radix_blocks;
|
||||
Torus *h_lwe_indexes_in_pinned;
|
||||
Torus *h_lwe_indexes_out_pinned;
|
||||
cudaMallocHost((void **)&h_lwe_indexes_in_pinned,
|
||||
num_blocks_in_apply_lut * sizeof(Torus));
|
||||
cudaMallocHost((void **)&h_lwe_indexes_out_pinned,
|
||||
num_blocks_in_apply_lut * sizeof(Torus));
|
||||
for (uint32_t i = 0; i < num_blocks_in_apply_lut; i++) {
|
||||
h_lwe_indexes_in_pinned[i] = luts_message_carry->h_lwe_indexes_in[i];
|
||||
h_lwe_indexes_out_pinned[i] = luts_message_carry->h_lwe_indexes_out[i];
|
||||
}
|
||||
cuda_memcpy_async_to_cpu(
|
||||
h_lwe_indexes_in_pinned, luts_message_carry->lwe_indexes_in,
|
||||
num_blocks_in_apply_lut * sizeof(Torus), streams[0], gpu_indexes[0]);
|
||||
cuda_memcpy_async_to_cpu(
|
||||
h_lwe_indexes_out_pinned, luts_message_carry->lwe_indexes_out,
|
||||
num_blocks_in_apply_lut * sizeof(Torus), streams[0], gpu_indexes[0]);
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
for (uint32_t i = 0; i < num_blocks_in_apply_lut; i++) {
|
||||
luts_message_carry->h_lwe_indexes_in[i] = h_lwe_indexes_in_pinned[i];
|
||||
luts_message_carry->h_lwe_indexes_out[i] = h_lwe_indexes_out_pinned[i];
|
||||
}
|
||||
cudaFreeHost(h_lwe_indexes_in_pinned);
|
||||
cudaFreeHost(h_lwe_indexes_out_pinned);
|
||||
|
||||
luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
|
||||
luts_message_carry->using_trivial_lwe_indexes = false;
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, active_gpu_count, current_blocks, radix_lwe_out,
|
||||
bsks, ksks, ms_noise_reduction_key, luts_message_carry,
|
||||
num_blocks_in_apply_lut);
|
||||
for (uint i = 0; i < total_count; i++) {
|
||||
auto degrees_index = luts_message_carry->h_lut_indexes[i];
|
||||
new_blocks->degrees[i] = luts_message_carry->degrees[degrees_index];
|
||||
new_blocks->noise_levels[i] = NoiseLevel::NOMINAL;
|
||||
}
|
||||
calculate_final_degrees(radix_lwe_out->degrees, terms->degrees,
|
||||
num_radix_blocks, num_radix_in_vec, chunk_size,
|
||||
mem_ptr->params.message_modulus);
|
||||
cuda_set_device(gpu_indexes[0]);
|
||||
CudaRadixCiphertextFFI current_blocks_slice;
|
||||
as_radix_ciphertext_slice<Torus>(&current_blocks_slice, current_blocks,
|
||||
num_radix_blocks, 2 * num_radix_blocks);
|
||||
|
||||
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out,
|
||||
current_blocks, &current_blocks_slice,
|
||||
num_radix_blocks);
|
||||
int rem_blocks = (r > chunk_size) ? r % chunk_size * num_radix_blocks : 0;
|
||||
int new_blocks_created = 2 * ch_amount * num_radix_blocks;
|
||||
|
||||
if (rem_blocks > 0)
|
||||
copy_radix_ciphertext_slice_async<Torus>(
|
||||
streams[0], gpu_indexes[0], new_blocks, new_blocks_created,
|
||||
new_blocks_created + rem_blocks, old_blocks,
|
||||
cur_total_blocks - rem_blocks, cur_total_blocks);
|
||||
std::swap(new_blocks, old_blocks);
|
||||
r = (new_blocks_created + rem_blocks) / num_radix_blocks;
|
||||
}
|
||||
luts_message_carry->release(streams, gpu_indexes, gpu_count);
|
||||
delete (luts_message_carry);
|
||||
|
||||
CudaRadixCiphertextFFI old_blocks_slice;
|
||||
as_radix_ciphertext_slice<Torus>(&old_blocks_slice, old_blocks,
|
||||
num_radix_blocks, 2 * num_radix_blocks);
|
||||
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
|
||||
&old_blocks_slice, num_radix_blocks);
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
@@ -669,7 +602,7 @@ __host__ void host_integer_mult_radix_kb(
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<Torus, params>(
|
||||
streams, gpu_indexes, gpu_count, radix_lwe_out, vector_result_sb, bsks,
|
||||
ksks, ms_noise_reduction_key, mem_ptr->sum_ciphertexts_mem, num_blocks,
|
||||
2 * num_blocks);
|
||||
2 * num_blocks, mem_ptr->luts_array);
|
||||
|
||||
auto scp_mem_ptr = mem_ptr->sc_prop_mem;
|
||||
uint32_t requested_flag = outputFlag::FLAG_NONE;
|
||||
|
||||
@@ -34,7 +34,7 @@ void update_degrees_after_scalar_bitor(uint64_t *output_degrees,
|
||||
auto result = max;
|
||||
|
||||
for (uint j = 0; j < min + 1; j++) {
|
||||
if ((max | j) > result) {
|
||||
if (max | j > result) {
|
||||
result = max | j;
|
||||
}
|
||||
}
|
||||
@@ -52,7 +52,7 @@ void update_degrees_after_scalar_bitxor(uint64_t *output_degrees,
|
||||
|
||||
// Try every possibility to find the worst case
|
||||
for (uint j = 0; j < min + 1; j++) {
|
||||
if ((max ^ j) > result) {
|
||||
if (max ^ j > result) {
|
||||
result = max ^ j;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,54 +0,0 @@
|
||||
#include "scalar_div.cuh"
|
||||
|
||||
uint64_t scratch_cuda_integer_unsigned_scalar_div_radix_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
PBS_TYPE pbs_type, bool allocate_gpu_memory, bool is_divisor_power_of_two,
|
||||
bool log2_divisor_exceeds_threshold, bool multiplier_exceeds_threshold,
|
||||
uint32_t num_scalar_bits, uint32_t ilog2_divisor, bool allocate_ms_array) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
glwe_dimension * polynomial_size, lwe_dimension,
|
||||
ks_level, ks_base_log, pbs_level, pbs_base_log,
|
||||
grouping_factor, message_modulus, carry_modulus,
|
||||
allocate_ms_array);
|
||||
|
||||
return scratch_integer_unsigned_scalar_div_radix<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, params,
|
||||
(int_unsigned_scalar_div_mem<uint64_t> **)mem_ptr, num_blocks,
|
||||
allocate_gpu_memory, is_divisor_power_of_two,
|
||||
log2_divisor_exceeds_threshold, multiplier_exceeds_threshold,
|
||||
num_scalar_bits, ilog2_divisor);
|
||||
}
|
||||
|
||||
void cuda_integer_unsigned_scalar_div_radix_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *numerator_ct, int8_t *mem_ptr, void *const *ksks,
|
||||
uint64_t const *decomposed_scalar, uint64_t const *has_at_least_one_set,
|
||||
const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
|
||||
void *const *bsks, uint32_t num_scalars, bool multiplier_exceeds_threshold,
|
||||
bool is_divisor_power_of_two, bool log2_divisor_exceeds_threshold,
|
||||
uint32_t ilog2_divisor, uint64_t shift_pre, uint32_t shift_post,
|
||||
uint64_t rhs) {
|
||||
|
||||
host_integer_unsigned_scalar_div_radix<uint64_t>(
|
||||
(cudaStream_t *)streams, gpu_indexes, gpu_count, numerator_ct,
|
||||
(int_unsigned_scalar_div_mem<uint64_t> *)mem_ptr, (uint64_t **)ksks,
|
||||
decomposed_scalar, has_at_least_one_set, ms_noise_reduction_key, bsks,
|
||||
num_scalars, multiplier_exceeds_threshold, is_divisor_power_of_two,
|
||||
log2_divisor_exceeds_threshold, ilog2_divisor, shift_pre, shift_post,
|
||||
rhs);
|
||||
}
|
||||
|
||||
void cleanup_cuda_integer_unsigned_scalar_div_radix_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
int8_t **mem_ptr_void) {
|
||||
|
||||
int_unsigned_scalar_div_mem<uint64_t> *mem_ptr =
|
||||
(int_unsigned_scalar_div_mem<uint64_t> *)(*mem_ptr_void);
|
||||
|
||||
mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);
|
||||
}
|
||||
@@ -1,121 +0,0 @@
|
||||
#ifndef SCALAR_DIV_CUH
|
||||
#define SCALAR_DIV_CUH
|
||||
|
||||
#include "integer/integer_utilities.h"
|
||||
#include "integer/scalar_mul.cuh"
|
||||
#include "integer/scalar_shifts.cuh"
|
||||
#include "integer/subtraction.cuh"
|
||||
|
||||
template <typename Torus>
|
||||
__host__ uint64_t scratch_integer_unsigned_scalar_div_radix(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, const int_radix_params params,
|
||||
int_unsigned_scalar_div_mem<Torus> **mem_ptr, uint32_t num_radix_blocks,
|
||||
const bool allocate_gpu_memory, bool is_divisor_power_of_two,
|
||||
bool log2_divisor_exceeds_threshold, bool multiplier_exceeds_threshold,
|
||||
uint32_t num_scalar_bits, uint32_t ilog2_divisor) {
|
||||
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
*mem_ptr = new int_unsigned_scalar_div_mem<Torus>(
|
||||
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
|
||||
allocate_gpu_memory, is_divisor_power_of_two,
|
||||
log2_divisor_exceeds_threshold, multiplier_exceeds_threshold,
|
||||
ilog2_divisor, num_scalar_bits, &size_tracker);
|
||||
|
||||
return size_tracker;
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_integer_unsigned_scalar_div_radix(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *numerator_ct,
|
||||
int_unsigned_scalar_div_mem<Torus> *mem_ptr, Torus *const *ksks,
|
||||
uint64_t const *decomposed_scalar, uint64_t const *has_at_least_one_set,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
void *const *bsks, uint32_t num_scalars, bool multiplier_exceeds_threshold,
|
||||
bool is_divisor_power_of_two, bool log2_divisor_exceeds_threshold,
|
||||
uint32_t ilog2_divisor, uint64_t shift_pre, uint32_t shift_post,
|
||||
uint64_t rhs) {
|
||||
|
||||
if (ilog2_divisor == (uint32_t)0) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (is_divisor_power_of_two) {
|
||||
|
||||
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
|
||||
streams, gpu_indexes, gpu_count, numerator_ct, ilog2_divisor,
|
||||
mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
|
||||
numerator_ct->num_radix_blocks);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
if (log2_divisor_exceeds_threshold) {
|
||||
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], numerator_ct,
|
||||
mem_ptr->tmp_ffi);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
if (multiplier_exceeds_threshold) {
|
||||
|
||||
if (shift_pre != (uint64_t)0) {
|
||||
PANIC("shift_pre should be == 0");
|
||||
}
|
||||
|
||||
if (shift_post == (uint32_t)0) {
|
||||
PANIC("shift_post should be > 0");
|
||||
}
|
||||
|
||||
CudaRadixCiphertextFFI *numerator_cpy = mem_ptr->tmp_ffi;
|
||||
|
||||
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
|
||||
numerator_cpy, numerator_ct);
|
||||
|
||||
host_integer_radix_scalar_mul_high_kb(
|
||||
streams, gpu_indexes, gpu_count, numerator_cpy,
|
||||
mem_ptr->scalar_mul_high_mem, ksks, rhs, decomposed_scalar,
|
||||
has_at_least_one_set, ms_noise_reduction_key, bsks, num_scalars);
|
||||
|
||||
host_sub_and_propagate_single_carry(
|
||||
streams, gpu_indexes, gpu_count, numerator_ct, numerator_cpy, nullptr,
|
||||
nullptr, mem_ptr->sub_and_propagate_mem, bsks, ksks,
|
||||
ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);
|
||||
|
||||
host_integer_radix_logical_scalar_shift_kb_inplace(
|
||||
streams, gpu_indexes, gpu_count, numerator_ct, (uint32_t)1,
|
||||
mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
|
||||
numerator_ct->num_radix_blocks);
|
||||
|
||||
host_add_and_propagate_single_carry(
|
||||
streams, gpu_indexes, gpu_count, numerator_ct, numerator_cpy, nullptr,
|
||||
nullptr, mem_ptr->scp_mem, bsks, ksks, ms_noise_reduction_key,
|
||||
FLAG_NONE, (uint32_t)0);
|
||||
|
||||
host_integer_radix_logical_scalar_shift_kb_inplace(
|
||||
streams, gpu_indexes, gpu_count, numerator_ct, shift_post - (uint32_t)1,
|
||||
mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
|
||||
numerator_ct->num_radix_blocks);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
host_integer_radix_logical_scalar_shift_kb_inplace(
|
||||
streams, gpu_indexes, gpu_count, numerator_ct, shift_pre,
|
||||
mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
|
||||
numerator_ct->num_radix_blocks);
|
||||
|
||||
host_integer_radix_scalar_mul_high_kb(
|
||||
streams, gpu_indexes, gpu_count, numerator_ct,
|
||||
mem_ptr->scalar_mul_high_mem, ksks, rhs, decomposed_scalar,
|
||||
has_at_least_one_set, ms_noise_reduction_key, bsks, num_scalars);
|
||||
|
||||
host_integer_radix_logical_scalar_shift_kb_inplace(
|
||||
streams, gpu_indexes, gpu_count, numerator_ct, shift_post,
|
||||
mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
|
||||
numerator_ct->num_radix_blocks);
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -6,8 +6,7 @@ uint64_t scratch_cuda_integer_scalar_mul_kb_64(
|
||||
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
PBS_TYPE pbs_type, uint32_t num_scalar_bits, bool allocate_gpu_memory,
|
||||
bool allocate_ms_array) {
|
||||
PBS_TYPE pbs_type, bool allocate_gpu_memory, bool allocate_ms_array) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
glwe_dimension * polynomial_size, lwe_dimension,
|
||||
@@ -18,7 +17,7 @@ uint64_t scratch_cuda_integer_scalar_mul_kb_64(
|
||||
return scratch_cuda_integer_radix_scalar_mul_kb<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
(int_scalar_mul_buffer<uint64_t> **)mem_ptr, num_blocks, params,
|
||||
num_scalar_bits, allocate_gpu_memory);
|
||||
allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
|
||||
|
||||
@@ -6,7 +6,6 @@
|
||||
#include <cuda_runtime.h>
|
||||
#endif
|
||||
|
||||
#include "cast.cuh"
|
||||
#include "device.h"
|
||||
#include "integer/integer_utilities.h"
|
||||
#include "multiplication.cuh"
|
||||
@@ -33,12 +32,12 @@ __host__ uint64_t scratch_cuda_integer_radix_scalar_mul_kb(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, int_scalar_mul_buffer<T> **mem_ptr,
|
||||
uint32_t num_radix_blocks, int_radix_params params,
|
||||
uint32_t num_scalar_bits, bool allocate_gpu_memory) {
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
uint64_t size_tracker = 0;
|
||||
*mem_ptr = new int_scalar_mul_buffer<T>(
|
||||
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
|
||||
num_scalar_bits, allocate_gpu_memory, true, &size_tracker);
|
||||
allocate_gpu_memory, true, &size_tracker);
|
||||
return size_tracker;
|
||||
}
|
||||
|
||||
@@ -116,10 +115,13 @@ __host__ void host_integer_scalar_mul_radix(
|
||||
set_zero_radix_ciphertext_slice_async<T>(streams[0], gpu_indexes[0],
|
||||
lwe_array, 0, num_radix_blocks);
|
||||
} else {
|
||||
for (int i = 0; i < j * num_radix_blocks; i++) {
|
||||
all_shifted_buffer->degrees[i] = message_modulus - 1;
|
||||
}
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<T, params>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array, all_shifted_buffer, bsks,
|
||||
ksks, ms_noise_reduction_key, mem->sum_ciphertexts_vec_mem,
|
||||
num_radix_blocks, j);
|
||||
num_radix_blocks, j, nullptr);
|
||||
|
||||
auto scp_mem_ptr = mem->sc_prop_mem;
|
||||
uint32_t requested_flag = outputFlag::FLAG_NONE;
|
||||
@@ -168,91 +170,4 @@ __host__ void host_integer_small_scalar_mul_radix(
|
||||
output_lwe_array->degrees[i] = input_lwe_array->degrees[i] * scalar;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_integer_radix_scalar_mul_high_kb(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *ct,
|
||||
int_scalar_mul_high<Torus> *mem_ptr, Torus *const *ksks, uint64_t rhs,
|
||||
uint64_t const *decomposed_scalar, uint64_t const *has_at_least_one_set,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
void *const *bsks, uint32_t num_scalars) {
|
||||
|
||||
if (rhs == (uint64_t)0) {
|
||||
set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], ct,
|
||||
0, ct->num_radix_blocks);
|
||||
return;
|
||||
}
|
||||
|
||||
CudaRadixCiphertextFFI *tmp_ffi = mem_ptr->tmp;
|
||||
|
||||
host_extend_radix_with_trivial_zero_blocks_msb<Torus>(tmp_ffi, ct, streams,
|
||||
gpu_indexes);
|
||||
|
||||
if (rhs != (uint64_t)1 || tmp_ffi->num_radix_blocks != 0) {
|
||||
if ((rhs & (rhs - 1)) == 0) {
|
||||
|
||||
uint32_t shift = std::log2(rhs);
|
||||
|
||||
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
|
||||
streams, gpu_indexes, gpu_count, tmp_ffi, shift,
|
||||
mem_ptr->logical_scalar_shift_mem, bsks, (uint64_t **)ksks,
|
||||
ms_noise_reduction_key, tmp_ffi->num_radix_blocks);
|
||||
|
||||
} else {
|
||||
|
||||
switch (mem_ptr->params.polynomial_size) {
|
||||
case 512:
|
||||
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<512>>(
|
||||
streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
|
||||
has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
|
||||
(uint64_t **)ksks, ms_noise_reduction_key,
|
||||
mem_ptr->params.message_modulus, num_scalars);
|
||||
break;
|
||||
case 1024:
|
||||
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<1024>>(
|
||||
streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
|
||||
has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
|
||||
(uint64_t **)ksks, ms_noise_reduction_key,
|
||||
mem_ptr->params.message_modulus, num_scalars);
|
||||
break;
|
||||
case 2048:
|
||||
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<2048>>(
|
||||
streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
|
||||
has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
|
||||
(uint64_t **)ksks, ms_noise_reduction_key,
|
||||
mem_ptr->params.message_modulus, num_scalars);
|
||||
break;
|
||||
case 4096:
|
||||
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<4096>>(
|
||||
streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
|
||||
has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
|
||||
(uint64_t **)ksks, ms_noise_reduction_key,
|
||||
mem_ptr->params.message_modulus, num_scalars);
|
||||
break;
|
||||
case 8192:
|
||||
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<8192>>(
|
||||
streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
|
||||
has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
|
||||
(uint64_t **)ksks, ms_noise_reduction_key,
|
||||
mem_ptr->params.message_modulus, num_scalars);
|
||||
break;
|
||||
case 16384:
|
||||
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<16384>>(
|
||||
streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
|
||||
has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
|
||||
(uint64_t **)ksks, ms_noise_reduction_key,
|
||||
mem_ptr->params.message_modulus, num_scalars);
|
||||
break;
|
||||
default:
|
||||
PANIC(
|
||||
"Cuda error (scalar multiplication): unsupported polynomial size. "
|
||||
"Only N = 512, 1024, 2048, 4096, 8192, 16384 are supported.")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
host_trim_radix_blocks_lsb<Torus>(ct, tmp_ffi, streams, gpu_indexes);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,46 +0,0 @@
|
||||
#include "subtraction.cuh"
|
||||
|
||||
uint64_t scratch_cuda_sub_and_propagate_single_carry_kb_64_inplace(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
|
||||
bool allocate_gpu_memory, bool allocate_ms_array) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
big_lwe_dimension, small_lwe_dimension, ks_level,
|
||||
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
|
||||
message_modulus, carry_modulus, allocate_ms_array);
|
||||
|
||||
return scratch_cuda_sub_and_propagate_single_carry<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
(int_sub_and_propagate<uint64_t> **)mem_ptr, num_blocks, params,
|
||||
requested_flag, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cuda_sub_and_propagate_single_carry_kb_64_inplace(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *lhs_array, const CudaRadixCiphertextFFI *rhs_array,
|
||||
CudaRadixCiphertextFFI *carry_out, const CudaRadixCiphertextFFI *carry_in,
|
||||
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t requested_flag, uint32_t uses_carry) {
|
||||
|
||||
host_sub_and_propagate_single_carry<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lhs_array, rhs_array,
|
||||
carry_out, carry_in, (int_sub_and_propagate<uint64_t> *)mem_ptr, bsks,
|
||||
(uint64_t **)(ksks), ms_noise_reduction_key, requested_flag, uses_carry);
|
||||
}
|
||||
|
||||
void cleanup_cuda_sub_and_propagate_single_carry(void *const *streams,
|
||||
uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count,
|
||||
int8_t **mem_ptr_void) {
|
||||
|
||||
int_sub_and_propagate<uint64_t> *mem_ptr =
|
||||
(int_sub_and_propagate<uint64_t> *)(*mem_ptr_void);
|
||||
|
||||
mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);
|
||||
}
|
||||
@@ -8,46 +8,7 @@
|
||||
|
||||
#include "device.h"
|
||||
#include "integer/integer.h"
|
||||
#include "integer/integer_utilities.h"
|
||||
#include "negation.cuh"
|
||||
#include "pbs/pbs_enums.h"
|
||||
|
||||
template <typename Torus>
|
||||
uint64_t scratch_cuda_sub_and_propagate_single_carry(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, int_sub_and_propagate<Torus> **mem_ptr,
|
||||
uint32_t num_radix_blocks, int_radix_params params, uint32_t requested_flag,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
*mem_ptr = new int_sub_and_propagate<Torus>(
|
||||
streams, gpu_indexes, gpu_count, params, num_radix_blocks, requested_flag,
|
||||
allocate_gpu_memory, &size_tracker);
|
||||
|
||||
return size_tracker;
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
void host_sub_and_propagate_single_carry(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *lhs_array,
|
||||
const CudaRadixCiphertextFFI *rhs_array, CudaRadixCiphertextFFI *carry_out,
|
||||
const CudaRadixCiphertextFFI *input_carries,
|
||||
int_sub_and_propagate<Torus> *mem, void *const *bsks, Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t requested_flag, uint32_t uses_carry) {
|
||||
|
||||
host_integer_radix_negation<Torus>(
|
||||
streams, gpu_indexes, gpu_count, mem->neg_rhs_array, rhs_array,
|
||||
mem->params.message_modulus, mem->params.carry_modulus,
|
||||
mem->neg_rhs_array->num_radix_blocks);
|
||||
|
||||
host_add_and_propagate_single_carry<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lhs_array, mem->neg_rhs_array, carry_out,
|
||||
input_carries, mem->sc_prop_mem, bsks, ksks, ms_noise_reduction_key,
|
||||
requested_flag, uses_carry);
|
||||
}
|
||||
#include "linear_algebra.h"
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_integer_radix_subtraction(
|
||||
|
||||
@@ -1,46 +0,0 @@
|
||||
#ifndef CUDA_PROGRAMMABLE_BOOTSTRAP_128_CUH
|
||||
#define CUDA_PROGRAMMABLE_BOOTSTRAP_128_CUH
|
||||
#include "pbs/pbs_128_utilities.h"
|
||||
|
||||
static void
|
||||
execute_scratch_pbs_128(void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count,
|
||||
bool allocate_gpu_memory, bool allocate_ms_array,
|
||||
uint64_t *size_tracker_on_gpu) {
|
||||
// The squash noise function receives as input 64-bit integers
|
||||
*size_tracker_on_gpu = scratch_cuda_programmable_bootstrap_128_vector_64(
|
||||
stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory, allocate_ms_array);
|
||||
}
|
||||
template <typename Torus>
|
||||
static void execute_pbs_128_async(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, const LweArrayVariant<__uint128_t> &lwe_array_out,
|
||||
const std::vector<Torus *> lut_vector,
|
||||
const LweArrayVariant<uint64_t> &lwe_array_in,
|
||||
void *const *bootstrapping_keys,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
std::vector<int8_t *> pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples) {
|
||||
|
||||
for (uint32_t i = 0; i < gpu_count; i++) {
|
||||
int num_inputs_on_gpu = get_num_inputs_on_gpu(num_samples, i, gpu_count);
|
||||
|
||||
Torus *current_lwe_array_out = GET_VARIANT_ELEMENT(lwe_array_out, i);
|
||||
uint64_t *current_lwe_array_in = GET_VARIANT_ELEMENT_64BIT(lwe_array_in, i);
|
||||
void *zeros = nullptr;
|
||||
if (ms_noise_reduction_key != nullptr)
|
||||
zeros = ms_noise_reduction_key->ptr[i];
|
||||
|
||||
cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
streams[i], gpu_indexes[i], current_lwe_array_out, lut_vector[i],
|
||||
current_lwe_array_in, bootstrapping_keys[i], ms_noise_reduction_key,
|
||||
zeros, pbs_buffer[i], lwe_dimension, glwe_dimension, polynomial_size,
|
||||
base_log, level_count, num_inputs_on_gpu);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
@@ -46,7 +46,7 @@ __global__ void device_programmable_bootstrap_cg(
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, int8_t *device_mem,
|
||||
uint64_t device_memory_size_per_block, uint32_t num_many_lut,
|
||||
uint32_t lut_stride, bool uses_noise_reduction) {
|
||||
uint32_t lut_stride) {
|
||||
|
||||
grid_group grid = this_grid();
|
||||
|
||||
@@ -80,9 +80,7 @@ __global__ void device_programmable_bootstrap_cg(
|
||||
// The third dimension of the block is used to determine on which ciphertext
|
||||
// this block is operating, in the case of batch bootstraps
|
||||
const Torus *block_lwe_array_in =
|
||||
uses_noise_reduction
|
||||
? &lwe_array_in[blockIdx.x * (lwe_dimension + 1)]
|
||||
: &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
|
||||
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
|
||||
|
||||
const Torus *block_lut_vector =
|
||||
&lut_vector[lut_vector_indexes[blockIdx.x] * params::degree *
|
||||
@@ -265,9 +263,7 @@ __host__ void host_programmable_bootstrap_cg(
|
||||
int thds = polynomial_size / params::opt;
|
||||
dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1, level_count);
|
||||
|
||||
bool uses_noise_reduction = buffer->uses_noise_reduction;
|
||||
|
||||
void *kernel_args[17];
|
||||
void *kernel_args[16];
|
||||
kernel_args[0] = &lwe_array_out;
|
||||
kernel_args[1] = &lwe_output_indexes;
|
||||
kernel_args[2] = &lut_vector;
|
||||
@@ -283,7 +279,6 @@ __host__ void host_programmable_bootstrap_cg(
|
||||
kernel_args[12] = &d_mem;
|
||||
kernel_args[14] = &num_many_lut;
|
||||
kernel_args[15] = &lut_stride;
|
||||
kernel_args[16] = &uses_noise_reduction;
|
||||
|
||||
if (max_shared_memory < partial_sm) {
|
||||
kernel_args[13] = &full_dm;
|
||||
|
||||
@@ -660,17 +660,24 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
(pbs_buffer<uint64_t, CLASSICAL> *)mem_ptr;
|
||||
|
||||
// If the parameters contain noise reduction key, then apply it
|
||||
if (buffer->uses_noise_reduction) {
|
||||
uint32_t log_modulus = log2(polynomial_size) + 1;
|
||||
host_improve_noise_modulus_switch<uint64_t>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer->temp_lwe_array_in,
|
||||
static_cast<uint64_t const *>(lwe_array_in),
|
||||
static_cast<uint64_t const *>(lwe_input_indexes),
|
||||
static_cast<uint64_t *>(ms_noise_reduction_ptr), lwe_dimension + 1,
|
||||
num_samples, ms_noise_reduction_key->num_zeros,
|
||||
ms_noise_reduction_key->ms_input_variance,
|
||||
ms_noise_reduction_key->ms_r_sigma, ms_noise_reduction_key->ms_bound,
|
||||
log_modulus);
|
||||
if (ms_noise_reduction_key != nullptr &&
|
||||
ms_noise_reduction_key->ptr != nullptr) {
|
||||
if (ms_noise_reduction_key->num_zeros != 0) {
|
||||
uint32_t log_modulus = log2(polynomial_size) + 1;
|
||||
host_improve_noise_modulus_switch<uint64_t>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
buffer->temp_lwe_array_in,
|
||||
static_cast<uint64_t const *>(lwe_array_in),
|
||||
static_cast<uint64_t const *>(lwe_input_indexes),
|
||||
static_cast<uint64_t *>(ms_noise_reduction_ptr), lwe_dimension + 1,
|
||||
num_samples, ms_noise_reduction_key->num_zeros,
|
||||
ms_noise_reduction_key->ms_input_variance,
|
||||
ms_noise_reduction_key->ms_r_sigma, ms_noise_reduction_key->ms_bound,
|
||||
log_modulus);
|
||||
} else {
|
||||
buffer->temp_lwe_array_in =
|
||||
const_cast<uint64_t *>(static_cast<const uint64_t *>(lwe_array_in));
|
||||
}
|
||||
} else {
|
||||
buffer->temp_lwe_array_in =
|
||||
const_cast<uint64_t *>(static_cast<const uint64_t *>(lwe_array_in));
|
||||
|
||||
@@ -27,7 +27,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
double2 *global_join_buffer, uint32_t lwe_iteration,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, int8_t *device_mem,
|
||||
uint64_t device_memory_size_per_block, bool uses_noise_reduction) {
|
||||
uint64_t device_memory_size_per_block) {
|
||||
|
||||
// We use shared memory for the polynomials that are used often during the
|
||||
// bootstrap, since shared memory is kept in L1 cache and accessing it is
|
||||
@@ -55,9 +55,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
// The third dimension of the block is used to determine on which ciphertext
|
||||
// this block is operating, in the case of batch bootstraps
|
||||
const Torus *block_lwe_array_in =
|
||||
uses_noise_reduction
|
||||
? &lwe_array_in[blockIdx.x * (lwe_dimension + 1)]
|
||||
: &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
|
||||
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
|
||||
|
||||
const Torus *block_lut_vector =
|
||||
&lut_vector[lut_vector_indexes[blockIdx.x] * params::degree *
|
||||
@@ -399,8 +397,7 @@ __host__ void execute_step_one(
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm,
|
||||
uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm,
|
||||
bool uses_noise_reduction) {
|
||||
uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm) {
|
||||
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
cuda_set_device(gpu_index);
|
||||
@@ -413,21 +410,20 @@ __host__ void execute_step_one(
|
||||
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
global_accumulator, global_join_buffer, lwe_iteration,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, d_mem,
|
||||
full_dm, uses_noise_reduction);
|
||||
full_dm);
|
||||
} else if (max_shared_memory < full_sm) {
|
||||
device_programmable_bootstrap_step_one<Torus, params, PARTIALSM, first_iter>
|
||||
<<<grid, thds, partial_sm, stream>>>(
|
||||
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
global_accumulator, global_join_buffer, lwe_iteration,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, d_mem,
|
||||
partial_dm, uses_noise_reduction);
|
||||
partial_dm);
|
||||
} else {
|
||||
device_programmable_bootstrap_step_one<Torus, params, FULLSM, first_iter>
|
||||
<<<grid, thds, full_sm, stream>>>(
|
||||
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
global_accumulator, global_join_buffer, lwe_iteration,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, d_mem, 0,
|
||||
uses_noise_reduction);
|
||||
lwe_dimension, polynomial_size, base_log, level_count, d_mem, 0);
|
||||
}
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
@@ -508,7 +504,6 @@ __host__ void host_programmable_bootstrap(
|
||||
Torus *global_accumulator = pbs_buffer->global_accumulator;
|
||||
double2 *global_join_buffer = pbs_buffer->global_join_buffer;
|
||||
int8_t *d_mem = pbs_buffer->d_mem;
|
||||
bool uses_noise_reduction = pbs_buffer->uses_noise_reduction;
|
||||
|
||||
for (int i = 0; i < lwe_dimension; i++) {
|
||||
if (i == 0) {
|
||||
@@ -517,16 +512,14 @@ __host__ void host_programmable_bootstrap(
|
||||
lwe_input_indexes, bootstrapping_key, global_accumulator,
|
||||
global_join_buffer, input_lwe_ciphertext_count, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, d_mem, i,
|
||||
partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one,
|
||||
uses_noise_reduction);
|
||||
partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one);
|
||||
} else {
|
||||
execute_step_one<Torus, params, false>(
|
||||
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, global_accumulator,
|
||||
global_join_buffer, input_lwe_ciphertext_count, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, d_mem, i,
|
||||
partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one,
|
||||
uses_noise_reduction);
|
||||
partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one);
|
||||
}
|
||||
if (i == lwe_dimension - 1) {
|
||||
execute_step_two<Torus, params, true>(
|
||||
|
||||
@@ -8,173 +8,178 @@ bool has_support_to_cuda_programmable_bootstrap_128_cg(
|
||||
max_shared_memory);
|
||||
}
|
||||
|
||||
uint64_t scratch_cuda_programmable_bootstrap_128_vector_64(
|
||||
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
bool allocate_gpu_memory, bool allocate_ms_array) {
|
||||
|
||||
return scratch_cuda_programmable_bootstrap_128_vector<uint64_t>(
|
||||
stream, gpu_index,
|
||||
(pbs_buffer_128<uint64_t, PBS_TYPE::CLASSICAL> **)pbs_buffer,
|
||||
lwe_dimension, glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
}
|
||||
|
||||
/*
|
||||
* This scratch function allocates the necessary amount of data on the GPU for
|
||||
* the PBS on 128 bits inputs, into `buffer`. It also configures SM options on
|
||||
* the GPU in case FULLSM or PARTIALSM mode is going to be used.
|
||||
*/
|
||||
uint64_t scratch_cuda_programmable_bootstrap_128(
|
||||
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
bool allocate_gpu_memory, bool allocate_ms_array) {
|
||||
|
||||
return scratch_cuda_programmable_bootstrap_128_vector_64(
|
||||
stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory, allocate_ms_array);
|
||||
}
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
auto buffer = (pbs_buffer_128<CLASSICAL> **)pbs_buffer;
|
||||
|
||||
template <typename InputTorus>
|
||||
void executor_cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
void *stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
|
||||
__uint128_t const *lut_vector, InputTorus *lwe_array_in,
|
||||
double const *bootstrapping_key,
|
||||
pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> *buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
host_programmable_bootstrap_128<InputTorus, AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
|
||||
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
case 512:
|
||||
host_programmable_bootstrap_128<InputTorus, AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
|
||||
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
case 1024:
|
||||
host_programmable_bootstrap_128<InputTorus, AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
|
||||
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
case 2048:
|
||||
host_programmable_bootstrap_128<InputTorus, AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
|
||||
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
case 4096:
|
||||
host_programmable_bootstrap_128<InputTorus, AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
|
||||
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (classical PBS128): unsupported polynomial size. "
|
||||
"Supported N's are powers of two"
|
||||
" in the interval [256..4096].")
|
||||
}
|
||||
}
|
||||
|
||||
template <typename InputTorus>
|
||||
void executor_cuda_programmable_bootstrap_cg_lwe_ciphertext_vector_128(
|
||||
void *stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
|
||||
__uint128_t const *lut_vector, InputTorus *lwe_array_in,
|
||||
double const *bootstrapping_key,
|
||||
pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> *buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
host_programmable_bootstrap_cg_128<InputTorus, AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
|
||||
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
case 512:
|
||||
host_programmable_bootstrap_cg_128<InputTorus, AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
|
||||
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
case 1024:
|
||||
host_programmable_bootstrap_cg_128<InputTorus, AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
|
||||
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
case 2048:
|
||||
host_programmable_bootstrap_cg_128<InputTorus, AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
|
||||
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
case 4096:
|
||||
host_programmable_bootstrap_cg_128<InputTorus, AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
|
||||
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (classical PBS128): unsupported polynomial size. "
|
||||
"Supported N's are powers of two"
|
||||
" in the interval [256..4096].")
|
||||
}
|
||||
}
|
||||
|
||||
template <typename InputTorus>
|
||||
void host_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
__uint128_t const *lut_vector, void const *lwe_array_in,
|
||||
void const *bootstrapping_key,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
void const *ms_noise_reduction_ptr,
|
||||
pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> *buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
|
||||
if (base_log > 64)
|
||||
PANIC("Cuda error (classical PBS): base log should be <= 64")
|
||||
|
||||
// If the parameters contain noise reduction key, then apply it
|
||||
if (ms_noise_reduction_key->num_zeros != 0) {
|
||||
uint32_t log_modulus = log2(polynomial_size) + 1;
|
||||
host_improve_noise_modulus_switch<InputTorus>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<InputTorus *>(buffer->temp_lwe_array_in),
|
||||
static_cast<InputTorus const *>(lwe_array_in),
|
||||
static_cast<uint64_t const *>(buffer->trivial_indexes),
|
||||
static_cast<const InputTorus *>(ms_noise_reduction_ptr),
|
||||
lwe_dimension + 1, num_samples, ms_noise_reduction_key->num_zeros,
|
||||
ms_noise_reduction_key->ms_input_variance,
|
||||
ms_noise_reduction_key->ms_r_sigma, ms_noise_reduction_key->ms_bound,
|
||||
log_modulus);
|
||||
if (has_support_to_cuda_programmable_bootstrap_128_cg(
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory)) {
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
return scratch_programmable_bootstrap_cg_128<AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
case 512:
|
||||
return scratch_programmable_bootstrap_cg_128<AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
case 1024:
|
||||
return scratch_programmable_bootstrap_cg_128<AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
case 2048:
|
||||
return scratch_programmable_bootstrap_cg_128<AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
case 4096:
|
||||
return scratch_programmable_bootstrap_cg_128<AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
default:
|
||||
PANIC("Cuda error (classical PBS128): unsupported polynomial size. "
|
||||
"Supported N's are powers of two"
|
||||
" in the interval [256..4096].")
|
||||
}
|
||||
} else {
|
||||
buffer->temp_lwe_array_in =
|
||||
const_cast<InputTorus *>(static_cast<const InputTorus *>(lwe_array_in));
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
return scratch_programmable_bootstrap_128<AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
case 512:
|
||||
return scratch_programmable_bootstrap_128<AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
case 1024:
|
||||
return scratch_programmable_bootstrap_128<AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
case 2048:
|
||||
return scratch_programmable_bootstrap_128<AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
case 4096:
|
||||
return scratch_programmable_bootstrap_128<AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
default:
|
||||
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
|
||||
"Supported N's are powers of two"
|
||||
" in the interval [256..4096].")
|
||||
}
|
||||
}
|
||||
switch (buffer->pbs_variant) {
|
||||
case DEFAULT:
|
||||
executor_cuda_programmable_bootstrap_lwe_ciphertext_vector_128<InputTorus>(
|
||||
stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
|
||||
lut_vector, static_cast<InputTorus *>(buffer->temp_lwe_array_in),
|
||||
static_cast<const double *>(bootstrapping_key), buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, num_samples);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
void executor_cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
|
||||
Torus const *lut_vector, Torus *lwe_array_in,
|
||||
double const *bootstrapping_key, pbs_buffer_128<CLASSICAL> *buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
host_programmable_bootstrap_128<AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
|
||||
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
case CG:
|
||||
executor_cuda_programmable_bootstrap_cg_lwe_ciphertext_vector_128<
|
||||
InputTorus>(
|
||||
stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
|
||||
lut_vector, static_cast<InputTorus *>(buffer->temp_lwe_array_in),
|
||||
static_cast<const double *>(bootstrapping_key), buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, num_samples);
|
||||
case 512:
|
||||
host_programmable_bootstrap_128<AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
|
||||
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
case 1024:
|
||||
host_programmable_bootstrap_128<AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
|
||||
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
case 2048:
|
||||
host_programmable_bootstrap_128<AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
|
||||
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
case 4096:
|
||||
host_programmable_bootstrap_128<AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
|
||||
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (PBS): unknown pbs variant.")
|
||||
PANIC("Cuda error (classical PBS128): unsupported polynomial size. "
|
||||
"Supported N's are powers of two"
|
||||
" in the interval [256..4096].")
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
void executor_cuda_programmable_bootstrap_cg_lwe_ciphertext_vector_128(
|
||||
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
|
||||
Torus const *lut_vector, Torus *lwe_array_in,
|
||||
double const *bootstrapping_key, pbs_buffer_128<CLASSICAL> *buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
host_programmable_bootstrap_cg_128<AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
|
||||
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
case 512:
|
||||
host_programmable_bootstrap_cg_128<AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
|
||||
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
case 1024:
|
||||
host_programmable_bootstrap_cg_128<AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
|
||||
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
case 2048:
|
||||
host_programmable_bootstrap_cg_128<AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
|
||||
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
case 4096:
|
||||
host_programmable_bootstrap_cg_128<AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
|
||||
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (classical PBS128): unsupported polynomial size. "
|
||||
"Supported N's are powers of two"
|
||||
" in the interval [256..4096].")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -232,22 +237,57 @@ void host_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
*/
|
||||
|
||||
void cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
void *streams, uint32_t gpu_index, void *lwe_array_out,
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
void const *lut_vector, void const *lwe_array_in,
|
||||
void const *bootstrapping_key,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
void const *ms_noise_reduction_ptr, int8_t *mem_ptr, uint32_t lwe_dimension,
|
||||
void *ms_noise_reduction_ptr, int8_t *mem_ptr, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples) {
|
||||
pbs_buffer_128<uint64_t, PBS_TYPE::CLASSICAL> *buffer =
|
||||
(pbs_buffer_128<uint64_t, PBS_TYPE::CLASSICAL> *)mem_ptr;
|
||||
if (base_log > 64)
|
||||
PANIC("Cuda error (classical PBS): base log should be <= 64")
|
||||
|
||||
host_programmable_bootstrap_lwe_ciphertext_vector_128<uint64_t>(
|
||||
streams, gpu_index, lwe_array_out,
|
||||
static_cast<const __uint128_t *>(lut_vector), lwe_array_in,
|
||||
bootstrapping_key, ms_noise_reduction_key, ms_noise_reduction_ptr, buffer,
|
||||
lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count,
|
||||
num_samples);
|
||||
pbs_buffer_128<CLASSICAL> *buffer = (pbs_buffer_128<CLASSICAL> *)mem_ptr;
|
||||
|
||||
// If the parameters contain noise reduction key, then apply it
|
||||
if (ms_noise_reduction_key->num_zeros != 0) {
|
||||
uint32_t log_modulus = log2(polynomial_size) + 1;
|
||||
host_improve_noise_modulus_switch<__uint128_t>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<__uint128_t *>(buffer->temp_lwe_array_in),
|
||||
static_cast<__uint128_t const *>(lwe_array_in),
|
||||
static_cast<uint64_t const *>(buffer->trivial_indexes),
|
||||
static_cast<const __uint128_t *>(ms_noise_reduction_ptr),
|
||||
lwe_dimension + 1, num_samples, ms_noise_reduction_key->num_zeros,
|
||||
ms_noise_reduction_key->ms_input_variance,
|
||||
ms_noise_reduction_key->ms_r_sigma, ms_noise_reduction_key->ms_bound,
|
||||
log_modulus);
|
||||
} else {
|
||||
buffer->temp_lwe_array_in = const_cast<__uint128_t *>(
|
||||
static_cast<const __uint128_t *>(lwe_array_in));
|
||||
}
|
||||
|
||||
switch (buffer->pbs_variant) {
|
||||
case DEFAULT:
|
||||
executor_cuda_programmable_bootstrap_lwe_ciphertext_vector_128<__uint128_t>(
|
||||
stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
|
||||
static_cast<const __uint128_t *>(lut_vector),
|
||||
static_cast<__uint128_t *>(buffer->temp_lwe_array_in),
|
||||
static_cast<const double *>(bootstrapping_key), buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
case CG:
|
||||
executor_cuda_programmable_bootstrap_cg_lwe_ciphertext_vector_128<
|
||||
__uint128_t>(
|
||||
stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
|
||||
static_cast<const __uint128_t *>(lut_vector),
|
||||
static_cast<__uint128_t *>(buffer->temp_lwe_array_in),
|
||||
static_cast<const double *>(bootstrapping_key), buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (PBS): unknown pbs variant.")
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -256,6 +296,6 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
*/
|
||||
void cleanup_cuda_programmable_bootstrap_128(void *stream, uint32_t gpu_index,
|
||||
int8_t **buffer) {
|
||||
auto x = (pbs_buffer_128<__uint128_t, PBS_TYPE::CLASSICAL> *)(*buffer);
|
||||
auto x = (pbs_buffer_128<CLASSICAL> *)(*buffer);
|
||||
x->release(static_cast<cudaStream_t>(stream), gpu_index);
|
||||
}
|
||||
|
||||
@@ -74,17 +74,16 @@ __device__ void mul_ggsw_glwe_in_fourier_domain_128(
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
template <typename InputTorus, class params, sharedMemDegree SMD,
|
||||
bool first_iter>
|
||||
template <typename Torus, class params, sharedMemDegree SMD, bool first_iter>
|
||||
__global__ void __launch_bounds__(params::degree / params::opt)
|
||||
device_programmable_bootstrap_step_one_128(
|
||||
const __uint128_t *__restrict__ lut_vector,
|
||||
const InputTorus *__restrict__ lwe_array_in,
|
||||
const double *__restrict__ bootstrapping_key,
|
||||
__uint128_t *global_accumulator, double *global_join_buffer,
|
||||
uint32_t lwe_iteration, uint32_t lwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
int8_t *device_mem, uint64_t device_memory_size_per_block) {
|
||||
const Torus *__restrict__ lut_vector,
|
||||
const Torus *__restrict__ lwe_array_in,
|
||||
const double *__restrict__ bootstrapping_key, Torus *global_accumulator,
|
||||
double *global_join_buffer, uint32_t lwe_iteration,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, int8_t *device_mem,
|
||||
uint64_t device_memory_size_per_block) {
|
||||
|
||||
// We use shared memory for the polynomials that are used often during the
|
||||
// bootstrap, since shared memory is kept in L1 cache and accessing it is
|
||||
@@ -101,22 +100,22 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
selected_memory = &device_mem[block_index * device_memory_size_per_block];
|
||||
}
|
||||
|
||||
__uint128_t *accumulator = (__uint128_t *)selected_memory;
|
||||
Torus *accumulator = (Torus *)selected_memory;
|
||||
double *accumulator_fft =
|
||||
(double *)accumulator +
|
||||
(ptrdiff_t)(sizeof(__uint128_t) * polynomial_size / sizeof(double));
|
||||
(ptrdiff_t)(sizeof(Torus) * polynomial_size / sizeof(double));
|
||||
|
||||
if constexpr (SMD == PARTIALSM)
|
||||
accumulator_fft = (double *)sharedmem;
|
||||
|
||||
// The third dimension of the block is used to determine on which ciphertext
|
||||
// this block is operating, in the case of batch bootstraps
|
||||
const InputTorus *block_lwe_array_in =
|
||||
const Torus *block_lwe_array_in =
|
||||
&lwe_array_in[blockIdx.x * (lwe_dimension + 1)];
|
||||
|
||||
const __uint128_t *block_lut_vector = lut_vector;
|
||||
const Torus *block_lut_vector = lut_vector;
|
||||
|
||||
__uint128_t *global_slice =
|
||||
Torus *global_slice =
|
||||
global_accumulator +
|
||||
(blockIdx.y + blockIdx.x * (glwe_dimension + 1)) * params::degree;
|
||||
|
||||
@@ -128,12 +127,12 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
if constexpr (first_iter) {
|
||||
// First iteration
|
||||
// Put "b" in [0, 2N[
|
||||
InputTorus b_hat = 0;
|
||||
modulus_switch<InputTorus>(block_lwe_array_in[lwe_dimension], b_hat,
|
||||
params::log2_degree + 1);
|
||||
Torus b_hat = 0;
|
||||
modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
|
||||
params::log2_degree + 1);
|
||||
// The y-dimension is used to select the element of the GLWE this block will
|
||||
// compute
|
||||
divide_by_monomial_negacyclic_inplace<__uint128_t, params::opt,
|
||||
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
|
||||
false);
|
||||
@@ -147,21 +146,20 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
}
|
||||
|
||||
// Put "a" in [0, 2N[
|
||||
InputTorus a_hat = 0;
|
||||
modulus_switch<InputTorus>(block_lwe_array_in[lwe_iteration], a_hat,
|
||||
params::log2_degree +
|
||||
1); // 2 * params::log2_degree + 1);
|
||||
Torus a_hat = 0;
|
||||
modulus_switch(block_lwe_array_in[lwe_iteration], a_hat,
|
||||
params::log2_degree + 1); // 2 * params::log2_degree + 1);
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// Perform ACC * (X^ä - 1)
|
||||
multiply_by_monomial_negacyclic_and_sub_polynomial<
|
||||
__uint128_t, params::opt, params::degree / params::opt>(
|
||||
global_slice, accumulator, a_hat);
|
||||
Torus, params::opt, params::degree / params::opt>(global_slice,
|
||||
accumulator, a_hat);
|
||||
|
||||
// Perform a rounding to increase the accuracy of the
|
||||
// bootstrapped ciphertext
|
||||
init_decomposer_state_inplace<__uint128_t, params::opt,
|
||||
init_decomposer_state_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, base_log, level_count);
|
||||
|
||||
@@ -170,8 +168,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
// Decompose the accumulator. Each block gets one level of the
|
||||
// decomposition, for the mask and the body (so block 0 will have the
|
||||
// accumulator decomposed at level 0, 1 at 1, etc.)
|
||||
GadgetMatrix<__uint128_t, params> gadget_acc(base_log, level_count,
|
||||
accumulator);
|
||||
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
|
||||
gadget_acc.decompose_and_compress_level_128(accumulator_fft, blockIdx.z);
|
||||
|
||||
// We are using the same memory space for accumulator_fft and
|
||||
@@ -317,10 +314,10 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
*
|
||||
* Each y-block computes one element of the lwe_array_out.
|
||||
*/
|
||||
template <typename InputTorus, class params, sharedMemDegree SMD>
|
||||
template <typename Torus, class params, sharedMemDegree SMD>
|
||||
__global__ void device_programmable_bootstrap_cg_128(
|
||||
__uint128_t *lwe_array_out, const __uint128_t *__restrict__ lut_vector,
|
||||
const InputTorus *__restrict__ lwe_array_in,
|
||||
Torus *lwe_array_out, const Torus *__restrict__ lut_vector,
|
||||
const Torus *__restrict__ lwe_array_in,
|
||||
const double *__restrict__ bootstrapping_key, double *join_buffer,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, int8_t *device_mem,
|
||||
@@ -345,22 +342,23 @@ __global__ void device_programmable_bootstrap_cg_128(
|
||||
|
||||
// We always compute the pointer with most restrictive alignment to avoid
|
||||
// alignment issues
|
||||
__uint128_t *accumulator = (__uint128_t *)selected_memory;
|
||||
__uint128_t *accumulator_rotated =
|
||||
(__uint128_t *)accumulator + (ptrdiff_t)(polynomial_size);
|
||||
Torus *accumulator = (Torus *)selected_memory;
|
||||
Torus *accumulator_rotated =
|
||||
(Torus *)accumulator + (ptrdiff_t)(polynomial_size);
|
||||
double *accumulator_fft =
|
||||
(double *)(accumulator_rotated) +
|
||||
(ptrdiff_t)(polynomial_size * sizeof(__uint128_t) / sizeof(double));
|
||||
(ptrdiff_t)(polynomial_size * sizeof(Torus) / sizeof(double));
|
||||
|
||||
if constexpr (SMD == PARTIALSM)
|
||||
accumulator_fft = (double *)sharedmem;
|
||||
|
||||
// The third dimension of the block is used to determine on which ciphertext
|
||||
// this block is operating, in the case of batch bootstraps
|
||||
const InputTorus *block_lwe_array_in =
|
||||
const Torus *block_lwe_array_in =
|
||||
&lwe_array_in[blockIdx.x * (lwe_dimension + 1)];
|
||||
|
||||
const __uint128_t *block_lut_vector = lut_vector;
|
||||
const Torus *block_lut_vector =
|
||||
&lut_vector[blockIdx.x * params::degree * (glwe_dimension + 1)];
|
||||
|
||||
double *block_join_buffer =
|
||||
&join_buffer[blockIdx.x * level_count * (glwe_dimension + 1) *
|
||||
@@ -370,11 +368,11 @@ __global__ void device_programmable_bootstrap_cg_128(
|
||||
// rotated array is not in use anymore by the time we perform the fft
|
||||
|
||||
// Put "b" in [0, 2N[
|
||||
InputTorus b_hat = 0;
|
||||
modulus_switch<InputTorus>(block_lwe_array_in[lwe_dimension], b_hat,
|
||||
params::log2_degree + 1);
|
||||
Torus b_hat = 0;
|
||||
modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
|
||||
params::log2_degree + 1);
|
||||
|
||||
divide_by_monomial_negacyclic_inplace<__uint128_t, params::opt,
|
||||
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
|
||||
false);
|
||||
@@ -383,18 +381,17 @@ __global__ void device_programmable_bootstrap_cg_128(
|
||||
__syncthreads();
|
||||
|
||||
// Put "a" in [0, 2N[
|
||||
InputTorus a_hat = 0;
|
||||
modulus_switch<InputTorus>(block_lwe_array_in[i], a_hat,
|
||||
params::log2_degree + 1);
|
||||
Torus a_hat = 0;
|
||||
modulus_switch(block_lwe_array_in[i], a_hat, params::log2_degree + 1);
|
||||
|
||||
// Perform ACC * (X^ä - 1)
|
||||
multiply_by_monomial_negacyclic_and_sub_polynomial<
|
||||
__uint128_t, params::opt, params::degree / params::opt>(
|
||||
Torus, params::opt, params::degree / params::opt>(
|
||||
accumulator, accumulator_rotated, a_hat);
|
||||
|
||||
// Perform a rounding to increase the accuracy of the
|
||||
// bootstrapped ciphertext
|
||||
init_decomposer_state_inplace<__uint128_t, params::opt,
|
||||
init_decomposer_state_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator_rotated, base_log, level_count);
|
||||
|
||||
@@ -403,8 +400,8 @@ __global__ void device_programmable_bootstrap_cg_128(
|
||||
// Decompose the accumulator. Each block gets one level of the
|
||||
// decomposition, for the mask and the body (so block 0 will have the
|
||||
// accumulator decomposed at level 0, 1 at 1, etc.)
|
||||
GadgetMatrix<__uint128_t, params> gadget_acc(base_log, level_count,
|
||||
accumulator_rotated);
|
||||
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
|
||||
accumulator_rotated);
|
||||
gadget_acc.decompose_and_compress_level_128(accumulator_fft, blockIdx.z);
|
||||
|
||||
auto acc_fft_re_hi = accumulator_fft + 0 * params::degree / 2;
|
||||
@@ -423,9 +420,8 @@ __global__ void device_programmable_bootstrap_cg_128(
|
||||
acc_fft_re_hi, acc_fft_re_lo, acc_fft_im_hi, acc_fft_im_lo);
|
||||
__syncthreads();
|
||||
|
||||
add_to_torus_128<__uint128_t, params>(acc_fft_re_hi, acc_fft_re_lo,
|
||||
acc_fft_im_hi, acc_fft_im_lo,
|
||||
accumulator);
|
||||
add_to_torus_128<Torus, params>(acc_fft_re_hi, acc_fft_re_lo, acc_fft_im_hi,
|
||||
acc_fft_im_lo, accumulator);
|
||||
}
|
||||
|
||||
auto block_lwe_array_out =
|
||||
@@ -437,20 +433,17 @@ __global__ void device_programmable_bootstrap_cg_128(
|
||||
// Perform a sample extract. At this point, all blocks have the result,
|
||||
// but we do the computation at block 0 to avoid waiting for extra blocks,
|
||||
// in case they're not synchronized
|
||||
sample_extract_mask<__uint128_t, params>(block_lwe_array_out,
|
||||
accumulator);
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
|
||||
|
||||
} else if (blockIdx.y == glwe_dimension) {
|
||||
sample_extract_body<__uint128_t, params>(block_lwe_array_out, accumulator,
|
||||
0);
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename InputTorus, typename params>
|
||||
template <typename params>
|
||||
__host__ uint64_t scratch_programmable_bootstrap_cg_128(
|
||||
cudaStream_t stream, uint32_t gpu_index,
|
||||
pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> **buffer,
|
||||
cudaStream_t stream, uint32_t gpu_index, pbs_buffer_128<CLASSICAL> **buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
bool allocate_gpu_memory, bool allocate_ms_array) {
|
||||
@@ -464,34 +457,33 @@ __host__ uint64_t scratch_programmable_bootstrap_cg_128(
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_programmable_bootstrap_cg_128<InputTorus, params, PARTIALSM>,
|
||||
device_programmable_bootstrap_cg_128<__uint128_t, params, PARTIALSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_programmable_bootstrap_cg_128<InputTorus, params, PARTIALSM>,
|
||||
device_programmable_bootstrap_cg_128<__uint128_t, params, PARTIALSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
} else if (max_shared_memory >= partial_sm) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_programmable_bootstrap_cg_128<InputTorus, params, FULLSM>,
|
||||
device_programmable_bootstrap_cg_128<__uint128_t, params, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_programmable_bootstrap_cg_128<InputTorus, params, FULLSM>,
|
||||
device_programmable_bootstrap_cg_128<__uint128_t, params, FULLSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
uint64_t size_tracker = 0;
|
||||
*buffer = new pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL>(
|
||||
*buffer = new pbs_buffer_128<CLASSICAL>(
|
||||
stream, gpu_index, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, PBS_VARIANT::CG,
|
||||
allocate_gpu_memory, allocate_ms_array, &size_tracker);
|
||||
return size_tracker;
|
||||
}
|
||||
|
||||
template <typename InputTorus, typename params>
|
||||
template <typename params>
|
||||
__host__ uint64_t scratch_programmable_bootstrap_128(
|
||||
cudaStream_t stream, uint32_t gpu_index,
|
||||
pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> **buffer,
|
||||
cudaStream_t stream, uint32_t gpu_index, pbs_buffer_128<CLASSICAL> **buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
bool allocate_gpu_memory, bool allocate_ms_array) {
|
||||
@@ -512,37 +504,37 @@ __host__ uint64_t scratch_programmable_bootstrap_128(
|
||||
// Configure step one
|
||||
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_one) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_programmable_bootstrap_step_one_128<InputTorus, params,
|
||||
device_programmable_bootstrap_step_one_128<__uint128_t, params,
|
||||
PARTIALSM, true>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_programmable_bootstrap_step_one_128<InputTorus, params,
|
||||
device_programmable_bootstrap_step_one_128<__uint128_t, params,
|
||||
PARTIALSM, true>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_programmable_bootstrap_step_one_128<InputTorus, params,
|
||||
device_programmable_bootstrap_step_one_128<__uint128_t, params,
|
||||
PARTIALSM, false>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_programmable_bootstrap_step_one_128<InputTorus, params,
|
||||
device_programmable_bootstrap_step_one_128<__uint128_t, params,
|
||||
PARTIALSM, false>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
} else if (max_shared_memory >= partial_sm) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_programmable_bootstrap_step_one_128<InputTorus, params, FULLSM,
|
||||
device_programmable_bootstrap_step_one_128<__uint128_t, params, FULLSM,
|
||||
true>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_one));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_programmable_bootstrap_step_one_128<InputTorus, params, FULLSM,
|
||||
device_programmable_bootstrap_step_one_128<__uint128_t, params, FULLSM,
|
||||
true>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_programmable_bootstrap_step_one_128<InputTorus, params, FULLSM,
|
||||
device_programmable_bootstrap_step_one_128<__uint128_t, params, FULLSM,
|
||||
false>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_one));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_programmable_bootstrap_step_one_128<InputTorus, params, FULLSM,
|
||||
device_programmable_bootstrap_step_one_128<__uint128_t, params, FULLSM,
|
||||
false>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
@@ -588,122 +580,17 @@ __host__ uint64_t scratch_programmable_bootstrap_128(
|
||||
}
|
||||
|
||||
uint64_t size_tracker = 0;
|
||||
*buffer = new pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL>(
|
||||
*buffer = new pbs_buffer_128<CLASSICAL>(
|
||||
stream, gpu_index, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, PBS_VARIANT::DEFAULT,
|
||||
allocate_gpu_memory, allocate_ms_array, &size_tracker);
|
||||
return size_tracker;
|
||||
}
|
||||
|
||||
/*
|
||||
* This scratch function allocates the necessary amount of data on the GPU for
|
||||
* the PBS on 128 bits inputs, into `buffer`. It also configures SM options on
|
||||
* the GPU in case FULLSM or PARTIALSM mode is going to be used.
|
||||
*/
|
||||
template <typename InputTorus>
|
||||
uint64_t scratch_cuda_programmable_bootstrap_128_vector(
|
||||
void *stream, uint32_t gpu_index,
|
||||
pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> **pbs_buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
bool allocate_gpu_memory, bool allocate_ms_array) {
|
||||
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
auto buffer = (pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> **)pbs_buffer;
|
||||
|
||||
if (has_support_to_cuda_programmable_bootstrap_128_cg(
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory)) {
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
return scratch_programmable_bootstrap_cg_128<InputTorus,
|
||||
AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
break;
|
||||
case 512:
|
||||
return scratch_programmable_bootstrap_cg_128<InputTorus,
|
||||
AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
break;
|
||||
case 1024:
|
||||
return scratch_programmable_bootstrap_cg_128<InputTorus,
|
||||
AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
break;
|
||||
case 2048:
|
||||
return scratch_programmable_bootstrap_cg_128<InputTorus,
|
||||
AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
break;
|
||||
case 4096:
|
||||
return scratch_programmable_bootstrap_cg_128<InputTorus,
|
||||
AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (classical PBS128): unsupported polynomial size. "
|
||||
"Supported N's are powers of two"
|
||||
" in the interval [256..4096].")
|
||||
}
|
||||
} else {
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
return scratch_programmable_bootstrap_128<InputTorus,
|
||||
AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
break;
|
||||
case 512:
|
||||
return scratch_programmable_bootstrap_128<InputTorus,
|
||||
AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
break;
|
||||
case 1024:
|
||||
return scratch_programmable_bootstrap_128<InputTorus,
|
||||
AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
break;
|
||||
case 2048:
|
||||
return scratch_programmable_bootstrap_128<InputTorus,
|
||||
AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
break;
|
||||
case 4096:
|
||||
return scratch_programmable_bootstrap_128<InputTorus,
|
||||
AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
|
||||
"Supported N's are powers of two"
|
||||
" in the interval [256..4096].")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename InputTorus, class params, bool first_iter>
|
||||
template <class params, bool first_iter>
|
||||
__host__ void execute_step_one_128(
|
||||
cudaStream_t stream, uint32_t gpu_index, __uint128_t const *lut_vector,
|
||||
InputTorus *lwe_array_in, double const *bootstrapping_key,
|
||||
__uint128_t *lwe_array_in, double const *bootstrapping_key,
|
||||
__uint128_t *global_accumulator, double *global_join_buffer,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
@@ -716,21 +603,21 @@ __host__ void execute_step_one_128(
|
||||
dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1, level_count);
|
||||
|
||||
if (max_shared_memory < partial_sm) {
|
||||
device_programmable_bootstrap_step_one_128<InputTorus, params, NOSM,
|
||||
device_programmable_bootstrap_step_one_128<__uint128_t, params, NOSM,
|
||||
first_iter>
|
||||
<<<grid, thds, 0, stream>>>(
|
||||
lut_vector, lwe_array_in, bootstrapping_key, global_accumulator,
|
||||
global_join_buffer, lwe_iteration, lwe_dimension, polynomial_size,
|
||||
base_log, level_count, d_mem, full_dm);
|
||||
} else if (max_shared_memory < full_sm) {
|
||||
device_programmable_bootstrap_step_one_128<InputTorus, params, PARTIALSM,
|
||||
device_programmable_bootstrap_step_one_128<__uint128_t, params, PARTIALSM,
|
||||
first_iter>
|
||||
<<<grid, thds, partial_sm, stream>>>(
|
||||
lut_vector, lwe_array_in, bootstrapping_key, global_accumulator,
|
||||
global_join_buffer, lwe_iteration, lwe_dimension, polynomial_size,
|
||||
base_log, level_count, d_mem, partial_dm);
|
||||
} else {
|
||||
device_programmable_bootstrap_step_one_128<InputTorus, params, FULLSM,
|
||||
device_programmable_bootstrap_step_one_128<__uint128_t, params, FULLSM,
|
||||
first_iter>
|
||||
<<<grid, thds, full_sm, stream>>>(
|
||||
lut_vector, lwe_array_in, bootstrapping_key, global_accumulator,
|
||||
@@ -783,12 +670,11 @@ __host__ void execute_step_two_128(
|
||||
/*
|
||||
* Host wrapper to the programmable bootstrap 128
|
||||
*/
|
||||
template <typename InputTorus, class params>
|
||||
template <class params>
|
||||
__host__ void host_programmable_bootstrap_128(
|
||||
cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
|
||||
__uint128_t const *lut_vector, InputTorus *lwe_array_in,
|
||||
double const *bootstrapping_key,
|
||||
pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> *pbs_buffer,
|
||||
__uint128_t const *lut_vector, __uint128_t *lwe_array_in,
|
||||
double const *bootstrapping_key, pbs_buffer_128<CLASSICAL> *pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count) {
|
||||
@@ -818,14 +704,14 @@ __host__ void host_programmable_bootstrap_128(
|
||||
|
||||
for (int i = 0; i < lwe_dimension; i++) {
|
||||
if (i == 0) {
|
||||
execute_step_one_128<InputTorus, params, true>(
|
||||
execute_step_one_128<params, true>(
|
||||
stream, gpu_index, lut_vector, lwe_array_in, bootstrapping_key,
|
||||
global_accumulator, global_join_buffer, input_lwe_ciphertext_count,
|
||||
lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count,
|
||||
d_mem, i, partial_sm, partial_dm_step_one, full_sm_step_one,
|
||||
full_dm_step_one);
|
||||
} else {
|
||||
execute_step_one_128<InputTorus, params, false>(
|
||||
execute_step_one_128<params, false>(
|
||||
stream, gpu_index, lut_vector, lwe_array_in, bootstrapping_key,
|
||||
global_accumulator, global_join_buffer, input_lwe_ciphertext_count,
|
||||
lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count,
|
||||
@@ -850,12 +736,11 @@ __host__ void host_programmable_bootstrap_128(
|
||||
}
|
||||
}
|
||||
|
||||
template <typename InputTorus, class params>
|
||||
template <class params>
|
||||
__host__ void host_programmable_bootstrap_cg_128(
|
||||
cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
|
||||
__uint128_t const *lut_vector, InputTorus const *lwe_array_in,
|
||||
double const *bootstrapping_key,
|
||||
pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> *buffer,
|
||||
__uint128_t const *lut_vector, __uint128_t const *lwe_array_in,
|
||||
double const *bootstrapping_key, pbs_buffer_128<CLASSICAL> *buffer,
|
||||
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count) {
|
||||
@@ -898,20 +783,20 @@ __host__ void host_programmable_bootstrap_cg_128(
|
||||
if (max_shared_memory < partial_sm) {
|
||||
kernel_args[10] = &full_dm;
|
||||
check_cuda_error(cudaLaunchCooperativeKernel(
|
||||
(void *)device_programmable_bootstrap_cg_128<InputTorus, params, NOSM>,
|
||||
(void *)device_programmable_bootstrap_cg_128<__uint128_t, params, NOSM>,
|
||||
grid, thds, (void **)kernel_args, 0, stream));
|
||||
} else if (max_shared_memory < full_sm) {
|
||||
kernel_args[10] = &partial_dm;
|
||||
check_cuda_error(cudaLaunchCooperativeKernel(
|
||||
(void *)
|
||||
device_programmable_bootstrap_cg_128<InputTorus, params, PARTIALSM>,
|
||||
(void *)device_programmable_bootstrap_cg_128<__uint128_t, params,
|
||||
PARTIALSM>,
|
||||
grid, thds, (void **)kernel_args, partial_sm, stream));
|
||||
} else {
|
||||
int no_dm = 0;
|
||||
kernel_args[10] = &no_dm;
|
||||
check_cuda_error(cudaLaunchCooperativeKernel(
|
||||
(void *)
|
||||
device_programmable_bootstrap_cg_128<InputTorus, params, FULLSM>,
|
||||
device_programmable_bootstrap_cg_128<__uint128_t, params, FULLSM>,
|
||||
grid, thds, (void **)kernel_args, full_sm, stream));
|
||||
}
|
||||
|
||||
|
||||
@@ -398,32 +398,20 @@ uint64_t scratch_cuda_multi_bit_programmable_bootstrap_64(
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
|
||||
|
||||
bool supports_cg =
|
||||
supports_cooperative_groups_on_multibit_programmable_bootstrap<uint64_t>(
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index));
|
||||
#if (CUDA_ARCH >= 900)
|
||||
// On H100s we should be using TBC until num_samples < num_sms / 2.
|
||||
// After that we switch to CG until not supported anymore.
|
||||
// At this point we return to TBC.
|
||||
int num_sms = 0;
|
||||
check_cuda_error(cudaDeviceGetAttribute(
|
||||
&num_sms, cudaDevAttrMultiProcessorCount, gpu_index));
|
||||
|
||||
bool supports_tbc =
|
||||
has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
|
||||
if (has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
|
||||
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
|
||||
level_count, cuda_get_max_shared_memory(gpu_index));
|
||||
|
||||
if (supports_tbc &&
|
||||
!(input_lwe_ciphertext_count > num_sms / 2 && supports_cg))
|
||||
level_count, cuda_get_max_shared_memory(gpu_index)))
|
||||
return scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
else
|
||||
#endif
|
||||
if (supports_cg)
|
||||
if (supports_cooperative_groups_on_multibit_programmable_bootstrap<
|
||||
uint64_t>(glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count,
|
||||
cuda_get_max_shared_memory(gpu_index)))
|
||||
return scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
|
||||
@@ -46,7 +46,7 @@ __global__ void device_programmable_bootstrap_tbc(
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, int8_t *device_mem,
|
||||
uint64_t device_memory_size_per_block, bool support_dsm,
|
||||
uint32_t num_many_lut, uint32_t lut_stride, bool uses_noise_reduction) {
|
||||
uint32_t num_many_lut, uint32_t lut_stride) {
|
||||
|
||||
cluster_group cluster = this_cluster();
|
||||
|
||||
@@ -83,9 +83,7 @@ __global__ void device_programmable_bootstrap_tbc(
|
||||
// The third dimension of the block is used to determine on which ciphertext
|
||||
// this block is operating, in the case of batch bootstraps
|
||||
const Torus *block_lwe_array_in =
|
||||
uses_noise_reduction
|
||||
? &lwe_array_in[blockIdx.x * (lwe_dimension + 1)]
|
||||
: &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
|
||||
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
|
||||
|
||||
const Torus *block_lut_vector =
|
||||
&lut_vector[lut_vector_indexes[blockIdx.x] * params::degree *
|
||||
@@ -296,7 +294,7 @@ __host__ void host_programmable_bootstrap_tbc(
|
||||
|
||||
int8_t *d_mem = buffer->d_mem;
|
||||
double2 *buffer_fft = buffer->global_join_buffer;
|
||||
bool uses_noise_reduction = buffer->uses_noise_reduction;
|
||||
|
||||
int thds = polynomial_size / params::opt;
|
||||
dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1, level_count);
|
||||
|
||||
@@ -324,7 +322,7 @@ __host__ void host_programmable_bootstrap_tbc(
|
||||
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
|
||||
lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, d_mem, full_dm,
|
||||
supports_dsm, num_many_lut, lut_stride, uses_noise_reduction));
|
||||
supports_dsm, num_many_lut, lut_stride));
|
||||
} else if (max_shared_memory < full_sm + minimum_sm_tbc) {
|
||||
config.dynamicSmemBytes = partial_sm + minimum_sm_tbc;
|
||||
|
||||
@@ -333,8 +331,7 @@ __host__ void host_programmable_bootstrap_tbc(
|
||||
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
|
||||
lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, d_mem,
|
||||
partial_dm, supports_dsm, num_many_lut, lut_stride,
|
||||
uses_noise_reduction));
|
||||
partial_dm, supports_dsm, num_many_lut, lut_stride));
|
||||
} else {
|
||||
config.dynamicSmemBytes = full_sm + minimum_sm_tbc;
|
||||
|
||||
@@ -343,7 +340,7 @@ __host__ void host_programmable_bootstrap_tbc(
|
||||
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
|
||||
lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, d_mem, 0,
|
||||
supports_dsm, num_many_lut, lut_stride, buffer->uses_noise_reduction));
|
||||
supports_dsm, num_many_lut, lut_stride));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -38,19 +38,17 @@ template <typename T> void print_debug(const char *name, const T *src, int N) {
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void print_body_kernel(T *src, int N, int lwe_dimension, T delta) {
|
||||
__global__ void print_body_kernel(T *src, int N, int lwe_dimension) {
|
||||
for (int i = 0; i < N; i++) {
|
||||
T body = src[i * (lwe_dimension + 1) + lwe_dimension];
|
||||
T clear = body / delta;
|
||||
printf("(%lu, %lu), ", body, clear);
|
||||
printf("%lu, ", src[i * (lwe_dimension + 1) + lwe_dimension]);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void print_body(const char *name, T *src, int n, int lwe_dimension, T delta) {
|
||||
void print_body(const char *name, T *src, int n, int lwe_dimension) {
|
||||
printf("%s: ", name);
|
||||
cudaDeviceSynchronize();
|
||||
print_body_kernel<<<1, 1>>>(src, n, lwe_dimension, delta);
|
||||
print_body_kernel<<<1, 1>>>(src, n, lwe_dimension);
|
||||
cudaDeviceSynchronize();
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
@@ -20,7 +20,7 @@ void multi_gpu_alloc_array_async(cudaStream_t const *streams,
|
||||
&size_tracker_on_gpu_i, allocate_gpu_memory);
|
||||
dest[i] = d_array;
|
||||
if (i == 0 && size_tracker_on_gpu_0 != nullptr) {
|
||||
*size_tracker_on_gpu_0 += size_tracker_on_gpu_i;
|
||||
*size_tracker_on_gpu_0 = size_tracker_on_gpu_i;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -57,17 +57,11 @@ void multi_gpu_alloc_lwe_async(cudaStream_t const *streams,
|
||||
&size_tracker_on_gpu_i, allocate_gpu_memory);
|
||||
dest[i] = d_array;
|
||||
if (i == 0 && size_tracker_on_gpu_0 != nullptr) {
|
||||
*size_tracker_on_gpu_0 += size_tracker_on_gpu_i;
|
||||
*size_tracker_on_gpu_0 = size_tracker_on_gpu_i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template void multi_gpu_alloc_lwe_async<__uint128_t>(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, std::vector<__uint128_t *> &dest, uint32_t num_inputs,
|
||||
uint32_t lwe_size, uint64_t *size_tracker_on_gpu_0,
|
||||
bool allocate_gpu_memory);
|
||||
|
||||
/// Allocates the input/output vector for all devices
|
||||
/// Initializes also the related indexing and initializes it to the trivial
|
||||
/// index
|
||||
@@ -86,7 +80,7 @@ void multi_gpu_alloc_lwe_many_lut_output_async(
|
||||
gpu_indexes[i], &size_tracker, allocate_gpu_memory);
|
||||
dest[i] = d_array;
|
||||
if (i == 0 && size_tracker_on_gpu_0 != nullptr) {
|
||||
*size_tracker_on_gpu_0 += size_tracker;
|
||||
*size_tracker_on_gpu_0 = size_tracker;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -225,9 +219,5 @@ void multi_gpu_release_async(cudaStream_t const *streams,
|
||||
for (uint i = 0; i < vec.size(); i++)
|
||||
cuda_drop_async(vec[i], streams[i], gpu_indexes[i]);
|
||||
}
|
||||
template void
|
||||
multi_gpu_release_async<__uint128_t>(cudaStream_t const *streams,
|
||||
uint32_t const *gpu_indexes,
|
||||
std::vector<__uint128_t *> &vec);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
#include "helper_profile.cuh"
|
||||
#include <stdint.h>
|
||||
|
||||
uint32_t adler32(const unsigned char *data) {
|
||||
const uint32_t MOD_ADLER = 65521;
|
||||
|
||||
@@ -1,9 +1,6 @@
|
||||
#ifndef HELPER_PROFILE
|
||||
#define HELPER_PROFILE
|
||||
|
||||
#ifdef USE_NVTOOLS
|
||||
#include <nvToolsExt.h>
|
||||
#endif
|
||||
|
||||
void cuda_nvtx_label_with_color(const char *name);
|
||||
void cuda_nvtx_pop();
|
||||
|
||||
@@ -1007,7 +1007,6 @@ unsafe extern "C" {
|
||||
message_modulus: u32,
|
||||
carry_modulus: u32,
|
||||
pbs_type: PBS_TYPE,
|
||||
reduce_degrees_for_single_carry_propagation: bool,
|
||||
allocate_gpu_memory: bool,
|
||||
allocate_ms_array: bool,
|
||||
) -> u64;
|
||||
@@ -1051,7 +1050,6 @@ unsafe extern "C" {
|
||||
message_modulus: u32,
|
||||
carry_modulus: u32,
|
||||
pbs_type: PBS_TYPE,
|
||||
num_scalar_bits: u32,
|
||||
allocate_gpu_memory: bool,
|
||||
allocate_ms_array: bool,
|
||||
) -> u64;
|
||||
@@ -1326,211 +1324,6 @@ unsafe extern "C" {
|
||||
gpu_indexes: *const u32,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn trim_radix_blocks_lsb_64(
|
||||
output: *mut CudaRadixCiphertextFFI,
|
||||
input: *const CudaRadixCiphertextFFI,
|
||||
streams: *const *mut ffi::c_void,
|
||||
gpu_indexes: *const u32,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_apply_noise_squashing_kb(
|
||||
streams: *const *mut ffi::c_void,
|
||||
gpu_indexes: *const u32,
|
||||
gpu_count: u32,
|
||||
mem_ptr: *mut *mut i8,
|
||||
lwe_dimension: u32,
|
||||
glwe_dimension: u32,
|
||||
polynomial_size: u32,
|
||||
input_glwe_dimension: u32,
|
||||
input_polynomial_size: u32,
|
||||
ks_level: u32,
|
||||
ks_base_log: u32,
|
||||
pbs_level: u32,
|
||||
pbs_base_log: u32,
|
||||
grouping_factor: u32,
|
||||
num_radix_blocks: u32,
|
||||
num_original_blocks: u32,
|
||||
message_modulus: u32,
|
||||
carry_modulus: u32,
|
||||
pbs_type: PBS_TYPE,
|
||||
allocate_gpu_memory: bool,
|
||||
allocate_ms_array: bool,
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_apply_noise_squashing_kb(
|
||||
streams: *const *mut ffi::c_void,
|
||||
gpu_indexes: *const u32,
|
||||
gpu_count: u32,
|
||||
output_radix_lwe: *mut CudaRadixCiphertextFFI,
|
||||
input_radix_lwe: *const CudaRadixCiphertextFFI,
|
||||
mem_ptr: *mut i8,
|
||||
ksks: *const *mut ffi::c_void,
|
||||
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
|
||||
bsks: *const *mut ffi::c_void,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cleanup_cuda_apply_noise_squashing_kb(
|
||||
streams: *const *mut ffi::c_void,
|
||||
gpu_indexes: *const u32,
|
||||
gpu_count: u32,
|
||||
mem_ptr_void: *mut *mut i8,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_sub_and_propagate_single_carry_kb_64_inplace(
|
||||
streams: *const *mut ffi::c_void,
|
||||
gpu_indexes: *const u32,
|
||||
gpu_count: u32,
|
||||
mem_ptr: *mut *mut i8,
|
||||
glwe_dimension: u32,
|
||||
polynomial_size: u32,
|
||||
big_lwe_dimension: u32,
|
||||
small_lwe_dimension: u32,
|
||||
ks_level: u32,
|
||||
ks_base_log: u32,
|
||||
pbs_level: u32,
|
||||
pbs_base_log: u32,
|
||||
grouping_factor: u32,
|
||||
num_blocks: u32,
|
||||
message_modulus: u32,
|
||||
carry_modulus: u32,
|
||||
pbs_type: PBS_TYPE,
|
||||
requested_flag: u32,
|
||||
allocate_gpu_memory: bool,
|
||||
allocate_ms_array: bool,
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_sub_and_propagate_single_carry_kb_64_inplace(
|
||||
streams: *const *mut ffi::c_void,
|
||||
gpu_indexes: *const u32,
|
||||
gpu_count: u32,
|
||||
lhs_array: *mut CudaRadixCiphertextFFI,
|
||||
rhs_array: *const CudaRadixCiphertextFFI,
|
||||
carry_out: *mut CudaRadixCiphertextFFI,
|
||||
carry_in: *const CudaRadixCiphertextFFI,
|
||||
mem_ptr: *mut i8,
|
||||
bsks: *const *mut ffi::c_void,
|
||||
ksks: *const *mut ffi::c_void,
|
||||
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
|
||||
requested_flag: u32,
|
||||
uses_carry: u32,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cleanup_cuda_sub_and_propagate_single_carry(
|
||||
streams: *const *mut ffi::c_void,
|
||||
gpu_indexes: *const u32,
|
||||
gpu_count: u32,
|
||||
mem_ptr_void: *mut *mut i8,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_integer_unsigned_scalar_div_radix_kb_64(
|
||||
streams: *const *mut ffi::c_void,
|
||||
gpu_indexes: *const u32,
|
||||
gpu_count: u32,
|
||||
mem_ptr: *mut *mut i8,
|
||||
glwe_dimension: u32,
|
||||
polynomial_size: u32,
|
||||
lwe_dimension: u32,
|
||||
ks_level: u32,
|
||||
ks_base_log: u32,
|
||||
pbs_level: u32,
|
||||
pbs_base_log: u32,
|
||||
grouping_factor: u32,
|
||||
num_blocks: u32,
|
||||
message_modulus: u32,
|
||||
carry_modulus: u32,
|
||||
pbs_type: PBS_TYPE,
|
||||
allocate_gpu_memory: bool,
|
||||
is_divisor_power_of_two: bool,
|
||||
log2_divisor_exceeds_threshold: bool,
|
||||
multiplier_exceeds_threshold: bool,
|
||||
num_scalar_bits: u32,
|
||||
ilog2_divisor: u32,
|
||||
allocate_ms_array: bool,
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_integer_unsigned_scalar_div_radix_kb_64(
|
||||
streams: *const *mut ffi::c_void,
|
||||
gpu_indexes: *const u32,
|
||||
gpu_count: u32,
|
||||
numerator_ct: *mut CudaRadixCiphertextFFI,
|
||||
mem_ptr: *mut i8,
|
||||
ksks: *const *mut ffi::c_void,
|
||||
decomposed_scalar: *const u64,
|
||||
has_at_least_one_set: *const u64,
|
||||
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
|
||||
bsks: *const *mut ffi::c_void,
|
||||
num_scalars: u32,
|
||||
multiplier_exceeds_threshold: bool,
|
||||
is_divisor_power_of_two: bool,
|
||||
log2_divisor_exceeds_threshold: bool,
|
||||
ilog2_divisor: u32,
|
||||
shift_pre: u64,
|
||||
shift_post: u32,
|
||||
rhs: u64,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cleanup_cuda_integer_unsigned_scalar_div_radix_kb_64(
|
||||
streams: *const *mut ffi::c_void,
|
||||
gpu_indexes: *const u32,
|
||||
gpu_count: u32,
|
||||
mem_ptr_void: *mut *mut i8,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_extend_radix_with_sign_msb_64(
|
||||
streams: *const *mut ffi::c_void,
|
||||
gpu_indexes: *const u32,
|
||||
gpu_count: u32,
|
||||
mem_ptr: *mut *mut i8,
|
||||
glwe_dimension: u32,
|
||||
polynomial_size: u32,
|
||||
lwe_dimension: u32,
|
||||
ks_level: u32,
|
||||
ks_base_log: u32,
|
||||
pbs_level: u32,
|
||||
pbs_base_log: u32,
|
||||
grouping_factor: u32,
|
||||
num_blocks: u32,
|
||||
num_additional_blocks: u32,
|
||||
message_modulus: u32,
|
||||
carry_modulus: u32,
|
||||
pbs_type: PBS_TYPE,
|
||||
allocate_gpu_memory: bool,
|
||||
allocate_ms_array: bool,
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_extend_radix_with_sign_msb_64(
|
||||
streams: *const *mut ffi::c_void,
|
||||
gpu_indexes: *const u32,
|
||||
gpu_count: u32,
|
||||
output: *mut CudaRadixCiphertextFFI,
|
||||
input: *const CudaRadixCiphertextFFI,
|
||||
mem_ptr: *mut i8,
|
||||
num_additional_blocks: u32,
|
||||
bsks: *const *mut ffi::c_void,
|
||||
ksks: *const *mut ffi::c_void,
|
||||
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cleanup_cuda_extend_radix_with_sign_msb_64(
|
||||
streams: *const *mut ffi::c_void,
|
||||
gpu_indexes: *const u32,
|
||||
gpu_count: u32,
|
||||
mem_ptr_void: *mut *mut i8,
|
||||
);
|
||||
}
|
||||
pub const KS_TYPE_BIG_TO_SMALL: KS_TYPE = 0;
|
||||
pub const KS_TYPE_SMALL_TO_BIG: KS_TYPE = 1;
|
||||
pub type KS_TYPE = ffi::c_uint;
|
||||
@@ -2079,7 +1872,7 @@ unsafe extern "C" {
|
||||
lwe_array_in: *const ffi::c_void,
|
||||
bootstrapping_key: *const ffi::c_void,
|
||||
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
|
||||
ms_noise_reduction_ptr: *const ffi::c_void,
|
||||
ms_noise_reduction_ptr: *mut ffi::c_void,
|
||||
buffer: *mut i8,
|
||||
lwe_dimension: u32,
|
||||
glwe_dimension: u32,
|
||||
|
||||
@@ -23,7 +23,6 @@ extern "C" {
|
||||
|
||||
pub fn cuda_malloc_async(size: u64, stream: *mut c_void, gpu_index: u32) -> *mut c_void;
|
||||
pub fn cuda_check_valid_malloc(size: u64, gpu_index: u32) -> bool;
|
||||
pub fn cuda_device_total_memory(gpu_index: u32) -> u64;
|
||||
|
||||
pub fn cuda_memcpy_with_size_tracking_async_to_gpu(
|
||||
dest: *mut c_void,
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "tfhe-hpu-backend"
|
||||
version = "0.2.0"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
license = "BSD-3-Clause-Clear"
|
||||
description = "HPU implementation on FPGA of TFHE-rs primitives."
|
||||
@@ -12,10 +12,10 @@ keywords = ["encryption", "fhe", "cryptography", "hardware", "fpga"]
|
||||
|
||||
[features]
|
||||
hw-xrt = []
|
||||
hw-v80 = ["bincode"]
|
||||
hw-v80 = []
|
||||
io-dump = ["num-traits"]
|
||||
rtl_graph = ["dot2"]
|
||||
utils = ["clap", "clap-num", "bitvec", "serde_json", "bincode", "serde_derive"]
|
||||
utils = ["clap", "clap-num", "bitvec", "serde_json"]
|
||||
|
||||
[build-dependencies]
|
||||
cxx-build = "1.0"
|
||||
@@ -52,7 +52,7 @@ ipc-channel = "0.18.3"
|
||||
num-traits = { version = "0.2", optional = true }
|
||||
clap = { version = "4.4.4", features = ["derive"], optional = true }
|
||||
clap-num = { version = "1.1.1", optional = true }
|
||||
nix = { version = "0.29.0", features = ["ioctl", "uio", "fs"] }
|
||||
nix = { version = "0.29.0", features = ["ioctl", "uio"] }
|
||||
|
||||
# Dependencies used for rtl_graph features
|
||||
dot2 = { version = "1.0", optional = true }
|
||||
@@ -60,10 +60,6 @@ dot2 = { version = "1.0", optional = true }
|
||||
bitvec = { version = "1.0", optional = true }
|
||||
serde_json = { version = "1.0", optional = true }
|
||||
|
||||
# Dependencies used for v80 pdi handling
|
||||
bincode ={ version = "1.3", optional = true}
|
||||
serde_derive ={ version = "1.0", optional = true}
|
||||
|
||||
# Binary for manual debugging
|
||||
# Enable to access Hpu register and drive some custom sequence by hand
|
||||
[[bin]]
|
||||
@@ -71,11 +67,6 @@ name = "hputil"
|
||||
path = "src/utils/hputil.rs"
|
||||
required-features = ["utils"]
|
||||
|
||||
[[bin]]
|
||||
name = "hpu_archive_mgmt"
|
||||
path = "src/utils/hpu_archive_mgmt.rs"
|
||||
required-features = ["utils", "hw-v80"]
|
||||
|
||||
# Binary for asm manipulation
|
||||
# Enable to convert back and forth between asm/hex format
|
||||
[[bin]]
|
||||
|
||||
@@ -200,7 +200,7 @@ There are some example applications already available in `tfhe/examples/hpu`:
|
||||
|
||||
In order to run those applications on hardware, user must build from the project root (i.e `tfhe-rs-internal`) with `hpu-v80` features:
|
||||
|
||||
> NB: Running the examples requires the `.hpu` files to have been pulled correctly. Due to their size, those files are backed by git-lfs and disabled by default.
|
||||
> NB: Running the examples requires the `.pdi` files to have been pulled correctly. Due to their size, those files are backed by git-lfs and disabled by default.
|
||||
> In order to retrieve them, use the following command:
|
||||
> ```bash
|
||||
> git lfs pull --include="*" --exclude=""
|
||||
@@ -209,18 +209,11 @@ In order to run those applications on hardware, user must build from the project
|
||||
``` bash
|
||||
cargo build --release --features="hpu-v80" --example hpu_hlapi --example hpu_bench
|
||||
# Correctly setup environment with setup_hpu.sh script
|
||||
source setup_hpu.sh --config v80
|
||||
source setup_hpu.sh --config v80 --init-qdma
|
||||
./target/release/examples/hpu_bench --integer-w 64 --integer-w 32 --iop MUL --iter 10
|
||||
./target/release/examples/hpu_hlapi
|
||||
```
|
||||
|
||||
> NB: The error that occurs when the ".hpu" files weren't correctly fetched can be a bit enigmatic: `memory allocation of ... bytes failed`
|
||||
> If you encounter this issue, run the following command:
|
||||
> ```bash
|
||||
> git lfs pull --include="*" --exclude=""
|
||||
> ```
|
||||
|
||||
|
||||
## Test framework
|
||||
There is also a set of tests backed in tfhe-rs. Tests are gathered in testbundles over various integer widths.
|
||||
Those tests have 5 sub-kinds:
|
||||
|
||||
@@ -49,8 +49,7 @@ offset= 0x10
|
||||
owner="Parameter"
|
||||
read_access="Read"
|
||||
write_access="None"
|
||||
field.major={size_b=4, default={Param="VERSION_MAJOR"}, description="RTL major version"}
|
||||
field.minor={size_b=4, default={Param="VERSION_MINOR"}, description="RTL minor version"}
|
||||
default={Param="VERSION"}
|
||||
|
||||
[section.info.register.ntt_architecture]
|
||||
description="NTT architecture"
|
||||
@@ -255,15 +254,3 @@ description="BPIP configuration"
|
||||
read_access="Read"
|
||||
write_access="Write"
|
||||
default={Cst=0xffffffff}
|
||||
|
||||
# =====================================================================================================================
|
||||
[section.keyswitch]
|
||||
offset= 0x3000
|
||||
description="Keyswitch Configuration"
|
||||
|
||||
[section.keyswitch.register.config]
|
||||
description="(1) Use use modulus switching mean compensation. (default), (0) Don't use modulus switching mean compensation."
|
||||
owner="User"
|
||||
read_access="Read"
|
||||
write_access="Write"
|
||||
field.mod_switch_mean_comp = { size_b=1, offset_b=0 , default={Cst=1}, description="Controls whether to use modulus switch mean compensation, aka. Mayeul's Trick."}
|
||||
|
||||
@@ -6,9 +6,7 @@
|
||||
"${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_regif_core_prc_3in3.toml"]
|
||||
polling_us=10
|
||||
[fpga.ffi.V80]
|
||||
id= 0
|
||||
hpu_path="${HPU_BACKEND_DIR}/config_store/v80_archives/psi64.hpu"
|
||||
ami_path="${AMI_PATH}/ami.ko"
|
||||
ami_id=1 # First ami device in the list
|
||||
qdma_h2c="/dev/qdma${V80_PCIE_DEV}001-MM-1"
|
||||
qdma_c2h="/dev/qdma${V80_PCIE_DEV}001-MM-2"
|
||||
|
||||
|
||||
@@ -49,8 +49,7 @@ offset= 0x10
|
||||
owner="Parameter"
|
||||
read_access="Read"
|
||||
write_access="None"
|
||||
field.major={size_b=4, default={Param="VERSION_MAJOR"}, description="RTL major version"}
|
||||
field.minor={size_b=4, default={Param="VERSION_MINOR"}, description="RTL minor version"}
|
||||
default={Param="VERSION"}
|
||||
|
||||
[section.info.register.ntt_architecture]
|
||||
description="NTT architecture"
|
||||
@@ -255,15 +254,3 @@ description="BPIP configuration"
|
||||
read_access="Read"
|
||||
write_access="Write"
|
||||
default={Cst=0xffffffff}
|
||||
|
||||
# =====================================================================================================================
|
||||
[section.keyswitch]
|
||||
offset= 0x3000
|
||||
description="Keyswitch Configuration"
|
||||
|
||||
[section.keyswitch.register.config]
|
||||
description="(1) Use use modulus switching mean compensation. (default), (0) Don't use modulus switching mean compensation."
|
||||
owner="User"
|
||||
read_access="Read"
|
||||
write_access="Write"
|
||||
field.mod_switch_mean_comp = { size_b=1, offset_b=0 , default={Cst=1}, description="Controls whether to use modulus switch mean compensation, aka. Mayeul's Trick."}
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user